Commit 5d6e00e

Add reproduction script mode (#20)
* Adapt params and execution specs to allow reproduction script mode
* Fix
* Add grading/log-parser for reproduction script
* Add documentation for reproduction script mode
* Fix imports
* Fix
* Fix
* Fix description

1 parent 5c85ecf commit 5d6e00e

File tree: 14 files changed (+121 / -79 lines)


README.md
Lines changed: 10 additions & 1 deletion

@@ -62,12 +62,19 @@ python -m src.main \
 --run_id <run_id>
 # use --predictions_path 'gold' to verify the gold patches
 # use --run_id to name the evaluation run
+# use --exec_mode reproduction_script --reproduction_script_name <script_name> to run in reproduction script mode (see below)
 ```
 
 This command will generate docker build logs (`image_build_logs`) and evaluation logs (`run_instance_swt_logs`) in the current directory.
-
 The final evaluation results will be stored in the `evaluation_results` directory.
 
+### Unit Test mode vs. Reproduction Script mode
+
+By default, SWT-Bench operates in unit test mode, where model predictions are treated as unit tests to be integrated into the existing test suite. The evaluation harness runs the modified parts of the test suite and reports changes to compute the success rate. Successful patches add a pass-to-fail test without causing existing tests to fail.
+
+In the simpler reproduction script mode, model predictions are considered standalone scripts that reproduce issues. The evaluation harness runs the script on the codebase and determines success based on the script's exit code: 0 for pass and 1 for fail. The test suite is not executed in this mode.
+
+
 ## Reporting results
 
 To assess the result of a single run, we provide a simple script to assess a single evaluation run.
@@ -137,6 +144,8 @@ For our evaluation of OpenHands, we automatically discard all top-level files to
 Moreover, for the evaluation of the agent in the correct environment, we discard changes to `setup.py`, `pyproject.toml` and `requirements.txt` files, as they are changed by the test setup and conflict with the repeated evaluation.
 To find the exact setup used for OpenHands, check out the branch [`feat/CI`](https://github.com/logic-star-ai/swt-bench/tree/feat/CI).
 
+AEGIS was evaluated in reproduction script mode.
+
 ## 🏗 Building SWT-Bench and Zero-Shot inference
 
 To recreate the SWT-Bench dataset or create one with your own flavoring
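The exit-code convention described in the new README section can be illustrated with a minimal, hypothetical reproduction script (the bug, function names, and file name are invented for illustration): it exits 1 while the issue still reproduces and 0 once it is fixed.

```python
import sys


def buggy_slugify(title: str) -> str:
    # Hypothetical bug under reproduction: trailing whitespace
    # is not stripped before slugifying, leaving a trailing dash.
    return title.lower().replace(" ", "-")


def main() -> int:
    result = buggy_slugify("Hello World ")
    if result.endswith("-"):
        print(f"Issue reproduced: unexpected trailing dash in {result!r}")
        return 1  # nonzero exit code -> graded as FAILED (bug still present)
    return 0      # zero exit code -> graded as PASSED (bug fixed)


if __name__ == "__main__":
    sys.exit(main())
```

The harness never inspects the script's output, only its exit status, which is why such a script needs no test-framework integration.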

docs/index.html
Lines changed: 2 additions & 2 deletions

@@ -207,7 +207,7 @@ <h2 class="title is-4 is-spaced">News</h2>
 <tbody>
 <tr>
 <td>🆕&nbsp;<a href="https://arxiv.org/pdf/2411.18015">AEGIS</a><sup>&Dagger;</sup></td>
-<td>47.8%</td>
+<td>46.4%</td>
 <td>26.0%</td>
 <td><time>2025-02-17</time></td>
 <td><a href="https://files.sri.inf.ethz.ch/swt-bench/aegis/">🔗</a></td>
@@ -356,7 +356,7 @@ <h2 class="title is-4 is-spaced">News</h2>
 </div>
 <div class="columns is-max-desktop">
 <div class="column is-centered">
-<p class="is-size-7">The results reported here are evaluation results on SWT-Bench Lite and Verified. We have independently executed submitted predictions for verification. <sup>&Dagger;</sup> Generates stand-alone reproduction scripts and does not attempt integration into the test framework. <sup>#</sup> This approach leverages execution feedback from a correctly set-up <a title="Continuous Integration" href="https://en.wikipedia.org/wiki/Continuous_integration">CI</a> environment. </p>
+<p class="is-size-7">The results reported here are evaluation results on SWT-Bench Lite and Verified. We have independently executed submitted predictions for verification. <sup>&Dagger;</sup> Generates stand-alone reproduction scripts and does not attempt integration into the test framework. <sup>#</sup> Leverages execution feedback from a correctly set-up <a title="Continuous Integration" href="https://en.wikipedia.org/wiki/Continuous_integration">CI</a> environment. </p>

src/__init__.py
Lines changed: 2 additions & 2 deletions

@@ -1,11 +1,12 @@
-__version__ = "2.0.2"
+__version__ = "1.2.0"
 
 from src.constants import (
     KEY_INSTANCE_ID,
     KEY_MODEL,
     KEY_PREDICTION,
     MAP_REPO_TO_TEST_FRAMEWORK,
     MAP_VERSION_TO_INSTALL,
+    ResolvedStatus
 )
 
 from src.docker_build import (
@@ -33,7 +34,6 @@
     get_eval_report,
     get_pred_report,
     get_resolution_success,
-    ResolvedStatus,
     TestStatus,
 )

src/dataset.py
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 import json
 import pathlib
-from typing import List, Tuple, Optional, Dict, cast
+from typing import Dict, cast
 import re
 from datasets import load_dataset, Dataset, load_from_disk

src/docker_build.py
Lines changed: 1 addition & 2 deletions

@@ -8,7 +8,6 @@
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
-import os
 from docker.models.containers import Container
 
 from src.constants import (
@@ -18,7 +17,6 @@
     MAP_VERSION_TO_INSTALL,
 )
 from src.test_spec import (
-    get_test_specs_from_dataset,
     make_test_spec,
     TestSpec
 )
@@ -52,6 +50,7 @@ def __str__(self):
 )
 
 BuildMode = Literal["cli", "api"]
+ExecMode = Literal["unit_test", "reproduction_script"]
 
 def docker_build_cli(
     build_dir: Path,
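The `ExecMode` alias added here is a `typing.Literal`, which constrains accepted values at type-check time. A small sketch of how such an alias can also back a runtime guard (the `validate_exec_mode` helper is hypothetical, not part of the codebase):

```python
from typing import Literal, get_args

# same alias shape as in the commit
ExecMode = Literal["unit_test", "reproduction_script"]


def validate_exec_mode(value: str) -> str:
    # runtime guard mirroring what a static checker enforces for the Literal
    if value not in get_args(ExecMode):
        raise ValueError(f"invalid exec_mode: {value!r}")
    return value


print(validate_exec_mode("reproduction_script"))  # prints "reproduction_script"
```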

src/docker_utils.py
Lines changed: 0 additions & 4 deletions

@@ -1,8 +1,4 @@
 import pathlib
-import subprocess
-import tempfile
-from io import BytesIO
-from typing import Literal
 import base64
 
 import docker

src/dockerfiles.py
Lines changed: 0 additions & 2 deletions

@@ -1,5 +1,3 @@
-from functools import partial
-
 # IF you change the base image, you need to rebuild all images (run with --force_rebuild)
 _DOCKERFILE_BASE = r"""
 FROM --platform={platform} ubuntu:22.04

src/exec_spec.py
Lines changed: 31 additions & 7 deletions

@@ -4,7 +4,7 @@
 import re
 
 from dataclasses import dataclass, asdict
-from typing import Union, List, Optional
+from typing import Union, List, Optional, Literal
 
 from src.constants import (
     SWEbenchInstance,
@@ -25,6 +25,8 @@
 
 DIFF_MODIFIED_FILE_REGEX = r"--- a/(.*)"
 
+ExecMode = Literal["unit_test", "reproduction_script"]
+
 
 @dataclass
 class ExecSpec:
@@ -50,6 +52,8 @@ class ExecSpec:
     rm_image: bool = False
     force_rebuild: bool = False
 
+    exec_mode: ExecMode = "unit_test"
+    reproduction_script_name: Optional[str] = None
     compute_coverage: bool = False
 
     @property
@@ -67,6 +71,20 @@ def as_dict(self):
 
     @property
     def test_command(self):
+        trace_path = "/root/trace.py"
+        changed_files_pattern = "({})".format("|".join(re.escape(x) for x in self.coverage_files))
+        trace_pattern = f"python3 {trace_path} --count -C coverage.cover --include-pattern '/testbed/{changed_files_pattern}'"
+
+        if self.exec_mode == "reproduction_script":
+            reproduction_script_path = f"/testbed/{self.reproduction_script_name}"
+            # executes just the reproduction script to determine the exit status
+            test_command = f"python3 {reproduction_script_path}"
+            if not self.compute_coverage:
+                return test_command
+            # executes the coverage script first to compute coverage, then the reproduction script to determine the exit status
+            return f"{trace_pattern} {reproduction_script_path} && {test_command}"
+
+        # otherwise execute the test suite command
         test_command = " ".join(
             [
                 MAP_REPO_TO_TEST_FRAMEWORK[self.repo][self.version],
@@ -76,10 +94,6 @@ def test_command(self):
         if not self.compute_coverage:
             return test_command
 
-        trace_path = "/root/trace.py"
-        changed_files_pattern = "({})".format("|".join(re.escape(x) for x in self.coverage_files))
-        trace_pattern = f"python3 {trace_path} --count -C coverage.cover --include-pattern '/testbed/{changed_files_pattern}'"
-
         cleaned_test_cmd = test_command.replace("--tb=no", "")
 
         if re.findall(r"python(3?) -m", cleaned_test_cmd):
@@ -255,12 +269,16 @@ def eval_script_list(self):
 
         if "install" in install:
             eval_commands.append(install["install"])
+        if self.exec_mode == "reproduction_script":
+            exit_mode_command = ["echo $?"]
+        else:
+            exit_mode_command = []
 
         if self.compute_coverage:
            cat_coverage_commands = ["cat coverage.cover"]
         else:
            cat_coverage_commands = []
-        eval_commands += apply_patch_commands + [test_command] + cat_coverage_commands + reset_commands
+        eval_commands += apply_patch_commands + [test_command] + exit_mode_command + cat_coverage_commands + reset_commands
 
         return eval_commands
 
@@ -352,7 +370,11 @@ def get_exec_specs_from_dataset(dataset: Union[list[SWEbenchInstance], list[Exec
     return list(map(make_exec_spec, dataset))
 
 
-def make_exec_spec(instance: SWEbenchInstance) -> ExecSpec:
+def make_exec_spec(
+    instance: SWEbenchInstance,
+    exec_mode: ExecMode = "unit_test",
+    reproduction_script_name: Optional[str] = None,
+) -> ExecSpec:
     if isinstance(instance, ExecSpec):
         return instance
     instance_id = instance["instance_id"]
@@ -387,4 +409,6 @@ def make_exec_spec(instance: SWEbenchInstance) -> ExecSpec:
         test_directives=test_directives,
         patch_list=patch_list,
         coverage_files=changed_files,
+        exec_mode=exec_mode,
+        reproduction_script_name=reproduction_script_name,
     )
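The branching this commit adds to `test_command` can be sketched with a stripped-down stand-in for `ExecSpec` (fields are reduced to what the branch reads; the trace command, `MiniExecSpec`, and its defaults such as `reproduce_bug.py` are illustrative assumptions, not the real spec):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class MiniExecSpec:
    # Reduced stand-in for ExecSpec: only the fields test_command consults.
    exec_mode: str = "unit_test"  # "unit_test" or "reproduction_script"
    reproduction_script_name: Optional[str] = None
    compute_coverage: bool = False
    suite_command: str = "pytest --no-header -rA"  # placeholder for the mapped framework command

    @property
    def test_command(self) -> str:
        trace = "python3 /root/trace.py --count -C coverage.cover"
        if self.exec_mode == "reproduction_script":
            script = f"/testbed/{self.reproduction_script_name}"
            run = f"python3 {script}"
            if not self.compute_coverage:
                return run
            # trace first for coverage, then run plainly so $? reflects the script
            return f"{trace} {script} && {run}"
        return self.suite_command


print(MiniExecSpec(exec_mode="reproduction_script",
                   reproduction_script_name="reproduce_bug.py").test_command)
# python3 /testbed/reproduce_bug.py
```

Per `eval_script_list` above, reproduction script mode additionally appends `echo $?` after the command, so the script's exit status lands in the evaluation log for the grader.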

src/grading.py
Lines changed: 22 additions & 8 deletions

@@ -4,9 +4,9 @@
 import json
 from unidiff import PatchSet
 
+from src.exec_spec import ExecMode
 from src.constants import (
     APPLY_PATCH_FAIL,
-    APPLY_PATCH_PASS,
     FAIL_TO_FAIL,
     FAIL_TO_PASS,
     PASS_TO_FAIL,
@@ -15,11 +15,9 @@
     RESET_FAILED,
     TESTS_ERROR,
     TESTS_TIMEOUT,
-    ResolvedStatus,
     TestStatus,
 )
-from src.test_spec import TestSpec
-from src.log_parsers import MAP_REPO_TO_PARSER
+from src.log_parsers import MAP_REPO_TO_PARSER, parse_log_reproduction_script
 from src.utils import get_log_dir, setup_logging
 
 # MARK: Utility functions
@@ -45,27 +43,35 @@ def test_failed(case: str, sm: dict[str, str]) -> bool:
     )
 
 
-def get_logs_eval(log_fp: str, repo: str) -> tuple[dict[str, str], bool]:
+def get_logs_eval(
+    log_fp: str,
+    repo: str,
+    exec_mode: ExecMode,
+) -> tuple[dict[str, str], bool]:
     """
     Retrieve evaluation results for a task instance from its corresponding log file
 
     Args:
         log_fp (str): path to log file
+        repo (str): repository name
+        exec_mode (ExecMode): execution mode
+        reproduction_script_name (str): name of reproduction script
     Returns:
         bool: whether the patch applied successfully
         dict: status map
 
     TODO(john-b-yang): Check this is working properly...
     """
     # Convert e.g. "logs/scikit-learn__scikit-learn-12421/test_output.txt" to "scikit-learn/scikit-learn"
-    log_parser = MAP_REPO_TO_PARSER[repo]
+    log_parser = MAP_REPO_TO_PARSER[repo] if exec_mode != "reproduction_script" else parse_log_reproduction_script
 
     if not Path(log_fp).exists():
         # likely due to a timeout
         return {}, False
     with open(log_fp) as f:
         raw_content = f.read()
     # remove installation logs
+    # NOTE: does not work when not computing coverage
     content = re.split(r"\n\+ python3 [^\n]*trace.py --count -C coverage.cover [^\n]*\n", raw_content, flags=re.MULTILINE)[1]
     # remove coverage dumps
     content = content.split("\n+ cat coverage.cover")[0]
@@ -383,7 +389,15 @@ def get_pred_report(
     return report_map
 
 
-def report_results(patch_id: str, run_id: str, golden_code_patch, output_paths: Optional[List[str]], instance_id: str, repo: str) -> dict[str, dict[str, bool]]:
+def report_results(
+    patch_id: str,
+    run_id: str,
+    golden_code_patch,
+    output_paths: Optional[List[str]],
+    instance_id: str,
+    repo: str,
+    exec_mode: ExecMode,
+) -> dict[str, dict[str, bool]]:
     log_dir = get_log_dir(run_id, patch_id, instance_id)
     logger, report_path = setup_logging(log_dir, instance_id)
 
@@ -395,7 +409,7 @@ def report_results(
     patch_applied = []
     if output_paths is not None:
         for output_path in output_paths:
-            test_result, patch_applied_ = get_logs_eval(output_path, repo)
+            test_result, patch_applied_ = get_logs_eval(output_path, repo, exec_mode)
             patch_applied.append(patch_applied_)
             coverage_result = get_coverage_eval(output_path)
             test_results.append(test_result)
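The parser selection that `get_logs_eval` now performs can be sketched in isolation (both parsers here are hypothetical stubs; the real table maps every benchmark repo to its framework-specific parser):

```python
from typing import Callable


def parse_log_pytest_stub(log: str) -> dict[str, str]:
    # hypothetical stand-in for a per-repo unit-test log parser
    return {}


def parse_log_reproduction_script_stub(log: str) -> dict[str, str]:
    # hypothetical stand-in for the reproduction script parser
    return {}


MAP_REPO_TO_PARSER: dict[str, Callable[[str], dict[str, str]]] = {
    "scikit-learn/scikit-learn": parse_log_pytest_stub,
}


def pick_parser(repo: str, exec_mode: str) -> Callable[[str], dict[str, str]]:
    # mirrors get_logs_eval: reproduction script mode bypasses the per-repo table
    return MAP_REPO_TO_PARSER[repo] if exec_mode != "reproduction_script" else parse_log_reproduction_script_stub
```

The key design point is that reproduction script mode is repo-agnostic: the per-repo parser table is consulted only in unit test mode.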

src/log_parsers.py
Lines changed: 11 additions & 0 deletions

@@ -205,6 +205,17 @@ def parse_log_matplotlib(log: str) -> dict[str, str]:
         test_status_map[test_case[1]] = test_case[0]
     return test_status_map
 
+def parse_log_reproduction_script(log: str) -> dict[str, str]:
+    """
+    If there is a nonzero exit code, log a "main" test case with status "FAILED"
+    """
+    exit_code = re.findall(r"^\+ echo (\d+)$", log, re.MULTILINE)
+    if not exit_code:
+        return {}
+    name = "reproduction_script"
+    status = TestStatus.PASSED.value if exit_code[0] == "0" else TestStatus.FAILED.value
+    return {name: status}
+
 
 parse_log_astroid = parse_log_pytest
 parse_log_flask = parse_log_pytest
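The new parser keys off the xtrace line (`+ echo <code>`) that the `echo $?` appended by the eval script produces in the log. A self-contained version, with a stub `TestStatus` enum standing in for `src.constants.TestStatus` and an invented sample log:

```python
import re
from enum import Enum


class TestStatus(Enum):
    # stub for src.constants.TestStatus
    PASSED = "PASSED"
    FAILED = "FAILED"


def parse_log_reproduction_script(log: str) -> dict[str, str]:
    # the eval script runs `echo $?` after the reproduction script,
    # which shell tracing records as e.g. "+ echo 0"
    exit_code = re.findall(r"^\+ echo (\d+)$", log, re.MULTILINE)
    if not exit_code:
        return {}
    status = TestStatus.PASSED.value if exit_code[0] == "0" else TestStatus.FAILED.value
    return {"reproduction_script": status}


sample = "+ python3 /testbed/reproduce_bug.py\nAssertionError: bug present\n+ echo 1\n1"
print(parse_log_reproduction_script(sample))  # {'reproduction_script': 'FAILED'}
```

An empty result (no `+ echo` line found, e.g. after a timeout) leaves the status map empty rather than guessing a verdict.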
