integrate testbench as E2E replay test

KRRT7 · KRRT7 · commit 7f2167ad506b · 2025-03-11T14:53:09.000-07:00
diff --git a/.github/workflows/end-to-end-test-tracer-replay_testbench.yaml b/.github/workflows/end-to-end-test-tracer-replay_testbench.yaml
@@ -0,0 +1,41 @@
+name: end-to-end-test
+
+on:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  tracer-replay-testbench:  # runs tests/scripts/end_to_end_test_tracer_replay_testbench.py on PRs and manual dispatch
+    runs-on: ubuntu-latest
+    env:
+      CODEFLASH_AIS_SERVER: prod
+      POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
+      CODEFLASH_API_KEY: ${{ secrets.CODEFLASH_API_KEY }}
+      COLUMNS: 110
+      MAX_RETRIES: 3  # read by run_with_retries in tests/scripts/end_to_end_test_utilities.py
+      RETRY_DELAY: 5  # read by run_with_retries in tests/scripts/end_to_end_test_utilities.py
+      EXPECTED_IMPROVEMENT_PCT: 10  # read by the test script's __main__ block and passed to run_test
+      CODEFLASH_END_TO_END: 1
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history instead of a shallow clone
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Python 3.11 for CLI
+        uses: astral-sh/setup-uv@v5  # installs uv; python-version is given to uv, not to actions/setup-python
+        with:
+          python-version: 3.11.6
+
+      - name: Install dependencies (CLI)
+        run: |
+          uv tool install poetry
+          uv venv
+          source .venv/bin/activate
+          poetry install --with dev
+
+      - name: Run Codeflash to optimize code
+        id: optimize_code  # not referenced by any other step in this workflow
+        run: |
+          source .venv/bin/activate
+          poetry run python tests/scripts/end_to_end_test_tracer_replay_testbench.py
diff --git a/code_to_optimize/code_directories/simple_tracer_e2e/testbench.py b/code_to_optimize/code_directories/simple_tracer_e2e/testbench.py
@@ -0,0 +1,24 @@
+from concurrent.futures import ThreadPoolExecutor
+
+def funcA(number):  # NOTE(review): keep this line layout; expected_lines=[4, 5, 6, 8, 11] in the E2E script matches it
+    k = 0
+    for i in range(number * 100):
+        k += i
+    # Sum 0..number-1 with sum() over a range; like k above, j is never read afterwards.
+    j = sum(range(number))
+
+    # Return the integers 0..number-1 as a single space-separated string.
+    return " ".join(str(i) for i in range(number))
+
+
+def test_threadpool() -> None:
+    """Map funcA over 10, 20 and 30 on a three-worker thread pool and print each result."""
+    args = list(range(10, 31, 10))
+    # The with-block shuts the executor down once the results are consumed, even if a worker raises.
+    with ThreadPoolExecutor(max_workers=3) as pool:
+        for r in pool.map(funcA, args):
+            print(r)
+
+
+if __name__ == "__main__":  # entry point when run_trace_test2 executes "testbench.py" under codeflash.tracer
+    test_threadpool()
diff --git a/tests/scripts/end_to_end_test_tracer_replay.py b/tests/scripts/end_to_end_test_tracer_replay.py
@@ -7,17 +7,17 @@
 def run_test(expected_improvement_pct: int) -> bool:
     config = TestConfig(
         trace_mode=True,
+        trace_load="workload",  # same as the TestConfig default; run_codeflash_command routes it to run_trace_test
         min_improvement_x=0.1,
         expected_unit_tests=1,
         coverage_expectations=[
-            CoverageExpectation(function_name="funcA", expected_coverage=100.0, expected_lines=[2, 3, 4, 6, 9])
+            CoverageExpectation(function_name="funcA", expected_coverage=100.0, expected_lines=[2, 3, 4, 6, 9]),
         ],
     )
     cwd = (
         pathlib.Path(__file__).parent.parent.parent / "code_to_optimize" / "code_directories" / "simple_tracer_e2e"
     ).resolve()
     return run_codeflash_command(cwd, config, expected_improvement_pct)
 
-
 if __name__ == "__main__":
     exit(run_with_retries(run_test, int(os.getenv("EXPECTED_IMPROVEMENT_PCT", 10))))
diff --git a/tests/scripts/end_to_end_test_tracer_replay_testbench.py b/tests/scripts/end_to_end_test_tracer_replay_testbench.py
@@ -0,0 +1,25 @@
+import os
+import pathlib
+
+from end_to_end_test_utilities import CoverageExpectation, TestConfig, run_codeflash_command, run_with_retries
+
+
+def run_test(expected_improvement_pct: int) -> bool:
+    """Run the testbench tracer-replay E2E check in simple_tracer_e2e; return run_codeflash_command's result."""
+    repo_root = pathlib.Path(__file__).parent.parent.parent
+    project_dir = (repo_root / "code_to_optimize" / "code_directories" / "simple_tracer_e2e").resolve()
+    # These line numbers match funcA's statements in testbench.py.
+    coverage = CoverageExpectation(function_name="funcA", expected_coverage=100.0, expected_lines=[4, 5, 6, 8, 11])
+    config = TestConfig(
+        trace_mode=True,
+        trace_load="testbench",
+        min_improvement_x=0.1,
+        expected_unit_tests=1,
+        coverage_expectations=[coverage],
+    )
+    return run_codeflash_command(project_dir, config, expected_improvement_pct)
+
+
+# NOTE(review): run_with_retries is annotated "-> bool"; confirm it returns an exit status, since exit(True) exits 1.
+if __name__ == "__main__":
+    exit(run_with_retries(run_test, int(os.getenv("EXPECTED_IMPROVEMENT_PCT", 10))))
diff --git a/tests/scripts/end_to_end_test_utilities.py b/tests/scripts/end_to_end_test_utilities.py
@@ -25,6 +25,7 @@ class TestConfig:
     expected_unit_tests: Optional[int] = None
     min_improvement_x: float = 0.1
     trace_mode: bool = False
+    trace_load: str = "workload"
     coverage_expectations: list[CoverageExpectation] = field(default_factory=list)
 
 
@@ -80,7 +81,10 @@ def run_codeflash_command(
 ) -> bool:
     logging.basicConfig(level=logging.INFO)
     if config.trace_mode:
-        return run_trace_test(cwd, config, expected_improvement_pct)
+        if config.trace_load == "workload":
+            return run_trace_test(cwd, config, expected_improvement_pct)
+        if config.trace_load == "testbench":
+            return run_trace_test2(cwd, config, expected_improvement_pct)
 
     path_to_file = cwd / config.file_path
     file_contents = path_to_file.read_text("utf-8")
@@ -228,6 +232,54 @@ def run_trace_test(cwd: pathlib.Path, config: TestConfig, expected_improvement_p
     return validate_output(stdout, return_code, expected_improvement_pct, config)
 
 
+def _stream_command_output(command: list[str], cwd: pathlib.Path) -> tuple[int, str]:
+    """Run command in cwd, log each merged stdout/stderr line, and return (exit code, full output)."""
+    # The with-block closes the pipe and reaps the child even if reading or logging raises.
+    with subprocess.Popen(
+        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, cwd=str(cwd), env=os.environ.copy()
+    ) as process:
+        output = []
+        for line in process.stdout:
+            logging.info(line.strip())
+            output.append(line)
+        return_code = process.wait()
+    return return_code, "".join(output)
+
+
+def run_trace_test2(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool:
+    """Trace testbench.py in cwd, then run the optimizer on the replay test the tracer reports.
+
+    Returns False when the tracer exits non-zero, its summary line is absent or reports a count other than 5,
+    or the replay test file it names is missing; otherwise returns validate_output's result for the optimizer run.
+    """
+    # First command: Run the tracer
+    test_root = cwd / "tests" / (config.test_framework or "")
+    clear_directory(test_root)
+    command = ["python", "-m", "codeflash.tracer", "-o", "codeflash.trace", "testbench.py"]
+    return_code, stdout = _stream_command_output(command, cwd)
+    if return_code != 0:
+        logging.error(f"Tracer command returned exit code {return_code}")
+        return False
+
+    # re.MULTILINE lets "$" end the summary line even when more output follows it.
+    functions_traced = re.search(
+        r"Traced (\d+) function calls successfully and replay test created at - (.*)$", stdout, re.MULTILINE
+    )
+    if not functions_traced or int(functions_traced.group(1)) != 5:
+        logging.error("Expected 5 traced functions")
+        return False
+
+    replay_test_path = pathlib.Path(functions_traced.group(2))
+    if not replay_test_path.exists():
+        logging.error(f"Replay test file missing at {replay_test_path}")
+        return False
+
+    # Second command: Run optimization
+    command = ["python", "../../../codeflash/main.py", "--replay-test", str(replay_test_path), "--no-pr"]
+    return_code, stdout = _stream_command_output(command, cwd)
+    return validate_output(stdout, return_code, expected_improvement_pct, config)
+
+
 def run_with_retries(test_func, *args, **kwargs) -> bool:
     max_retries = int(os.getenv("MAX_RETRIES", 3))
     retry_delay = int(os.getenv("RETRY_DELAY", 5))