New end-to-end test for benchmarking bubble sort

alvin-r · alvin-r · commit 96dd78092ec0 · 2025-04-02T16:15:27.000-07:00
diff --git a/.github/workflows/end-to-end-benchmark-test.yaml b/.github/workflows/end-to-end-benchmark-test.yaml
@@ -0,0 +1,41 @@
+name: end-to-end-test
+
+on:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  benchmark-bubble-sort-optimization:
+    runs-on: ubuntu-latest
+    env:
+      CODEFLASH_AIS_SERVER: prod
+      POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
+      CODEFLASH_API_KEY: ${{ secrets.CODEFLASH_API_KEY }}
+      COLUMNS: 110
+      MAX_RETRIES: 3
+      RETRY_DELAY: 5
+      EXPECTED_IMPROVEMENT_PCT: 5
+      CODEFLASH_END_TO_END: 1
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Python 3.11 for CLI
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: 3.11.6
+
+      - name: Install dependencies (CLI)
+        run: |
+          uv tool install poetry
+          uv venv
+          source .venv/bin/activate
+          poetry install --with dev
+
+      - name: Run Codeflash to optimize code
+        id: optimize_code_with_benchmarks  # ids may only contain alphanumerics, '-' and '_' (no spaces)
+        run: |
+          source .venv/bin/activate
+          poetry run python tests/scripts/end_to_end_test_benchmark_sort.py
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -449,6 +449,7 @@ def determine_best_candidate(
                         speedup_ratios[candidate.optimization_id] = perf_gain
 
                         tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
+                        benchmark_tree = None
                         if speedup_critic(
                             candidate_result, original_code_baseline.runtime, best_runtime_until_now
                         ) and quantity_of_tests_critic(candidate_result):
@@ -499,9 +500,9 @@ def determine_best_candidate(
                             console.print(benchmark_tree)
                         console.rule()
 
-                        self.write_code_and_helpers(
-                            self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
-                        )
+                    self.write_code_and_helpers(
+                        self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
+                    )
 
             except KeyboardInterrupt as e:
                 self.write_code_and_helpers(
diff --git a/tests/scripts/end_to_end_test_benchmark_sort.py b/tests/scripts/end_to_end_test_benchmark_sort.py
@@ -0,0 +1,26 @@
+import os
+import pathlib
+
+from end_to_end_test_utilities import CoverageExpectation, TestConfig, run_codeflash_command, run_with_retries
+
+
+def run_test(expected_improvement_pct: int) -> bool:
+    cwd = (pathlib.Path(__file__).parent.parent.parent / "code_to_optimize").resolve()
+    config = TestConfig(
+        file_path=pathlib.Path("bubble_sort.py"),
+        function_name="sorter",
+        benchmarks_root=cwd / "tests" / "pytest" / "benchmarks",
+        test_framework="pytest",
+        min_improvement_x=1.0,
+        coverage_expectations=[
+            CoverageExpectation(
+                function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8, 9, 10]
+            )
+        ],
+    )
+
+    return run_codeflash_command(cwd, config, expected_improvement_pct)
+
+
+if __name__ == "__main__":
+    exit(run_with_retries(run_test, int(os.getenv("EXPECTED_IMPROVEMENT_PCT", 5))))
diff --git a/tests/scripts/end_to_end_test_utilities.py b/tests/scripts/end_to_end_test_utilities.py
@@ -26,6 +26,7 @@ class TestConfig:
     min_improvement_x: float = 0.1
     trace_mode: bool = False
     coverage_expectations: list[CoverageExpectation] = field(default_factory=list)
+    benchmarks_root: Optional[pathlib.Path] = None
 
 
 def clear_directory(directory_path: str | pathlib.Path) -> None:
@@ -85,8 +86,8 @@ def run_codeflash_command(
     path_to_file = cwd / config.file_path
     file_contents = path_to_file.read_text("utf-8")
     test_root = cwd / "tests" / (config.test_framework or "")
-    command = build_command(cwd, config, test_root)
 
+    command = build_command(cwd, config, test_root, config.benchmarks_root if config.benchmarks_root else None)
     process = subprocess.Popen(
         command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, cwd=str(cwd), env=os.environ.copy()
     )
@@ -116,7 +117,7 @@ def run_codeflash_command(
     return validated
 
 
-def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path) -> list[str]:
+def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path, benchmarks_root:pathlib.Path|None = None) -> list[str]:
     python_path = "../../../codeflash/main.py" if "code_directories" in str(cwd) else "../codeflash/main.py"
 
     base_command = ["python", python_path, "--file", config.file_path, "--no-pr"]
@@ -127,7 +128,8 @@ def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path
         base_command.extend(
             ["--test-framework", config.test_framework, "--tests-root", str(test_root), "--module-root", str(cwd)]
         )
-
+    if benchmarks_root:
+        base_command.extend(["--benchmark", "--benchmarks-root", str(benchmarks_root)])
     return base_command