Merge pull request #752 from codeflash-ai/end-to-end-test

KRRT7 · web-flow · commit 9aa34d9b64c1 · 2025-09-23T00:44:48.000-07:00
add End to end test for async optimization
diff --git a/.github/workflows/e2e-async.yaml b/.github/workflows/e2e-async.yaml
@@ -0,0 +1,69 @@
+name: E2E - Async
+
+on:
+  pull_request:
+    paths:
+      - '**'  # Trigger for all paths
+
+  workflow_dispatch:
+
+jobs:
+  async-optimization:
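+    # Use the 'external-trusted-contributors' environment for manual (workflow_dispatch) runs, and for PRs that change workflow files when the author is not misrasaurabh1 or KRRT7; otherwise no environment is set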
+    environment: ${{ (github.event_name == 'workflow_dispatch' || (contains(toJSON(github.event.pull_request.files.*.filename), '.github/workflows/') && github.event.pull_request.user.login != 'misrasaurabh1' && github.event.pull_request.user.login != 'KRRT7')) && 'external-trusted-contributors' || '' }}
+
+    runs-on: ubuntu-latest
+    env:
+      CODEFLASH_AIS_SERVER: prod
+      POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }}
+      CODEFLASH_API_KEY: ${{ secrets.CODEFLASH_API_KEY }}
+      COLUMNS: 110
+      MAX_RETRIES: 3
+      RETRY_DELAY: 5
+      EXPECTED_IMPROVEMENT_PCT: 10
+      CODEFLASH_END_TO_END: 1
+    steps:
+      - name: 🛎️ Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Validate PR
+        run: |
+          # Check whether the PR diff (base..head) touches any file under .github/workflows/
+          if git diff --name-only "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}" | grep -q "^\.github/workflows/"; then
+            echo "⚠️ Workflow changes detected."
+
+            # Get the PR author
+            AUTHOR="${{ github.event.pull_request.user.login }}"
+            echo "PR Author: $AUTHOR"
+
+            # Allowlisted authors always pass; any other author passes only while the PR is open (relies on environment protection rules)
+            if [[ "$AUTHOR" == "misrasaurabh1" || "$AUTHOR" == "KRRT7" ]]; then
+              echo "✅ Authorized user ($AUTHOR). Proceeding."
+            elif [[ "${{ github.event.pull_request.state }}" == "open" ]]; then
+              echo "✅ PR triggered by 'pull_request' and is open. Assuming protection rules are in place. Proceeding."
+            else
+              echo "⛔ Unauthorized user ($AUTHOR) attempting to modify workflows. Exiting."
+              exit 1
+            fi
+          else
+            echo "✅ No workflow file changes detected. Proceeding."
+          fi
+
+      - name: Set up Python 3.11 for CLI
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: 3.11.6
+
+      - name: Install dependencies (CLI)
+        run: |
+          uv sync
+          
+      - name: Run Codeflash to optimize async code
+        id: optimize_async_code
+        run: |
+          uv run python tests/scripts/end_to_end_test_async.py
diff --git a/code_to_optimize/code_directories/async_e2e/main.py b/code_to_optimize/code_directories/async_e2e/main.py
@@ -1,4 +1,16 @@
 import time
-async def fake_api_call(delay, data):
-    time.sleep(0.0001)
-    return f"Processed: {data}"
+import asyncio
+
+
+async def retry_with_backoff(func, max_retries=3):  # await func() up to max_retries times; return its first successful result
+    if max_retries < 1:
+        raise ValueError("max_retries must be at least 1")
+    last_exception = None  # most recent exception from func(); re-raised once every attempt has failed
+    for attempt in range(max_retries):
+        try:
+            return await func()
+        except Exception as e:
+            last_exception = e
+            if attempt < max_retries - 1:  # no delay after the final attempt
+                time.sleep(0.0001 * attempt)  # blocking (non-awaited) sleep; delay scales with the attempt index, so 0 s after the first failure
+    raise last_exception
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -658,6 +658,15 @@ def determine_best_candidate(
                         )
                         tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
                         tree.add(f"Speedup ratio: {perf_gain + 1:.3f}X")
+                        if (
+                            original_code_baseline.async_throughput is not None
+                            and candidate_result.async_throughput is not None
+                        ):
+                            throughput_gain_value = throughput_gain(
+                                original_throughput=original_code_baseline.async_throughput,
+                                optimized_throughput=candidate_result.async_throughput,
+                            )
+                            tree.add(f"Throughput gain: {throughput_gain_value * 100:.1f}%")
                     console.print(tree)
                     if self.args.benchmark and benchmark_tree:
                         console.print(benchmark_tree)
@@ -1199,6 +1208,8 @@ def find_and_process_best_optimization(
                     function_name=function_to_optimize_qualified_name,
                     file_path=self.function_to_optimize.file_path,
                     benchmark_details=processed_benchmark_info.benchmark_details if processed_benchmark_info else None,
+                    original_async_throughput=original_code_baseline.async_throughput,
+                    best_async_throughput=best_optimization.async_throughput,
                 )
 
                 self.replace_function_and_helpers_with_optimized_code(
@@ -1284,7 +1295,7 @@ def process_review(
         original_throughput_str = None
         optimized_throughput_str = None
         throughput_improvement_str = None
-        
+
         if (
             self.function_to_optimize.is_async
             and original_code_baseline.async_throughput is not None
@@ -1297,7 +1308,7 @@ def process_review(
                 optimized_throughput=best_optimization.async_throughput,
             )
             throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%"
-        
+
         new_explanation_raw_str = self.aiservice_client.get_new_explanation(
             source_code=code_context.read_writable_code.flat,
             dependency_code=code_context.read_only_context_code,
@@ -1324,6 +1335,8 @@ def process_review(
             function_name=explanation.function_name,
             file_path=explanation.file_path,
             benchmark_details=explanation.benchmark_details,
+            original_async_throughput=explanation.original_async_throughput,
+            best_async_throughput=explanation.best_async_throughput,
         )
         self.log_successful_optimization(new_explanation, generated_tests, exp_type)
 
@@ -1551,7 +1564,8 @@ def establish_original_code_baseline(
                 async_throughput = calculate_function_throughput_from_test_results(
                     benchmarking_results, self.function_to_optimize.function_name
                 )
-                logger.info(f"Original async function throughput: {async_throughput} calls/second")
+                logger.debug(f"Original async function throughput: {async_throughput} calls/second")
+                console.rule()
 
             if self.args.benchmark:
                 replay_benchmarking_test_results = benchmarking_results.group_by_benchmarks(
diff --git a/codeflash/result/critic.py b/codeflash/result/critic.py
@@ -82,9 +82,6 @@ def speedup_critic(
                 original_throughput=original_async_throughput, optimized_throughput=candidate_result.async_throughput
             )
             throughput_improved = throughput_gain_value > MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD
-            logger.info(
-                f"Async throughput gain: {throughput_gain_value * 100:.1f}% (original: {original_async_throughput}, optimized: {candidate_result.async_throughput})"
-            )
 
         throughput_is_best = (
             best_throughput_until_now is None or candidate_result.async_throughput > best_throughput_until_now
diff --git a/codeflash/result/explanation.py b/codeflash/result/explanation.py
@@ -11,6 +11,7 @@
 
 from codeflash.code_utils.time_utils import humanize_runtime
 from codeflash.models.models import BenchmarkDetail, TestResults
+from codeflash.result.critic import performance_gain, throughput_gain
 
 
 @dataclass(frozen=True, config={"arbitrary_types_allowed": True})
@@ -23,9 +24,29 @@ class Explanation:
     function_name: str
     file_path: Path
     benchmark_details: Optional[list[BenchmarkDetail]] = None
+    original_async_throughput: Optional[int] = None
+    best_async_throughput: Optional[int] = None
 
     @property
     def perf_improvement_line(self) -> str:  # headline of the form "<pct> improvement (<ratio> faster)."
+        runtime_improvement = self.speedup
+        
+        if (
+            self.original_async_throughput is not None 
+            and self.best_async_throughput is not None 
+            and self.original_async_throughput > 0  # only compare against a positive baseline throughput
+        ):
+            throughput_improvement = throughput_gain(
+                original_throughput=self.original_async_throughput,
+                optimized_throughput=self.best_async_throughput,
+            )
+            
+            # Report throughput instead of runtime when its gain is larger, or when runtime did not improve (speedup <= 0)
+            if throughput_improvement > runtime_improvement or runtime_improvement <= 0:
+                throughput_pct = f"{throughput_improvement * 100:,.0f}%"
+                throughput_x = f"{throughput_improvement + 1:,.2f}x"  # relative gain + 1 is shown as the ratio
+                return f"{throughput_pct} improvement ({throughput_x} faster)."
+        
         return f"{self.speedup_pct} improvement ({self.speedup_x} faster)."
 
     @property
@@ -45,6 +66,24 @@ def to_console_string(self) -> str:
         # TODO: Sometimes the explanation says something similar to "This is the code that was optimized", remove such parts
         original_runtime_human = humanize_runtime(self.original_runtime_ns)
         best_runtime_human = humanize_runtime(self.best_runtime_ns)
+        
+        # Determine if we're showing throughput or runtime improvements
+        runtime_improvement = self.speedup
+        is_using_throughput_metric = False
+        
+        if (
+            self.original_async_throughput is not None 
+            and self.best_async_throughput is not None 
+            and self.original_async_throughput > 0
+        ):
+            throughput_improvement = throughput_gain(
+                original_throughput=self.original_async_throughput,
+                optimized_throughput=self.best_async_throughput,
+            )
+            
+            if throughput_improvement > runtime_improvement or runtime_improvement <= 0:
+                is_using_throughput_metric = True
+        
         benchmark_info = ""
 
         if self.benchmark_details:
@@ -85,10 +124,18 @@ def to_console_string(self) -> str:
             console.print(table)
             benchmark_info = cast("StringIO", console.file).getvalue() + "\n"  # Cast for mypy
 
+        if is_using_throughput_metric:
+            performance_description = (
+                f"Throughput improved from {self.original_async_throughput} to {self.best_async_throughput} operations/second "
+                f"(runtime: {original_runtime_human} → {best_runtime_human})\n\n"
+            )
+        else:
+            performance_description = f"Runtime went down from {original_runtime_human} to {best_runtime_human} \n\n"
+        
         return (
             f"Optimized {self.function_name} in {self.file_path}\n"
             f"{self.perf_improvement_line}\n"
-            f"Runtime went down from {original_runtime_human} to {best_runtime_human} \n\n"
+            + performance_description
             + (benchmark_info if benchmark_info else "")
             + self.raw_explanation_message
             + " \n\n"
diff --git a/tests/scripts/end_to_end_test_async.py b/tests/scripts/end_to_end_test_async.py
@@ -6,14 +6,14 @@
 
 def run_test(expected_improvement_pct: int) -> bool:
     config = TestConfig(
-        file_path="workload.py",
-        expected_unit_tests=1,
+        file_path="main.py",
+        expected_unit_tests=0,
         min_improvement_x=0.1,
         coverage_expectations=[
             CoverageExpectation(
-                function_name="process_data_list",
+                function_name="retry_with_backoff",
                 expected_coverage=100.0,
-                expected_lines=[5, 7, 8, 9, 10, 12],
+                expected_lines=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
             )
         ],
     )

Original file line number	Diff line number	Diff line change
`@@ -82,9 +82,6 @@ def speedup_critic(`
`82`	`82`	`original_throughput=original_async_throughput, optimized_throughput=candidate_result.async_throughput`
`83`	`83`	`)`
`84`	`84`	`throughput_improved = throughput_gain_value > MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD`
`85`		`- logger.info(`
`86`		`- f"Async throughput gain: {throughput_gain_value * 100:.1f}% (original: {original_async_throughput}, optimized: {candidate_result.async_throughput})"`
`87`		`- )`
`88`	`85`
`89`	`86`	`throughput_is_best = (`
`90`	`87`	`best_throughput_until_now is None or candidate_result.async_throughput > best_throughput_until_now`