critic

KRRT7 · KRRT7 · commit 8e516fc8682a · 2025-09-17T18:25:56.000-07:00
diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
@@ -11,3 +11,4 @@
 MIN_TESTCASE_PASSED_THRESHOLD = 6
 REPEAT_OPTIMIZATION_PROBABILITY = 0.1
 DEFAULT_IMPORTANCE_THRESHOLD = 0.001
+MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD = 0.10  # 10% minimum improvement for async throughput
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -77,7 +77,13 @@
     TestType,
 )
 from codeflash.result.create_pr import check_create_pr, existing_tests_source_for
-from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
+from codeflash.result.critic import (
+    coverage_critic,
+    performance_gain,
+    quantity_of_tests_critic,
+    speedup_critic,
+    throughput_gain,
+)
 from codeflash.result.explanation import Explanation
 from codeflash.telemetry.posthog_cf import ph
 from codeflash.verification.concolic_testing import generate_concolic_tests
@@ -566,7 +572,11 @@ def determine_best_candidate(
                     tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
                     benchmark_tree = None
                     if speedup_critic(
-                        candidate_result, original_code_baseline.runtime, best_runtime_until_now=None
+                        candidate_result,
+                        original_code_baseline.runtime,
+                        best_runtime_until_now=None,
+                        original_async_throughput=original_code_baseline.async_throughput,
+                        best_throughput_until_now=None,
                     ) and quantity_of_tests_critic(candidate_result):
                         tree.add("This candidate is faster than the original code. 🚀")  # TODO: Change this description
                         tree.add(f"Original summed runtime: {humanize_runtime(original_code_baseline.runtime)}")
@@ -577,6 +587,19 @@ def determine_best_candidate(
                         )
                         tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
                         tree.add(f"Speedup ratio: {perf_gain + 1:.3f}X")
+                        logger.info(f"orig_async_throughput: {original_code_baseline.async_throughput}")
+                        logger.info(f"candidate_result.async_throughput: {candidate_result.async_throughput}")
+                        if (
+                            original_code_baseline.async_throughput is not None
+                            and candidate_result.async_throughput is not None
+                        ):
+                            throughput_gain_value = throughput_gain(
+                                original_throughput=original_code_baseline.async_throughput,
+                                optimized_throughput=candidate_result.async_throughput,
+                            )
+                            tree.add(f"Original async throughput: {original_code_baseline.async_throughput} executions")
+                            tree.add(f"Optimized async throughput: {candidate_result.async_throughput} executions")
+                            tree.add(f"Throughput improvement: {throughput_gain_value * 100:.1f}%")
                         line_profile_test_results = self.line_profiler_step(
                             code_context=code_context,
                             original_helper_code=original_helper_code,
@@ -1509,10 +1532,12 @@ def establish_original_code_baseline(
                 for result in benchmarking_results.test_results:
                     if result.stdout:
                         all_stdout += result.stdout
-
+                logger.info("Calculating async function throughput from test output...")
+                logger.info(f"All stdout for async throughput calculation:\n{all_stdout}")
                 async_throughput = calculate_function_throughput_from_stdout(
                     all_stdout, self.function_to_optimize.function_name
                 )
+                logger.info(f"Original async function throughput: {async_throughput} calls/second")
 
             if self.args.benchmark:
                 replay_benchmarking_test_results = benchmarking_results.group_by_benchmarks(
@@ -1680,7 +1705,6 @@ def run_optimized_candidate(
                     if result.stdout:
                         all_stdout += result.stdout
 
-
                 candidate_async_throughput = calculate_function_throughput_from_stdout(
                     all_stdout, self.function_to_optimize.function_name
                 )
diff --git a/codeflash/result/critic.py b/codeflash/result/critic.py
@@ -8,6 +8,7 @@
     COVERAGE_THRESHOLD,
     MIN_IMPROVEMENT_THRESHOLD,
     MIN_TESTCASE_PASSED_THRESHOLD,
+    MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD,
 )
 from codeflash.models.models import TestType
 
@@ -25,31 +26,73 @@ def performance_gain(*, original_runtime_ns: int, optimized_runtime_ns: int) ->
     return (original_runtime_ns - optimized_runtime_ns) / optimized_runtime_ns
 
 
+def throughput_gain(*, original_throughput: int, optimized_throughput: int) -> float:
+    """Return the relative throughput change of the optimized code versus the original.
+
+    Multiply the result by 100 to get a percentage; a positive result means the
+    optimized code reached a higher throughput. A zero original throughput yields 0.0.
+    """
+    if original_throughput != 0:
+        return (optimized_throughput - original_throughput) / original_throughput
+    return 0.0
+
+
 def speedup_critic(
     candidate_result: OptimizedCandidateResult,
     original_code_runtime: int,
     best_runtime_until_now: int | None,
     *,
     disable_gh_action_noise: bool = False,
+    original_async_throughput: int | None = None,
+    best_throughput_until_now: int | None = None,
 ) -> bool:
     """Take in a correct optimized Test Result and decide if the optimization should actually be surfaced to the user.
 
-    Ensure that the optimization is actually faster than the original code, above the noise floor.
-    The noise floor is a function of the original code runtime. Currently, the noise floor is 2xMIN_IMPROVEMENT_THRESHOLD
-    when the original runtime is less than 10 microseconds, and becomes MIN_IMPROVEMENT_THRESHOLD for any higher runtime.
-    The noise floor is doubled when benchmarking on a (noisy) GitHub Action virtual instance, also we want to be more confident there.
+    Evaluates runtime performance and async throughput, each also against the best value seen so far (if given).
+
+    For runtime performance:
+    - Ensures the optimization is actually faster than the original code, above the noise floor.
+    - The noise floor is a function of the original code runtime. Currently, the noise floor is 3xMIN_IMPROVEMENT_THRESHOLD
+      when the original runtime is less than 10 microseconds, and becomes MIN_IMPROVEMENT_THRESHOLD for any higher runtime.
+    - The noise floor is doubled when env_utils.is_ci() is true, unless disable_gh_action_noise is set.
+
+    For async throughput (only when both the original and the candidate throughput are not None):
+    - Requires a gain above MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD; this check only runs when the original throughput is > 0
+    - The runtime checks and the throughput checks must then all pass for the candidate to be accepted
     """
+    # Runtime performance evaluation: pick the noise floor the runtime gain has to exceed
     noise_floor = 3 * MIN_IMPROVEMENT_THRESHOLD if original_code_runtime < 10000 else MIN_IMPROVEMENT_THRESHOLD
     if not disable_gh_action_noise and env_utils.is_ci():
         noise_floor = noise_floor * 2  # Increase the noise floor in GitHub Actions mode
 
     perf_gain = performance_gain(
         original_runtime_ns=original_code_runtime, optimized_runtime_ns=candidate_result.best_test_runtime
     )
-    if best_runtime_until_now is None:
-        # collect all optimizations with this
-        return bool(perf_gain > noise_floor)
-    return bool(perf_gain > noise_floor and candidate_result.best_test_runtime < best_runtime_until_now)
+    runtime_improved = perf_gain > noise_floor
+
+    # With no best runtime recorded yet (None), every candidate counts as the best so far
+    runtime_is_best = best_runtime_until_now is None or candidate_result.best_test_runtime < best_runtime_until_now
+
+    # Async throughput evaluation (only overrides the defaults below when both throughput values exist)
+    throughput_improved = True  # Stays True when throughput data is missing or the original throughput is not > 0
+    throughput_is_best = True   # Stays True when throughput data is missing
+
+    if original_async_throughput is not None and candidate_result.async_throughput is not None:
+        if original_async_throughput > 0:  # No usable baseline otherwise; the gain check is skipped
+            throughput_gain_value = throughput_gain(
+                original_throughput=original_async_throughput,
+                optimized_throughput=candidate_result.async_throughput
+            )
+            throughput_improved = throughput_gain_value > MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD
+            logger.debug(f"Async throughput gain: {throughput_gain_value * 100:.1f}% (original: {original_async_throughput}, optimized: {candidate_result.async_throughput})")
+
+        throughput_is_best = best_throughput_until_now is None or candidate_result.async_throughput > best_throughput_until_now
+
+    # When both throughput values are present, the runtime and throughput checks must all pass
+    # When either throughput value is None, only the runtime checks decide the result
+    if original_async_throughput is not None and candidate_result.async_throughput is not None:
+        return runtime_improved and runtime_is_best and throughput_improved and throughput_is_best
+    return runtime_improved and runtime_is_best
 
 
 def quantity_of_tests_critic(candidate_result: OptimizedCandidateResult | OriginalCodeBaseline) -> bool:
diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
@@ -94,10 +94,13 @@ def calculate_function_throughput_from_stdout(stdout: str, function_name: str) -
 
     # Count completed executions for the specific function only
     function_throughput = 0
-
+    logger.info(f"Total start matches: {len(start_matches)}, Total end matches: {len(end_matches)}")
     for start_match in start_matches:
         # Check if this execution is for the function we're interested in and has a matching end tag
         # function_name is at index 2 in the match tuple
+        logger.info(f"Start match: {start_match}")
+        logger.info(f"End matches: {end_matches_set}")
+        logger.info(f"Function name: {function_name}")
         if start_match in end_matches_set and len(start_match) > 2 and start_match[2] == function_name:
             function_throughput += 1
 
diff --git a/tests/test_critic.py b/tests/test_critic.py
@@ -14,7 +14,13 @@
     TestResults,
     TestType,
 )
-from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
+from codeflash.result.critic import (
+    coverage_critic,
+    performance_gain,
+    quantity_of_tests_critic,
+    speedup_critic,
+    throughput_gain,
+)
 
 
 def test_performance_gain() -> None:
@@ -429,3 +435,159 @@ def test_coverage_critic() -> None:
     )
 
     assert coverage_critic(unittest_coverage, "unittest") is True
+
+
+def test_throughput_gain() -> None:
+    """Check throughput_gain against hand-computed relative changes."""
+    # Each row is (original throughput, optimized throughput, expected gain)
+    expectations = [
+        (100, 150, 0.5),  # 50% improvement
+        (100, 100, 0.0),  # no improvement
+        (100, 80, -0.2),  # 20% regression
+        (0, 50, 0.0),  # zero original throughput (edge case)
+        (50, 200, 3.0),  # 300% improvement
+    ]
+    for original, optimized, expected in expectations:
+        gain = throughput_gain(
+            original_throughput=original,
+            optimized_throughput=optimized,
+        )
+        assert gain == expected
+
+
+def test_speedup_critic_with_async_throughput() -> None:
+    """Test speedup_critic when async throughput data is, and is not, supplied alongside the runtime."""
+    original_code_runtime = 10000  # 10 microseconds; not < 10000, so the un-multiplied noise floor applies
+    original_async_throughput = 100
+
+    # Test case 1: Both runtime and throughput improve significantly
+    candidate_result = OptimizedCandidateResult(
+        max_loop_count=5,
+        best_test_runtime=8000,  # 20% lower runtime (performance_gain = 0.25)
+        behavior_test_results=TestResults(),
+        benchmarking_test_results=TestResults(),
+        optimization_candidate_index=0,
+        total_candidate_timing=8000,
+        async_throughput=120,  # 20% throughput improvement
+    )
+
+    assert speedup_critic(
+        candidate_result=candidate_result,
+        original_code_runtime=original_code_runtime,
+        best_runtime_until_now=None,
+        original_async_throughput=original_async_throughput,
+        best_throughput_until_now=None,
+        disable_gh_action_noise=True
+    )
+
+    # Test case 2: Runtime improves but throughput doesn't meet threshold
+    candidate_result = OptimizedCandidateResult(
+        max_loop_count=5,
+        best_test_runtime=8000,  # 20% lower runtime (performance_gain = 0.25)
+        behavior_test_results=TestResults(),
+        benchmarking_test_results=TestResults(),
+        optimization_candidate_index=0,
+        total_candidate_timing=8000,
+        async_throughput=105,  # Only 5% throughput improvement (below 10% threshold)
+    )
+
+    assert not speedup_critic(
+        candidate_result=candidate_result,
+        original_code_runtime=original_code_runtime,
+        best_runtime_until_now=None,
+        original_async_throughput=original_async_throughput,
+        best_throughput_until_now=None,
+        disable_gh_action_noise=True
+    )
+
+    # Test case 3: Throughput improves but runtime doesn't meet threshold
+    candidate_result = OptimizedCandidateResult(
+        max_loop_count=5,
+        best_test_runtime=9800,  # Only 2% lower runtime (performance_gain ~= 0.02, below 5% threshold)
+        behavior_test_results=TestResults(),
+        benchmarking_test_results=TestResults(),
+        optimization_candidate_index=0,
+        total_candidate_timing=9800,
+        async_throughput=120,  # 20% throughput improvement
+    )
+
+    assert not speedup_critic(
+        candidate_result=candidate_result,
+        original_code_runtime=original_code_runtime,
+        best_runtime_until_now=None,
+        original_async_throughput=original_async_throughput,
+        best_throughput_until_now=None,
+        disable_gh_action_noise=True
+    )
+
+    # Test case 4: No throughput data - should fall back to runtime-only evaluation
+    candidate_result = OptimizedCandidateResult(
+        max_loop_count=5,
+        best_test_runtime=8000,  # 20% lower runtime (performance_gain = 0.25)
+        behavior_test_results=TestResults(),
+        benchmarking_test_results=TestResults(),
+        optimization_candidate_index=0,
+        total_candidate_timing=8000,
+        async_throughput=None,  # No throughput data
+    )
+
+    assert speedup_critic(
+        candidate_result=candidate_result,
+        original_code_runtime=original_code_runtime,
+        best_runtime_until_now=None,
+        original_async_throughput=None,  # No original throughput data
+        best_throughput_until_now=None,
+        disable_gh_action_noise=True
+    )
+
+    # Test case 5: Test best_throughput_until_now comparison
+    candidate_result = OptimizedCandidateResult(
+        max_loop_count=5,
+        best_test_runtime=8000,  # 20% lower runtime (performance_gain = 0.25)
+        behavior_test_results=TestResults(),
+        benchmarking_test_results=TestResults(),
+        optimization_candidate_index=0,
+        total_candidate_timing=8000,
+        async_throughput=115,  # 15% throughput improvement
+    )
+
+    # Should pass when no best throughput yet
+    assert speedup_critic(
+        candidate_result=candidate_result,
+        original_code_runtime=original_code_runtime,
+        best_runtime_until_now=None,
+        original_async_throughput=original_async_throughput,
+        best_throughput_until_now=None,
+        disable_gh_action_noise=True
+    )
+
+    # Should fail when there's a better throughput already
+    assert not speedup_critic(
+        candidate_result=candidate_result,
+        original_code_runtime=original_code_runtime,
+        best_runtime_until_now=None,
+        original_async_throughput=original_async_throughput,
+        best_throughput_until_now=120,  # Better throughput already exists (115 is not > 120)
+        disable_gh_action_noise=True
+    )
+
+    # Test case 6: Zero original throughput (edge case)
+    candidate_result = OptimizedCandidateResult(
+        max_loop_count=5,
+        best_test_runtime=8000,  # 20% lower runtime (performance_gain = 0.25)
+        behavior_test_results=TestResults(),
+        benchmarking_test_results=TestResults(),
+        optimization_candidate_index=0,
+        total_candidate_timing=8000,
+        async_throughput=50,
+    )
+
+    # Should pass when original throughput is 0 (throughput gain check skipped, runtime still improves)
+    assert speedup_critic(
+        candidate_result=candidate_result,
+        original_code_runtime=original_code_runtime,
+        best_runtime_until_now=None,
+        original_async_throughput=0,  # Zero original throughput
+        best_throughput_until_now=None,
+        disable_gh_action_noise=True
+    )