Merge branch 'granular-async-instrumentation' of https://github.com/codeflash-ai/codeflash into async-support-for

KRRT7 · KRRT7 · commit a6072b98d80c · 2025-09-08T17:12:47.000-07:00
diff --git a/codeflash/code_utils/codeflash_wrap_decorator.py b/codeflash/code_utils/codeflash_wrap_decorator.py
@@ -299,12 +299,6 @@ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:  # noqa: ANN401
         else:
             async_wrapper.index[test_id] = 0
 
-        # Initialize cumulative throughput tracking
-        if not hasattr(async_wrapper, "start_time"):
-            async_wrapper.start_time = time.perf_counter()
-        if not hasattr(async_wrapper, "total_operations"):
-            async_wrapper.total_operations = 0
-
         codeflash_test_index = async_wrapper.index[test_id]
         invocation_id = f"{line_id}_{codeflash_test_index}"
         test_stdout_tag = f"{test_module_name}:{(test_class_name + '.' if test_class_name else '')}{test_name}:{function_name}:{loop_index}:{invocation_id}"
@@ -325,12 +319,7 @@ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:  # noqa: ANN401
         finally:
             gc.enable()
 
-        # Update cumulative throughput tracking
-        async_wrapper.total_operations += 1
-        elapsed_time = time.perf_counter() - async_wrapper.start_time
-        throughput = async_wrapper.total_operations / elapsed_time if elapsed_time > 0 else 0
-
-        print(f"!######{test_stdout_tag}:{codeflash_duration}:throughput_{throughput:.2f}_ops_per_sec######!")
+        print(f"!######{test_stdout_tag}:{codeflash_duration}######!")
 
         if exception:
             raise exception
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -552,7 +552,6 @@ class FunctionTestInvocation:
     timed_out: Optional[bool]
     verification_type: Optional[str] = VerificationType.FUNCTION_CALL
     stdout: Optional[str] = None
-    throughput: Optional[float] = None  # Operations per second
 
     @property
     def unique_invocation_loop_id(self) -> str:
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -566,11 +566,7 @@ def determine_best_candidate(
                     tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
                     benchmark_tree = None
                     if speedup_critic(
-                        candidate_result,
-                        original_code_baseline.runtime,
-                        None,
-                        self.function_to_optimize,
-                        original_baseline_results=original_code_baseline,
+                        candidate_result, original_code_baseline.runtime, best_runtime_until_now=None
                     ) and quantity_of_tests_critic(candidate_result):
                         tree.add("This candidate is faster than the original code. 🚀")  # TODO: Change this description
                         tree.add(f"Original summed runtime: {humanize_runtime(original_code_baseline.runtime)}")
diff --git a/codeflash/result/critic.py b/codeflash/result/critic.py
@@ -12,8 +12,7 @@
 from codeflash.models.models import TestType
 
 if TYPE_CHECKING:
-    from codeflash.discovery.functions_to_optimize import FunctionToOptimize
-    from codeflash.models.models import CoverageData, OptimizedCandidateResult, OriginalCodeBaseline, TestResults
+    from codeflash.models.models import CoverageData, OptimizedCandidateResult, OriginalCodeBaseline
 
 
 def performance_gain(*, original_runtime_ns: int, optimized_runtime_ns: int) -> float:
@@ -30,29 +29,16 @@ def speedup_critic(
     candidate_result: OptimizedCandidateResult,
     original_code_runtime: int,
     best_runtime_until_now: int | None,
-    function_to_optimize: FunctionToOptimize,
     *,
     disable_gh_action_noise: bool = False,
-    original_baseline_results: OriginalCodeBaseline | None = None,
 ) -> bool:
     """Take in a correct optimized Test Result and decide if the optimization should actually be surfaced to the user.
 
-    For async functions, dispatches to async_speedup_critic for specialized evaluation.
-    For sync functions, uses traditional runtime-only evaluation.
-
     Ensure that the optimization is actually faster than the original code, above the noise floor.
     The noise floor is a function of the original code runtime. Currently, the noise floor is 2xMIN_IMPROVEMENT_THRESHOLD
     when the original runtime is less than 10 microseconds, and becomes MIN_IMPROVEMENT_THRESHOLD for any higher runtime.
     The noise floor is doubled when benchmarking on a (noisy) GitHub Action virtual instance, also we want to be more confident there.
     """
-    if function_to_optimize.is_async and original_baseline_results:
-        return async_speedup_critic(
-            candidate_result=candidate_result,
-            original_baseline_results=original_baseline_results,
-            best_runtime_until_now=best_runtime_until_now,
-            disable_gh_action_noise=disable_gh_action_noise,
-        )
-
     noise_floor = 3 * MIN_IMPROVEMENT_THRESHOLD if original_code_runtime < 10000 else MIN_IMPROVEMENT_THRESHOLD
     if not disable_gh_action_noise and env_utils.is_ci():
         noise_floor = noise_floor * 2  # Increase the noise floor in GitHub Actions mode
@@ -61,64 +47,11 @@ def speedup_critic(
         original_runtime_ns=original_code_runtime, optimized_runtime_ns=candidate_result.best_test_runtime
     )
     if best_runtime_until_now is None:
+        # No best runtime to compare against yet: accept any candidate that clears the noise floor
         return bool(perf_gain > noise_floor)
     return bool(perf_gain > noise_floor and candidate_result.best_test_runtime < best_runtime_until_now)
 
 
-def async_speedup_critic(
-    candidate_result: OptimizedCandidateResult,
-    original_baseline_results: OriginalCodeBaseline,
-    best_runtime_until_now: int | None,
-    *,
-    disable_gh_action_noise: bool = False,
-) -> bool:
-    """Simplified speedup evaluation for async functions with throughput-first approach.
-
-    For async functions:
-    1. If throughput data exists and shows improvement, accept the optimization
-    2. Otherwise, fall back to traditional runtime evaluation
-    """
-    # Calculate noise floor with same logic as sync functions
-    noise_floor = (
-        3 * MIN_IMPROVEMENT_THRESHOLD if original_baseline_results.runtime < 10000 else MIN_IMPROVEMENT_THRESHOLD
-    )
-    if not disable_gh_action_noise and env_utils.is_ci():
-        noise_floor = noise_floor * 2  # Increase the noise floor in GitHub Actions mode
-
-    # Check for throughput improvement first
-    candidate_throughput = _calculate_average_throughput(candidate_result.benchmarking_test_results)
-    original_throughput = _calculate_average_throughput(original_baseline_results.benchmarking_test_results)
-
-    if original_throughput and original_throughput > 0 and candidate_throughput:
-        throughput_gain = (candidate_throughput - original_throughput) / original_throughput
-        if throughput_gain > noise_floor:
-            # Throughput improved above noise floor - accept optimization
-            return (
-                True if best_runtime_until_now is None else candidate_result.best_test_runtime < best_runtime_until_now
-            )
-
-    # Fall back to traditional runtime evaluation
-    perf_gain = performance_gain(
-        original_runtime_ns=original_baseline_results.runtime, optimized_runtime_ns=candidate_result.best_test_runtime
-    )
-
-    if best_runtime_until_now is None:
-        return bool(perf_gain > noise_floor)
-    return bool(perf_gain > noise_floor and candidate_result.best_test_runtime < best_runtime_until_now)
-
-
-def _calculate_average_throughput(test_results: TestResults) -> float | None:
-    """Calculate average throughput from test results that have throughput data."""
-    throughput_values = [
-        result.throughput for result in test_results.test_results if result.throughput is not None and result.did_pass
-    ]
-
-    if not throughput_values:
-        return None
-
-    return sum(throughput_values) / len(throughput_values)
-
-
 def quantity_of_tests_critic(candidate_result: OptimizedCandidateResult | OriginalCodeBaseline) -> bool:
     test_results = candidate_result.behavior_test_results
     report = test_results.get_test_pass_fail_report_by_type()
diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
@@ -37,9 +37,7 @@ def parse_func(file_path: Path) -> XMLParser:
 
 
 matches_re_start = re.compile(r"!\$######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######\$!\n")
-matches_re_end = re.compile(
-    r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)(?::throughput_([\d\.]+)_ops_per_sec)?######!"
-)
+matches_re_end = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!")
 
 
 def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults:
@@ -95,7 +93,6 @@ def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, tes
                         return_value=test_pickle,
                         timed_out=False,
                         verification_type=VerificationType.FUNCTION_CALL,
-                        throughput=None,
                     )
                 )
         except Exception as e:
@@ -163,7 +160,6 @@ def parse_sqlite_test_results(sqlite_file_path: Path, test_files: TestFiles, tes
                     return_value=ret_val,
                     timed_out=False,
                     verification_type=VerificationType(verification_type) if verification_type else None,
-                    throughput=None,
                 )
             )
         except Exception:
@@ -297,27 +293,22 @@ def parse_test_xml(
                         return_value=None,
                         timed_out=timed_out,
                         stdout="",
-                        throughput=None,
                     )
                 )
 
             else:
                 for match_index, match in enumerate(begin_matches):
                     groups = match.groups()
                     end_match = end_matches.get(groups)
-                    iteration_id, runtime, throughput = groups[5], None, None
+                    iteration_id, runtime = groups[5], None
                     if end_match:
                         stdout = sys_stdout[match.end() : end_match.start()]
-                        end_groups = end_match.groups()
-                        split_val = end_groups[5].split(":")
+                        split_val = end_match.groups()[5].split(":")
                         if len(split_val) > 1:
                             iteration_id = split_val[0]
                             runtime = int(split_val[1])
                         else:
                             iteration_id, runtime = split_val[0], None
-                        # Extract throughput if present (group 6 is the throughput capture group)
-                        if len(end_groups) > 6 and end_groups[6] is not None:
-                            throughput = float(end_groups[6])
                     elif match_index == len(begin_matches) - 1:
                         stdout = sys_stdout[match.end() :]
                     else:
@@ -341,7 +332,6 @@ def parse_test_xml(
                             return_value=None,
                             timed_out=timed_out,
                             stdout=stdout,
-                            throughput=throughput,
                         )
                     )
 
@@ -430,7 +420,6 @@ def merge_test_results(
                         if result_bin.verification_type
                         else None,
                         stdout=xml_result.stdout,
-                        throughput=None,
                     )
                 )
         elif xml_results.test_results[0].id.iteration_id is not None:
@@ -461,7 +450,6 @@ def merge_test_results(
                         if bin_result.verification_type
                         else None,
                         stdout=xml_result.stdout,
-                        throughput=None,
                     )
                 )
         else:
@@ -489,7 +477,6 @@ def merge_test_results(
                         if bin_result.verification_type
                         else None,
                         stdout=xml_result.stdout,
-                        throughput=None,
                     )
                 )