basic async critic - WIP

KRRT7 · KRRT7 · commit a3640751fa33 · 2025-09-08T14:59:04.000-07:00
diff --git a/codeflash/code_utils/codeflash_wrap_decorator.py b/codeflash/code_utils/codeflash_wrap_decorator.py
@@ -299,6 +299,12 @@ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:  # noqa: ANN401
         else:
             async_wrapper.index[test_id] = 0
 
+        # Initialize cumulative throughput tracking
+        if not hasattr(async_wrapper, "start_time"):
+            async_wrapper.start_time = time.perf_counter()
+        if not hasattr(async_wrapper, "total_operations"):
+            async_wrapper.total_operations = 0
+
         codeflash_test_index = async_wrapper.index[test_id]
         invocation_id = f"{line_id}_{codeflash_test_index}"
         test_stdout_tag = f"{test_module_name}:{(test_class_name + '.' if test_class_name else '')}{test_name}:{function_name}:{loop_index}:{invocation_id}"
@@ -319,7 +325,12 @@ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:  # noqa: ANN401
         finally:
             gc.enable()
 
-        print(f"!######{test_stdout_tag}:{codeflash_duration}######!")
+        # Update cumulative throughput tracking
+        async_wrapper.total_operations += 1
+        elapsed_time = time.perf_counter() - async_wrapper.start_time
+        throughput = async_wrapper.total_operations / elapsed_time if elapsed_time > 0 else 0
+
+        print(f"!######{test_stdout_tag}:{codeflash_duration}:throughput_{throughput:.2f}_ops_per_sec######!")
 
         if exception:
             raise exception
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -552,6 +552,7 @@ class FunctionTestInvocation:
     timed_out: Optional[bool]
     verification_type: Optional[str] = VerificationType.FUNCTION_CALL
     stdout: Optional[str] = None
+    throughput: Optional[float] = None  # Operations per second
 
     @property
     def unique_invocation_loop_id(self) -> str:
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -566,7 +566,11 @@ def determine_best_candidate(
                     tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
                     benchmark_tree = None
                     if speedup_critic(
-                        candidate_result, original_code_baseline.runtime, best_runtime_until_now=None
+                        candidate_result, 
+                        original_code_baseline.runtime, 
+                        None, 
+                        self.function_to_optimize,
+                        original_baseline_results=original_code_baseline
                     ) and quantity_of_tests_critic(candidate_result):
                         tree.add("This candidate is faster than the original code. 🚀")  # TODO: Change this description
                         tree.add(f"Original summed runtime: {humanize_runtime(original_code_baseline.runtime)}")
diff --git a/codeflash/result/critic.py b/codeflash/result/critic.py
@@ -12,7 +12,8 @@
 from codeflash.models.models import TestType
 
 if TYPE_CHECKING:
-    from codeflash.models.models import CoverageData, OptimizedCandidateResult, OriginalCodeBaseline
+    from codeflash.discovery.functions_to_optimize import FunctionToOptimize
+    from codeflash.models.models import CoverageData, OptimizedCandidateResult, OriginalCodeBaseline, TestResults
 
 
 def performance_gain(*, original_runtime_ns: int, optimized_runtime_ns: int) -> float:
@@ -29,16 +30,29 @@ def speedup_critic(
     candidate_result: OptimizedCandidateResult,
     original_code_runtime: int,
     best_runtime_until_now: int | None,
+    function_to_optimize: FunctionToOptimize,
     *,
     disable_gh_action_noise: bool = False,
+    original_baseline_results: OriginalCodeBaseline | None = None,
 ) -> bool:
     """Take in a correct optimized Test Result and decide if the optimization should actually be surfaced to the user.
 
+    For async functions, dispatches to async_speedup_critic for specialized evaluation.
+    For sync functions, uses traditional runtime-only evaluation.
+
     Ensure that the optimization is actually faster than the original code, above the noise floor.
     The noise floor is a function of the original code runtime. Currently, the noise floor is 2xMIN_IMPROVEMENT_THRESHOLD
     when the original runtime is less than 10 microseconds, and becomes MIN_IMPROVEMENT_THRESHOLD for any higher runtime.
     The noise floor is doubled when benchmarking on a (noisy) GitHub Action virtual instance, also we want to be more confident there.
     """
+    if function_to_optimize.is_async and original_baseline_results:
+        return async_speedup_critic(
+            candidate_result=candidate_result,
+            original_baseline_results=original_baseline_results,
+            best_runtime_until_now=best_runtime_until_now,
+            disable_gh_action_noise=disable_gh_action_noise,
+        )
+
     noise_floor = 3 * MIN_IMPROVEMENT_THRESHOLD if original_code_runtime < 10000 else MIN_IMPROVEMENT_THRESHOLD
     if not disable_gh_action_noise and env_utils.is_ci():
         noise_floor = noise_floor * 2  # Increase the noise floor in GitHub Actions mode
@@ -47,11 +61,62 @@ def speedup_critic(
         original_runtime_ns=original_code_runtime, optimized_runtime_ns=candidate_result.best_test_runtime
     )
     if best_runtime_until_now is None:
-        # collect all optimizations with this
         return bool(perf_gain > noise_floor)
     return bool(perf_gain > noise_floor and candidate_result.best_test_runtime < best_runtime_until_now)
 
 
+def async_speedup_critic(
+    candidate_result: OptimizedCandidateResult,
+    original_baseline_results: OriginalCodeBaseline,
+    best_runtime_until_now: int | None,
+    *,
+    disable_gh_action_noise: bool = False,
+) -> bool:
+    """Decide whether an async candidate should be surfaced, checking throughput before runtime.
+
+    1. If both baseline and candidate have an average throughput and the relative gain exceeds the noise
+       floor, accept (when best_runtime_until_now is set, only if the candidate's best runtime beats it).
+    2. Otherwise, fall back to the runtime-gain check against the same noise floor.
+    """
+    # Noise floor: 3x the minimum threshold when the baseline runtime is below 10000 ns, otherwise 1x
+    noise_floor = (
+        3 * MIN_IMPROVEMENT_THRESHOLD if original_baseline_results.runtime < 10000 else MIN_IMPROVEMENT_THRESHOLD
+    )
+    if not disable_gh_action_noise and env_utils.is_ci():
+        noise_floor = noise_floor * 2  # Double the noise floor when env_utils.is_ci() is true and not disabled
+
+    # Average throughput over the benchmarking results of each run (None when no usable data)
+    candidate_throughput = _calculate_average_throughput(candidate_result.benchmarking_test_results)
+    original_throughput = _calculate_average_throughput(original_baseline_results.benchmarking_test_results)
+
+    if original_throughput and original_throughput > 0 and candidate_throughput:
+        throughput_gain = (candidate_throughput - original_throughput) / original_throughput
+        if throughput_gain > noise_floor:
+            # Throughput gain clears the noise floor; still require beating the best runtime so far, if any
+            return True if best_runtime_until_now is None else candidate_result.best_test_runtime < best_runtime_until_now
+
+    # No usable throughput signal, or the gain was within the noise floor: use the runtime-based gain
+    perf_gain = performance_gain(
+        original_runtime_ns=original_baseline_results.runtime, optimized_runtime_ns=candidate_result.best_test_runtime
+    )
+
+    if best_runtime_until_now is None:
+        return bool(perf_gain > noise_floor)
+    return bool(perf_gain > noise_floor and candidate_result.best_test_runtime < best_runtime_until_now)
+
+
+def _calculate_average_throughput(test_results: TestResults) -> float | None:
+    """Return the mean throughput of passing results that carry throughput data, or None if there are none."""
+    throughput_values = [
+        result.throughput for result in test_results.test_results if result.throughput is not None and result.did_pass
+    ]
+
+    if not throughput_values:
+        return None
+
+    return sum(throughput_values) / len(throughput_values)
+
+
 def quantity_of_tests_critic(candidate_result: OptimizedCandidateResult | OriginalCodeBaseline) -> bool:
     test_results = candidate_result.behavior_test_results
     report = test_results.get_test_pass_fail_report_by_type()
diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
@@ -37,7 +37,7 @@ def parse_func(file_path: Path) -> XMLParser:
 
 
 matches_re_start = re.compile(r"!\$######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######\$!\n")
-matches_re_end = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!")
+matches_re_end = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)(?::throughput_([\d\.]+)_ops_per_sec)?######!")
 
 
 def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults:
@@ -93,6 +93,7 @@ def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, tes
                         return_value=test_pickle,
                         timed_out=False,
                         verification_type=VerificationType.FUNCTION_CALL,
+                        throughput=None,
                     )
                 )
         except Exception as e:
@@ -160,6 +161,7 @@ def parse_sqlite_test_results(sqlite_file_path: Path, test_files: TestFiles, tes
                     return_value=ret_val,
                     timed_out=False,
                     verification_type=VerificationType(verification_type) if verification_type else None,
+                    throughput=None,
                 )
             )
         except Exception:
@@ -293,22 +295,27 @@ def parse_test_xml(
                         return_value=None,
                         timed_out=timed_out,
                         stdout="",
+                        throughput=None,
                     )
                 )
 
             else:
                 for match_index, match in enumerate(begin_matches):
                     groups = match.groups()
                     end_match = end_matches.get(groups)
-                    iteration_id, runtime = groups[5], None
+                    iteration_id, runtime, throughput = groups[5], None, None
                     if end_match:
                         stdout = sys_stdout[match.end() : end_match.start()]
-                        split_val = end_match.groups()[5].split(":")
+                        end_groups = end_match.groups()
+                        split_val = end_groups[5].split(":")
                         if len(split_val) > 1:
                             iteration_id = split_val[0]
                             runtime = int(split_val[1])
                         else:
                             iteration_id, runtime = split_val[0], None
+                        # Extract throughput if present (group 6 is the throughput capture group)
+                        if len(end_groups) > 6 and end_groups[6] is not None:
+                            throughput = float(end_groups[6])
                     elif match_index == len(begin_matches) - 1:
                         stdout = sys_stdout[match.end() :]
                     else:
@@ -332,6 +339,7 @@ def parse_test_xml(
                             return_value=None,
                             timed_out=timed_out,
                             stdout=stdout,
+                            throughput=throughput,
                         )
                     )
 
@@ -420,6 +428,7 @@ def merge_test_results(
                         if result_bin.verification_type
                         else None,
                         stdout=xml_result.stdout,
+                        throughput=None,
                     )
                 )
         elif xml_results.test_results[0].id.iteration_id is not None:
@@ -450,6 +459,7 @@ def merge_test_results(
                         if bin_result.verification_type
                         else None,
                         stdout=xml_result.stdout,
+                        throughput=None,
                     )
                 )
         else:
@@ -477,6 +487,7 @@ def merge_test_results(
                         if bin_result.verification_type
                         else None,
                         stdout=xml_result.stdout,
+                        throughput=None,
                     )
                 )
 

Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ def parse_func(file_path: Path) -> XMLParser:`
`37`	`37`
`38`	`38`
`39`	`39`	`matches_re_start = re.compile(r"!\$######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######\$!\n")`
`40`		`-matches_re_end = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!")`
	`40`	`+matches_re_end = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)(?::throughput_([\d\.]+)_ops_per_sec)?######!")`
`41`	`41`
`42`	`42`
`43`	`43`	`def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults:`
`@@ -93,6 +93,7 @@ def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, tes`
`93`	`93`	`return_value=test_pickle,`
`94`	`94`	`timed_out=False,`
`95`	`95`	`verification_type=VerificationType.FUNCTION_CALL,`
	`96`	`+ throughput=None,`
`96`	`97`	`)`
`97`	`98`	`)`
`98`	`99`	`except Exception as e:`
`@@ -160,6 +161,7 @@ def parse_sqlite_test_results(sqlite_file_path: Path, test_files: TestFiles, tes`
`160`	`161`	`return_value=ret_val,`
`161`	`162`	`timed_out=False,`
`162`	`163`	`verification_type=VerificationType(verification_type) if verification_type else None,`
	`164`	`+ throughput=None,`
`163`	`165`	`)`
`164`	`166`	`)`
`165`	`167`	`except Exception:`
`@@ -293,22 +295,27 @@ def parse_test_xml(`
`293`	`295`	`return_value=None,`
`294`	`296`	`timed_out=timed_out,`
`295`	`297`	`stdout="",`
	`298`	`+ throughput=None,`
`296`	`299`	`)`
`297`	`300`	`)`
`298`	`301`
`299`	`302`	`else:`
`300`	`303`	`for match_index, match in enumerate(begin_matches):`
`301`	`304`	`groups = match.groups()`
`302`	`305`	`end_match = end_matches.get(groups)`
`303`		`- iteration_id, runtime = groups[5], None`
	`306`	`+ iteration_id, runtime, throughput = groups[5], None, None`
`304`	`307`	`if end_match:`
`305`	`308`	`stdout = sys_stdout[match.end() : end_match.start()]`
`306`		`- split_val = end_match.groups()[5].split(":")`
	`309`	`+ end_groups = end_match.groups()`
	`310`	`+ split_val = end_groups[5].split(":")`
`307`	`311`	`if len(split_val) > 1:`
`308`	`312`	`iteration_id = split_val[0]`
`309`	`313`	`runtime = int(split_val[1])`
`310`	`314`	`else:`
`311`	`315`	`iteration_id, runtime = split_val[0], None`
	`316`	`+ # Extract throughput if present (group 6 is the throughput capture group)`
	`317`	`+ if len(end_groups) > 6 and end_groups[6] is not None:`
	`318`	`+ throughput = float(end_groups[6])`
`312`	`319`	`elif match_index == len(begin_matches) - 1:`
`313`	`320`	`stdout = sys_stdout[match.end() :]`
`314`	`321`	`else:`
`@@ -332,6 +339,7 @@ def parse_test_xml(`
`332`	`339`	`return_value=None,`
`333`	`340`	`timed_out=timed_out,`
`334`	`341`	`stdout=stdout,`
	`342`	`+ throughput=throughput,`
`335`	`343`	`)`
`336`	`344`	`)`
`337`	`345`
`@@ -420,6 +428,7 @@ def merge_test_results(`
`420`	`428`	`if result_bin.verification_type`
`421`	`429`	`else None,`
`422`	`430`	`stdout=xml_result.stdout,`
	`431`	`+ throughput=None,`
`423`	`432`	`)`
`424`	`433`	`)`
`425`	`434`	`elif xml_results.test_results[0].id.iteration_id is not None:`
`@@ -450,6 +459,7 @@ def merge_test_results(`
`450`	`459`	`if bin_result.verification_type`
`451`	`460`	`else None,`
`452`	`461`	`stdout=xml_result.stdout,`
	`462`	`+ throughput=None,`
`453`	`463`	`)`
`454`	`464`	`)`
`455`	`465`	`else:`
`@@ -477,6 +487,7 @@ def merge_test_results(`
`477`	`487`	`if bin_result.verification_type`
`478`	`488`	`else None,`
`479`	`489`	`stdout=xml_result.stdout,`
	`490`	`+ throughput=None,`
`480`	`491`	`)`
`481`	`492`	`)`
`482`	`493`