Merge pull request codeflash-ai#517 from codeflash-ai/early-skip-if-insufficient-number-of-tests-passed

misrasaurabh1 · web-flow · commit 2812dd5bbc06 · 2025-07-05T17:58:50.000-07:00
Skip early if the original code baseline does not have a sufficient number of passing tests
diff --git a/codeflash/github/PrComment.py b/codeflash/github/PrComment.py
@@ -18,14 +18,14 @@ class PrComment:
     relative_file_path: str
     speedup_x: str
     speedup_pct: str
-    winning_behavioral_test_results: TestResults
+    winning_behavior_test_results: TestResults
     winning_benchmarking_test_results: TestResults
     benchmark_details: Optional[list[BenchmarkDetail]] = None
 
     def to_json(self) -> dict[str, Union[dict[str, dict[str, int]], int, str, Optional[list[BenchmarkDetail]]]]:
         report_table = {
             test_type.to_name(): result
-            for test_type, result in self.winning_behavioral_test_results.get_test_pass_fail_report_by_type().items()
+            for test_type, result in self.winning_behavior_test_results.get_test_pass_fail_report_by_type().items()
             if test_type.to_name()
         }
 
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -78,7 +78,7 @@ class BestOptimization(BaseModel):
     helper_functions: list[FunctionSource]
     runtime: int
     replay_performance_gain: Optional[dict[BenchmarkKey, float]] = None
-    winning_behavioral_test_results: TestResults
+    winning_behavior_test_results: TestResults
     winning_benchmarking_test_results: TestResults
     winning_replay_benchmarking_test_results: Optional[TestResults] = None
 
@@ -278,7 +278,7 @@ class FunctionParent:
 
 
 class OriginalCodeBaseline(BaseModel):
-    behavioral_test_results: TestResults
+    behavior_test_results: TestResults
     benchmarking_test_results: TestResults
     replay_benchmarking_test_results: Optional[dict[BenchmarkKey, TestResults]] = None
     line_profile_results: dict
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -488,7 +488,7 @@ def determine_best_candidate(
                                 candidate=candidate,
                                 helper_functions=code_context.helper_functions,
                                 runtime=best_test_runtime,
-                                winning_behavioral_test_results=candidate_result.behavior_test_results,
+                                winning_behavior_test_results=candidate_result.behavior_test_results,
                                 replay_performance_gain=replay_perf_gain if self.args.benchmark else None,
                                 winning_benchmarking_test_results=candidate_result.benchmarking_test_results,
                                 winning_replay_benchmarking_test_results=candidate_result.benchmarking_test_results,
@@ -575,7 +575,7 @@ def log_successful_optimization(
                 "original_runtime": explanation.original_runtime_ns,
                 "winning_test_results": {
                     tt.to_name(): v
-                    for tt, v in explanation.winning_behavioral_test_results.get_test_pass_fail_report_by_type().items()
+                    for tt, v in explanation.winning_behavior_test_results.get_test_pass_fail_report_by_type().items()
                 },
             },
         )
@@ -898,8 +898,9 @@ def setup_and_establish_baseline(
             return Failure(baseline_result.failure())
 
         original_code_baseline, test_functions_to_remove = baseline_result.unwrap()
-        if isinstance(original_code_baseline, OriginalCodeBaseline) and not coverage_critic(
-            original_code_baseline.coverage_results, self.args.test_framework
+        if isinstance(original_code_baseline, OriginalCodeBaseline) and (
+            not coverage_critic(original_code_baseline.coverage_results, self.args.test_framework)
+            or not quantity_of_tests_critic(original_code_baseline)
         ):
             if self.args.override_fixtures:
                 restore_conftest(original_conftest_content)
@@ -971,7 +972,7 @@ def find_and_process_best_optimization(
                     )
                 explanation = Explanation(
                     raw_explanation_message=best_optimization.candidate.explanation,
-                    winning_behavioral_test_results=best_optimization.winning_behavioral_test_results,
+                    winning_behavior_test_results=best_optimization.winning_behavior_test_results,
                     winning_benchmarking_test_results=best_optimization.winning_benchmarking_test_results,
                     original_runtime_ns=original_code_baseline.runtime,
                     best_runtime_ns=best_optimization.runtime,
@@ -1203,7 +1204,7 @@ def establish_original_code_baseline(
             return Success(
                 (
                     OriginalCodeBaseline(
-                        behavioral_test_results=behavioral_results,
+                        behavior_test_results=behavioral_results,
                         benchmarking_test_results=benchmarking_results,
                         replay_benchmarking_test_results=replay_benchmarking_test_results
                         if self.args.benchmark
@@ -1267,7 +1268,7 @@ def run_optimized_candidate(
                 )
             )
             console.rule()
-            if compare_test_results(baseline_results.behavioral_test_results, candidate_behavior_results):
+            if compare_test_results(baseline_results.behavior_test_results, candidate_behavior_results):
                 logger.info("Test results matched!")
                 console.rule()
             else:
diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py
@@ -163,7 +163,7 @@ def check_create_pr(
                 relative_file_path=relative_path,
                 speedup_x=explanation.speedup_x,
                 speedup_pct=explanation.speedup_pct,
-                winning_behavioral_test_results=explanation.winning_behavioral_test_results,
+                winning_behavior_test_results=explanation.winning_behavior_test_results,
                 winning_benchmarking_test_results=explanation.winning_benchmarking_test_results,
                 benchmark_details=explanation.benchmark_details,
             ),
@@ -210,7 +210,7 @@ def check_create_pr(
                 relative_file_path=relative_path,
                 speedup_x=explanation.speedup_x,
                 speedup_pct=explanation.speedup_pct,
-                winning_behavioral_test_results=explanation.winning_behavioral_test_results,
+                winning_behavior_test_results=explanation.winning_behavior_test_results,
                 winning_benchmarking_test_results=explanation.winning_benchmarking_test_results,
                 benchmark_details=explanation.benchmark_details,
             ),
diff --git a/codeflash/result/critic.py b/codeflash/result/critic.py
@@ -12,7 +12,7 @@
 from codeflash.models.models import TestType
 
 if TYPE_CHECKING:
-    from codeflash.models.models import CoverageData, OptimizedCandidateResult
+    from codeflash.models.models import CoverageData, OptimizedCandidateResult, OriginalCodeBaseline
 
 
 def performance_gain(*, original_runtime_ns: int, optimized_runtime_ns: int) -> float:
@@ -50,7 +50,7 @@ def speedup_critic(
     return bool(perf_gain > noise_floor and candidate_result.best_test_runtime < best_runtime_until_now)
 
 
-def quantity_of_tests_critic(candidate_result: OptimizedCandidateResult) -> bool:
+def quantity_of_tests_critic(candidate_result: OptimizedCandidateResult | OriginalCodeBaseline) -> bool:
     test_results = candidate_result.behavior_test_results
     report = test_results.get_test_pass_fail_report_by_type()
 
diff --git a/codeflash/result/explanation.py b/codeflash/result/explanation.py
@@ -16,7 +16,7 @@
 @dataclass(frozen=True, config={"arbitrary_types_allowed": True})
 class Explanation:
     raw_explanation_message: str
-    winning_behavioral_test_results: TestResults
+    winning_behavior_test_results: TestResults
     winning_benchmarking_test_results: TestResults
     original_runtime_ns: int
     best_runtime_ns: int
@@ -93,7 +93,7 @@ def to_console_string(self) -> str:
             + self.raw_explanation_message
             + " \n\n"
             + "The new optimized code was tested for correctness. The results are listed below.\n"
-            + f"{TestResults.report_to_string(self.winning_behavioral_test_results.get_test_pass_fail_report_by_type())}\n"
+            + f"{TestResults.report_to_string(self.winning_behavior_test_results.get_test_pass_fail_report_by_type())}\n"
         )
 
     def explanation_message(self) -> str: