cleanup

KRRT7 · KRRT7 · commit 4866d82b1507 · 2025-10-25T23:45:19.000-05:00
diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
@@ -1,4 +1,5 @@
 import sys
+from collections import defaultdict
 
 from codeflash.cli_cmds.console import logger
 from codeflash.models.models import TestResults, TestType, VerificationType
@@ -14,14 +15,47 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
     original_recursion_limit = sys.getrecursionlimit()
     if original_recursion_limit < INCREASED_RECURSION_LIMIT:
         sys.setrecursionlimit(INCREASED_RECURSION_LIMIT)  # Increase recursion limit to avoid RecursionError
+
+    # Separate Hypothesis tests from other test types for semantic comparison
+    # Hypothesis tests are always compared semantically (by test function, not example count)
+    original_hypothesis = [
+        r for r in original_results.test_results if r.test_type == TestType.HYPOTHESIS_TEST and r.loop_index == 1
+    ]
+    candidate_hypothesis = [
+        r for r in candidate_results.test_results if r.test_type == TestType.HYPOTHESIS_TEST and r.loop_index == 1
+    ]
+
+    # Compare Hypothesis tests semantically if any are present
+    if original_hypothesis or candidate_hypothesis:
+        logger.debug(
+            f"Comparing Hypothesis tests: original={len(original_hypothesis)} examples, "
+            f"candidate={len(candidate_hypothesis)} examples"
+        )
+        hypothesis_equal = _compare_hypothesis_tests_semantic(original_hypothesis, candidate_hypothesis)
+        if not hypothesis_equal:
+            logger.info("Hypothesis comparison failed")
+            sys.setrecursionlimit(original_recursion_limit)
+            return False
+        logger.debug("Hypothesis comparison passed")
+
     test_ids_superset = original_results.get_all_unique_invocation_loop_ids().union(
         set(candidate_results.get_all_unique_invocation_loop_ids())
     )
+    logger.debug(f"Total test IDs in superset: {len(test_ids_superset)}")
     are_equal: bool = True
     did_all_timeout: bool = True
     for test_id in test_ids_superset:
         original_test_result = original_results.get_by_unique_invocation_loop_id(test_id)
         cdd_test_result = candidate_results.get_by_unique_invocation_loop_id(test_id)
+
+        # Skip Hypothesis tests - already compared semantically above
+        if original_test_result and original_test_result.test_type == TestType.HYPOTHESIS_TEST:
+            did_all_timeout = False  # Hypothesis tests are checked separately, not timed out
+            continue
+        if cdd_test_result and cdd_test_result.test_type == TestType.HYPOTHESIS_TEST:
+            did_all_timeout = False
+            continue
+
         if cdd_test_result is not None and original_test_result is None:
             continue
         # If helper function instance_state verification is not present, that's ok. continue
@@ -33,6 +67,11 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
             continue
         if original_test_result is None or cdd_test_result is None:
             are_equal = False
+            logger.debug(
+                f"Test result mismatch: test_id={test_id}, "
+                f"original_present={original_test_result is not None}, "
+                f"candidate_present={cdd_test_result is not None}"
+            )
             break
         did_all_timeout = did_all_timeout and original_test_result.timed_out
         if original_test_result.timed_out:
@@ -80,5 +119,89 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
             break
     sys.setrecursionlimit(original_recursion_limit)
     if did_all_timeout:
+        logger.debug("Comparison failed: all tests timed out")
         return False
+    logger.debug(f"Final comparison result: are_equal={are_equal}")
     return are_equal
+
+
+def _compare_hypothesis_tests_semantic(original_hypothesis: list, candidate_hypothesis: list) -> bool:
+    """Compare Hypothesis tests by test function, not by example count.
+
+    Hypothesis can generate different numbers of examples between runs due to:
+    - Timing differences
+    - Early stopping
+    - Shrinking behavior
+    - Performance differences
+
+    What matters is whether the test functions themselves pass or fail,
+    not how many examples Hypothesis generated.
+
+    Args:
+        original_hypothesis: Hypothesis test results recorded for the original code.
+        candidate_hypothesis: Hypothesis test results recorded for the candidate code.
+
+    Returns:
+        True if every test function in the original results is also present in the
+        candidate results with the same pass/fail outcome, otherwise False. Test
+        functions present only in the candidate results are not checked.
+    """
+
+    def get_test_key(test_result):
+        """Return the key identifying a test function (loop index and iteration_id are excluded)."""
+        return (
+            test_result.id.test_module_path,
+            test_result.id.test_class_name,
+            test_result.id.test_function_name,
+            test_result.id.function_getting_tested,
+        )
+
+    # Group every recorded example under the test function that produced it.
+    original_by_func = defaultdict(list)
+    for result in original_hypothesis:
+        original_by_func[get_test_key(result)].append(result)
+
+    candidate_by_func = defaultdict(list)
+    for result in candidate_hypothesis:
+        candidate_by_func[get_test_key(result)].append(result)
+
+    orig_total_examples = sum(len(examples) for examples in original_by_func.values())
+    cand_total_examples = sum(len(examples) for examples in candidate_by_func.values())
+    logger.debug(
+        f"Hypothesis comparison: Original={len(original_by_func)} test functions ({orig_total_examples} examples), "
+        f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)"
+    )
+
+    # Every original test function must exist in the candidate. Collect the absentees in
+    # the original's insertion order so the logged "first missing" key is deterministic.
+    missing_funcs = [key for key in original_by_func if key not in candidate_by_func]
+    if missing_funcs:
+        logger.warning(
+            f"Hypothesis test functions missing in candidate: {len(missing_funcs)} functions. "
+            f"First missing: {missing_funcs[0]}"
+        )
+        return False
+
+    for test_key, orig_examples in original_by_func.items():
+        cand_examples = candidate_by_func[test_key]  # key exists: absentees returned above
+        # A test function counts as failed when any of its examples failed.
+        orig_had_failure = any(not ex.did_pass for ex in orig_examples)
+        cand_had_failure = any(not ex.did_pass for ex in cand_examples)
+
+        # Outcomes must agree; the number of examples is allowed to differ.
+        if orig_had_failure != cand_had_failure:
+            logger.debug(
+                f"Hypothesis test function behavior mismatch: {test_key} "
+                f"(original_failed={orig_had_failure}, candidate_failed={cand_had_failure})"
+            )
+            return False
+
+        # Informational only: a gap of more than 10 examples is logged but never fails the comparison.
+        if abs(len(orig_examples) - len(cand_examples)) > 10:
+            logger.info(
+                f"Hypothesis test '{test_key[2]}': example counts differ "
+                f"(original={len(orig_examples)}, candidate={len(cand_examples)}). "
+                f"This is expected when code performance changes."
+            )
+
+    return True