cleanup strategies

KRRT7 · KRRT7 · commit 19fc55750933 · 2025-10-26T02:08:40.000-05:00
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -240,6 +240,7 @@ def __init__(
         self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {}
         self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {}
         self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None
+        self.hypothesis_tests_dir: Path | None = None
         self.generate_and_instrument_tests_results: (
             tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet] | None
         ) = None
@@ -1147,7 +1148,11 @@ def generate_tests_and_optimizations(
             generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
         )
         future_hypothesis_tests = self.executor.submit(
-            generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
+            generate_hypothesis_tests,
+            self.test_cfg,
+            self.args,
+            self.function_to_optimize,
+            self.function_to_optimize_ast,
         )
         futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests]
         if run_experiment:
@@ -1201,7 +1206,8 @@ def generate_tests_and_optimizations(
             logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
             return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
         function_to_concolic_tests, concolic_test_str = future_concolic_tests.result()
-        function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result()
+        function_to_hypothesis_tests, hypothesis_test_str, hypothesis_test_suite_dir = future_hypothesis_tests.result()
+        self.hypothesis_tests_dir = hypothesis_test_suite_dir
 
         count_tests = len(tests)
         if concolic_test_str:
@@ -2051,7 +2057,11 @@ def cleanup_generated_files(self) -> None:
             paths_to_cleanup.append(test_file.instrumented_behavior_file_path)
             paths_to_cleanup.append(test_file.benchmarking_file_path)
 
+        if self.hypothesis_tests_dir and self.hypothesis_tests_dir.exists():
+            paths_to_cleanup.append(self.hypothesis_tests_dir)
+
         cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dir = None
 
     def get_test_env(
         self, codeflash_loop_index: int, codeflash_test_iteration: int, codeflash_tracer_disable: int = 1
diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py
@@ -53,6 +53,7 @@ def __init__(self, args: Namespace) -> None:
         self.experiment_id = os.getenv("CODEFLASH_EXPERIMENT_ID", None)
         self.local_aiservice_client = LocalAiServiceClient() if self.experiment_id else None
         self.replay_tests_dir = None
+        self.hypothesis_tests_dirs: list[Path] = []  # Track all hypothesis test directories
         self.functions_checkpoint: CodeflashRunCheckpoint | None = None
         self.current_function_being_optimized: FunctionToOptimize | None = None  # current only for the LSP
         self.current_function_optimizer: FunctionOptimizer | None = None
@@ -337,6 +338,8 @@ def run(self) -> None:
                             function_optimizer  # needed to clean up from the outside of this function
                         )
                         best_optimization = function_optimizer.optimize_function()
+                        if function_optimizer.hypothesis_tests_dir:
+                            self.hypothesis_tests_dirs.append(function_optimizer.hypothesis_tests_dir)
                         if self.functions_checkpoint:
                             self.functions_checkpoint.add_function_to_checkpoint(
                                 function_to_optimize.qualified_name_with_modules_from_root(self.args.project_root)
@@ -430,7 +433,11 @@ def cleanup_temporary_paths(self) -> None:
 
         if self.current_function_optimizer:
             self.current_function_optimizer.cleanup_generated_files()
-        cleanup_paths([self.test_cfg.concolic_test_root_dir, self.replay_tests_dir])
+
+        paths_to_cleanup = [self.test_cfg.concolic_test_root_dir, self.replay_tests_dir]
+        paths_to_cleanup.extend(self.hypothesis_tests_dirs)
+        cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dirs.clear()
 
     def worktree_mode(self) -> None:
         if self.current_worktree:
diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
@@ -167,16 +167,6 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st
         f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)"
     )
 
-    # Check if all test functions in original are present in candidate
-    missing_funcs = set(original_by_func.keys()) - set(candidate_by_func.keys())
-    if missing_funcs:
-        logger.warning(
-            f"Hypothesis test functions missing in candidate: {len(missing_funcs)} functions. "
-            f"First missing: {missing_funcs.__iter__().__next__()}"
-        )
-        return False
-
-    # Compare each test function's results
     for test_key in original_by_func:
         if test_key not in candidate_by_func:
             continue  # Test function has no candidate results; skip it (no longer treated as a mismatch)
@@ -196,12 +186,4 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st
                 f"(original_failed={orig_had_failure}, candidate_failed={cand_had_failure})"
             )
             return False
-
-        if abs(len(orig_examples) - len(cand_examples)) > 10:
-            logger.info(
-                f"Hypothesis test '{test_key[2]}': example counts differ "
-                f"(original={len(orig_examples)}, candidate={len(cand_examples)}). "
-                f"This is expected when code performance changes."
-            )
-
     return True
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
@@ -182,7 +182,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
 
 def generate_hypothesis_tests(
     test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
-) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], str, Path | None]:
     """Generate property-based tests using Hypothesis ghostwriter.
 
     This function:
@@ -193,12 +193,14 @@ def generate_hypothesis_tests(
     5. Formats the tests with the project formatter
 
     Returns:
-        Tuple of (function_to_tests_map, test_suite_code)
+        Tuple of (function_to_tests_map, test_suite_code, hypothesis_test_suite_dir)
+        The hypothesis_test_suite_dir is None if no tests were generated.
 
     """
     start_time = time.perf_counter()
     function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {}
     hypothesis_test_suite_code: str = ""
+    hypothesis_test_suite_dir: Path | None = None
 
     if (
         test_cfg.project_root_path
@@ -212,8 +214,6 @@ def generate_hypothesis_tests(
             qualified_function_path = get_qualified_function_path(
                 function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name
             )
-            logger.info(f"command: hypothesis write {qualified_function_path}")
-
             hypothesis_result = subprocess.run(
                 ["hypothesis", "write", qualified_function_path],
                 capture_output=True,
@@ -226,11 +226,11 @@ def generate_hypothesis_tests(
             logger.debug("Hypothesis test generation timed out")
             end_time = time.perf_counter()
             logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         if hypothesis_result.returncode == 0:
             hypothesis_test_suite_code = hypothesis_result.stdout
-            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root))
+            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(prefix="codeflash_hypothesis_", dir=test_cfg.tests_root))
             hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py"
             hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8")
 
@@ -250,12 +250,11 @@ def generate_hypothesis_tests(
 
             unparsed = filter_hypothesis_tests_by_function_name(original_code, function_to_optimize.function_name)
 
-            console.print(f"modified src: {unparsed}")
-
             hypothesis_test_suite_code = format_code(
                 args.formatter_cmds,
                 hypothesis_path,
                 optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
+                print_status=False,
             )
             with hypothesis_path.open("w", encoding="utf-8") as f:
                 f.write(hypothesis_test_suite_code)
@@ -269,7 +268,7 @@ def generate_hypothesis_tests(
             console.rule()
             end_time = time.perf_counter()
             logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         logger.debug(
             f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}"
@@ -278,4 +277,4 @@ def generate_hypothesis_tests(
 
     end_time = time.perf_counter()
     logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-    return function_to_hypothesis_tests, hypothesis_test_suite_code
+    return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir