fix: track and clean up hypothesis test temp directories

KRRT7 · KRRT7 · commit dfb3927c9c81 · 2025-10-26T01:44:38.000-05:00
- Modified generate_hypothesis_tests() to return the temp directory Path
- Added hypothesis_tests_dir tracking in FunctionOptimizer
- Extended cleanup_generated_files() to remove hypothesis test directories
- Added hypothesis_tests_dirs list in Optimizer to track all directories
- Updated cleanup_temporary_paths() to clean up hypothesis test directories
- Ensures cleanup on success, errors, and KeyboardInterrupt
- Changed temp dir prefix to 'codeflash_hypothesis_' for clarity
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -240,6 +240,7 @@ def __init__(
         self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {}
         self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {}
         self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None
+        self.hypothesis_tests_dir: Path | None = None
         self.generate_and_instrument_tests_results: (
             tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet] | None
         ) = None
@@ -1147,7 +1148,11 @@ def generate_tests_and_optimizations(
             generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
         )
         future_hypothesis_tests = self.executor.submit(
-            generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
+            generate_hypothesis_tests,
+            self.test_cfg,
+            self.args,
+            self.function_to_optimize,
+            self.function_to_optimize_ast,
         )
         futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests]
         if run_experiment:
@@ -1201,7 +1206,8 @@ def generate_tests_and_optimizations(
             logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
             return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
         function_to_concolic_tests, concolic_test_str = future_concolic_tests.result()
-        function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result()
+        function_to_hypothesis_tests, hypothesis_test_str, hypothesis_test_suite_dir = future_hypothesis_tests.result()
+        self.hypothesis_tests_dir = hypothesis_test_suite_dir
 
         count_tests = len(tests)
         if concolic_test_str:
@@ -2051,7 +2057,12 @@ def cleanup_generated_files(self) -> None:
             paths_to_cleanup.append(test_file.instrumented_behavior_file_path)
             paths_to_cleanup.append(test_file.benchmarking_file_path)
 
+        # Add hypothesis test directory to cleanup
+        if self.hypothesis_tests_dir and self.hypothesis_tests_dir.exists():
+            paths_to_cleanup.append(self.hypothesis_tests_dir)
+
         cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dir = None
 
     def get_test_env(
         self, codeflash_loop_index: int, codeflash_test_iteration: int, codeflash_tracer_disable: int = 1
diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py
@@ -53,6 +53,7 @@ def __init__(self, args: Namespace) -> None:
         self.experiment_id = os.getenv("CODEFLASH_EXPERIMENT_ID", None)
         self.local_aiservice_client = LocalAiServiceClient() if self.experiment_id else None
         self.replay_tests_dir = None
+        self.hypothesis_tests_dirs: list[Path] = []  # Track all hypothesis test directories
         self.functions_checkpoint: CodeflashRunCheckpoint | None = None
         self.current_function_being_optimized: FunctionToOptimize | None = None  # current only for the LSP
         self.current_function_optimizer: FunctionOptimizer | None = None
@@ -337,6 +338,9 @@ def run(self) -> None:
                             function_optimizer  # needed to clean up from the outside of this function
                         )
                         best_optimization = function_optimizer.optimize_function()
+                        # Track hypothesis test directory for cleanup
+                        if function_optimizer.hypothesis_tests_dir:
+                            self.hypothesis_tests_dirs.append(function_optimizer.hypothesis_tests_dir)
                         if self.functions_checkpoint:
                             self.functions_checkpoint.add_function_to_checkpoint(
                                 function_to_optimize.qualified_name_with_modules_from_root(self.args.project_root)
@@ -430,7 +434,12 @@ def cleanup_temporary_paths(self) -> None:
 
         if self.current_function_optimizer:
             self.current_function_optimizer.cleanup_generated_files()
-        cleanup_paths([self.test_cfg.concolic_test_root_dir, self.replay_tests_dir])
+
+        # Cleanup all temporary test directories
+        paths_to_cleanup = [self.test_cfg.concolic_test_root_dir, self.replay_tests_dir]
+        paths_to_cleanup.extend(self.hypothesis_tests_dirs)
+        cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dirs.clear()
 
     def worktree_mode(self) -> None:
         if self.current_worktree:
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
@@ -182,7 +182,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
 
 def generate_hypothesis_tests(
     test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
-) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], str, Path | None]:
     """Generate property-based tests using Hypothesis ghostwriter.
 
     This function:
@@ -193,12 +193,14 @@ def generate_hypothesis_tests(
     5. Formats the tests with the project formatter
 
     Returns:
-        Tuple of (function_to_tests_map, test_suite_code)
+        Tuple of (function_to_tests_map, test_suite_code, hypothesis_test_suite_dir)
+        The hypothesis_test_suite_dir is None if no tests were generated.
 
     """
     start_time = time.perf_counter()
     function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {}
     hypothesis_test_suite_code: str = ""
+    hypothesis_test_suite_dir: Path | None = None
 
     if (
         test_cfg.project_root_path
@@ -226,11 +228,11 @@ def generate_hypothesis_tests(
             logger.debug("Hypothesis test generation timed out")
             end_time = time.perf_counter()
             logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         if hypothesis_result.returncode == 0:
             hypothesis_test_suite_code = hypothesis_result.stdout
-            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root))
+            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(prefix="codeflash_hypothesis_", dir=test_cfg.tests_root))
             hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py"
             hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8")
 
@@ -269,7 +271,7 @@ def generate_hypothesis_tests(
             console.rule()
             end_time = time.perf_counter()
             logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         logger.debug(
             f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}"
@@ -278,4 +280,4 @@ def generate_hypothesis_tests(
 
     end_time = time.perf_counter()
     logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-    return function_to_hypothesis_tests, hypothesis_test_suite_code
+    return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir