From 389b32c9b2cfc944568c7022b5fab8332cdd6990 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Wed, 22 Oct 2025 02:30:17 -0500
Subject: [PATCH 01/16] first pass at Hypothesis (property-based test) integration

---
 codeflash/code_utils/code_utils.py           |   5 +
 codeflash/discovery/discover_unit_tests.py   |   4 +
 codeflash/models/test_type.py                |   2 +
 codeflash/optimization/function_optimizer.py |  77 +++++--
 codeflash/verification/concolic_testing.py   |  14 +-
 codeflash/verification/hypothesis_testing.py | 207 +++++++++++++++++++
 pyproject.toml                               |   1 +
 uv.lock                                      |  50 +++++
 8 files changed, 336 insertions(+), 24 deletions(-)
 create mode 100644 codeflash/verification/hypothesis_testing.py

diff --git a/codeflash/code_utils/code_utils.py b/codeflash/code_utils/code_utils.py
index 82e122d0e..8f1cca998 100644
--- a/codeflash/code_utils/code_utils.py
+++ b/codeflash/code_utils/code_utils.py
@@ -254,6 +254,11 @@ def module_name_from_file_path(file_path: Path, project_root_path: Path, *, trav
         raise ValueError(msg)  # noqa: B904
 
 
+def get_qualified_function_path(file_path: Path, project_root_path: Path, qualified_name: str) -> str:
+    module_path = file_path.relative_to(project_root_path).with_suffix("").as_posix().replace("/", ".")
+    return f"{module_path}.{qualified_name}"
+
+
 def file_path_from_module_name(module_name: str, project_root_path: Path) -> Path:
     """Get file path from module path."""
     return project_root_path / (module_name.replace(".", os.sep) + ".py")
diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py
index 398efe461..d7c1c0a83 100644
--- a/codeflash/discovery/discover_unit_tests.py
+++ b/codeflash/discovery/discover_unit_tests.py
@@ -497,6 +497,8 @@ def discover_tests_pytest(
             test_type = TestType.REPLAY_TEST
         elif "test_concolic_coverage" in test["test_file"]:
             test_type = TestType.CONCOLIC_COVERAGE_TEST
+        elif "test_hypothesis" in test["test_file"]:
+            test_type = TestType.HYPOTHESIS_TEST
         else:
             test_type = TestType.EXISTING_UNIT_TEST
 
@@ -540,6 +542,8 @@ def get_test_details(_test: unittest.TestCase) -> TestsInFile | None:
             test_type = TestType.REPLAY_TEST
         elif "test_concolic_coverage" in str(_test_module_path):
             test_type = TestType.CONCOLIC_COVERAGE_TEST
+        elif "test_hypothesis" in str(_test_module_path):
+            test_type = TestType.HYPOTHESIS_TEST
         else:
             test_type = TestType.EXISTING_UNIT_TEST
         return TestsInFile(
diff --git a/codeflash/models/test_type.py b/codeflash/models/test_type.py
index 103a3bc4d..f30089967 100644
--- a/codeflash/models/test_type.py
+++ b/codeflash/models/test_type.py
@@ -8,6 +8,7 @@ class TestType(Enum):
     REPLAY_TEST = 4
     CONCOLIC_COVERAGE_TEST = 5
     INIT_STATE_TEST = 6
+    HYPOTHESIS_TEST = 7
 
     def to_name(self) -> str:
         if self is TestType.INIT_STATE_TEST:
@@ -18,5 +19,6 @@ def to_name(self) -> str:
             TestType.GENERATED_REGRESSION: "🌀 Generated Regression Tests",
             TestType.REPLAY_TEST: "⏪ Replay Tests",
             TestType.CONCOLIC_COVERAGE_TEST: "🔎 Concolic Coverage Tests",
+            TestType.HYPOTHESIS_TEST: "🔮 Hypothesis Tests",
         }
         return names[self]
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 86e9bf33f..99aeed6d2 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -95,6 +95,7 @@
 from codeflash.telemetry.posthog_cf import ph
 from codeflash.verification.concolic_testing import generate_concolic_tests
 from codeflash.verification.equivalence import compare_test_results
+from codeflash.verification.hypothesis_testing import generate_hypothesis_tests
 from codeflash.verification.instrument_codeflash_capture import instrument_codeflash_capture
 from codeflash.verification.parse_line_profile_test_output import parse_line_profile_results
 from codeflash.verification.parse_test_output import calculate_function_throughput_from_test_results, parse_test_results
@@ -281,6 +282,8 @@ def generate_and_instrument_tests(
             GeneratedTestsList,
             dict[str, set[FunctionCalledInTest]],
             str,
+            dict[str, set[FunctionCalledInTest]],
+            str,
             OptimizationSet,
             list[Path],
             list[Path],
@@ -323,9 +326,15 @@ def generate_and_instrument_tests(
 
         generated_tests: GeneratedTestsList
         optimizations_set: OptimizationSet
-        count_tests, generated_tests, function_to_concolic_tests, concolic_test_str, optimizations_set = (
-            generated_results.unwrap()
-        )
+        (
+            count_tests,
+            generated_tests,
+            function_to_concolic_tests,
+            concolic_test_str,
+            function_to_hypothesis_tests,
+            hypothesis_test_str,
+            optimizations_set,
+        ) = generated_results.unwrap()
 
         for i, generated_test in enumerate(generated_tests.generated_tests):
             with generated_test.behavior_file_path.open("w", encoding="utf8") as f:
@@ -345,12 +354,19 @@ def generate_and_instrument_tests(
             logger.info(f"Generated test {i + 1}/{count_tests}:")
             code_print(generated_test.generated_original_test_source, file_name=f"test_{i + 1}.py")
         if concolic_test_str:
-            logger.info(f"Generated test {count_tests}/{count_tests}:")
+            logger.info(f"Generated test {count_tests - (1 if hypothesis_test_str else 0)}/{count_tests}:")
             code_print(concolic_test_str)
+        if hypothesis_test_str:
+            logger.info(f"Generated test {count_tests}/{count_tests}:")
+            code_print(hypothesis_test_str)
 
         function_to_all_tests = {
-            key: self.function_to_tests.get(key, set()) | function_to_concolic_tests.get(key, set())
-            for key in set(self.function_to_tests) | set(function_to_concolic_tests)
+            key: (
+                self.function_to_tests.get(key, set())
+                | function_to_concolic_tests.get(key, set())
+                | function_to_hypothesis_tests.get(key, set())
+            )
+            for key in set(self.function_to_tests) | set(function_to_concolic_tests) | set(function_to_hypothesis_tests)
         }
         instrumented_unittests_created_for_function = self.instrument_existing_tests(function_to_all_tests)
 
@@ -366,6 +382,8 @@ def generate_and_instrument_tests(
                 generated_tests,
                 function_to_concolic_tests,
                 concolic_test_str,
+                function_to_hypothesis_tests,
+                hypothesis_test_str,
                 optimizations_set,
                 generated_test_paths,
                 generated_perf_test_paths,
@@ -398,6 +416,8 @@ def optimize_function(self) -> Result[BestOptimization, str]:
             generated_tests,
             function_to_concolic_tests,
             concolic_test_str,
+            function_to_hypothesis_tests,
+            _hypothesis_test_str,
             optimizations_set,
             generated_test_paths,
             generated_perf_test_paths,
@@ -409,6 +429,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:
             code_context=code_context,
             original_helper_code=original_helper_code,
             function_to_concolic_tests=function_to_concolic_tests,
+            function_to_hypothesis_tests=function_to_hypothesis_tests,
             generated_test_paths=generated_test_paths,
             generated_perf_test_paths=generated_perf_test_paths,
             instrumented_unittests_created_for_function=instrumented_unittests_created_for_function,
@@ -991,6 +1012,7 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio
         existing_test_files_count = 0
         replay_test_files_count = 0
         concolic_coverage_test_files_count = 0
+        hypothesis_test_files_count = 0
         unique_instrumented_test_files = set()
 
         func_qualname = self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root)
@@ -1011,6 +1033,8 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio
                     replay_test_files_count += 1
                 elif test_type == TestType.CONCOLIC_COVERAGE_TEST:
                     concolic_coverage_test_files_count += 1
+                elif test_type == TestType.HYPOTHESIS_TEST:
+                    hypothesis_test_files_count += 1
                 else:
                     msg = f"Unexpected test type: {test_type}"
                     raise ValueError(msg)
@@ -1069,9 +1093,11 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio
             logger.info(
                 f"Discovered {existing_test_files_count} existing unit test file"
                 f"{'s' if existing_test_files_count != 1 else ''}, {replay_test_files_count} replay test file"
-                f"{'s' if replay_test_files_count != 1 else ''}, and "
+                f"{'s' if replay_test_files_count != 1 else ''}, "
                 f"{concolic_coverage_test_files_count} concolic coverage test file"
-                f"{'s' if concolic_coverage_test_files_count != 1 else ''} for {func_qualname}"
+                f"{'s' if concolic_coverage_test_files_count != 1 else ''}, and "
+                f"{hypothesis_test_files_count} hypothesis test file"
+                f"{'s' if hypothesis_test_files_count != 1 else ''} for {func_qualname}"
             )
             console.rule()
         return unique_instrumented_test_files
@@ -1085,7 +1111,15 @@ def generate_tests_and_optimizations(
         generated_test_paths: list[Path],
         generated_perf_test_paths: list[Path],
         run_experiment: bool = False,  # noqa: FBT001, FBT002
-    ) -> Result[tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet], str]:
+    ) -> Result[
+        tuple[
+            GeneratedTestsList,
+            dict[str, set[FunctionCalledInTest]],
+            dict[str, set[FunctionCalledInTest]],
+            OptimizationSet,
+        ],
+        str,
+    ]:
         n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
         assert len(generated_test_paths) == n_tests
         console.rule()
@@ -1112,7 +1146,10 @@ def generate_tests_and_optimizations(
         future_concolic_tests = self.executor.submit(
             generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
         )
-        futures = [*future_tests, future_optimization_candidates, future_concolic_tests]
+        future_hypothesis_tests = self.executor.submit(
+            generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
+        )
+        futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests]
         if run_experiment:
             future_candidates_exp = self.executor.submit(
                 self.local_aiservice_client.optimize_python_code,
@@ -1164,29 +1201,35 @@ def generate_tests_and_optimizations(
             logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
             return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
         function_to_concolic_tests, concolic_test_str = future_concolic_tests.result()
+        function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result()
 
         count_tests = len(tests)
         if concolic_test_str:
             count_tests += 1
+        if hypothesis_test_str:
+            count_tests += 1
 
         logger.info(f"Generated '{count_tests}' tests for {self.function_to_optimize.function_name}")
         console.rule()
         generated_tests = GeneratedTestsList(generated_tests=tests)
-        result = (
+
+        self.generate_and_instrument_tests_results = (
             count_tests,
             generated_tests,
             function_to_concolic_tests,
             concolic_test_str,
+            function_to_hypothesis_tests,
+            hypothesis_test_str,
             OptimizationSet(control=candidates, experiment=candidates_experiment),
         )
-        self.generate_and_instrument_tests_results = result
-        return Success(result)
+        return Success(self.generate_and_instrument_tests_results)
 
     def setup_and_establish_baseline(
         self,
         code_context: CodeOptimizationContext,
         original_helper_code: dict[Path, str],
         function_to_concolic_tests: dict[str, set[FunctionCalledInTest]],
+        function_to_hypothesis_tests: dict[str, set[FunctionCalledInTest]],
         generated_test_paths: list[Path],
         generated_perf_test_paths: list[Path],
         instrumented_unittests_created_for_function: set[Path],
@@ -1197,8 +1240,12 @@ def setup_and_establish_baseline(
         """Set up baseline context and establish original code baseline."""
         function_to_optimize_qualified_name = self.function_to_optimize.qualified_name
         function_to_all_tests = {
-            key: self.function_to_tests.get(key, set()) | function_to_concolic_tests.get(key, set())
-            for key in set(self.function_to_tests) | set(function_to_concolic_tests)
+            key: (
+                self.function_to_tests.get(key, set())
+                | function_to_concolic_tests.get(key, set())
+                | function_to_hypothesis_tests.get(key, set())
+            )
+            for key in set(self.function_to_tests) | set(function_to_concolic_tests) | set(function_to_hypothesis_tests)
         }
 
         # Get a dict of file_path_to_classes of fto and helpers_of_fto
diff --git a/codeflash/verification/concolic_testing.py b/codeflash/verification/concolic_testing.py
index 8f30a1562..e17f5c01f 100644
--- a/codeflash/verification/concolic_testing.py
+++ b/codeflash/verification/concolic_testing.py
@@ -8,6 +8,7 @@
 from typing import TYPE_CHECKING
 
 from codeflash.cli_cmds.console import console, logger
+from codeflash.code_utils.code_utils import get_qualified_function_path
 from codeflash.code_utils.compat import SAFE_SYS_EXECUTABLE
 from codeflash.code_utils.concolic_utils import clean_concolic_tests
 from codeflash.code_utils.static_analysis import has_typed_parameters
@@ -42,6 +43,9 @@ def generate_concolic_tests(
         logger.info("Generating concolic opcode coverage tests for the original code…")
         console.rule()
         try:
+            qualified_function_path = get_qualified_function_path(
+                function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name
+            )
             cover_result = subprocess.run(
                 [
                     SAFE_SYS_EXECUTABLE,
@@ -50,15 +54,7 @@ def generate_concolic_tests(
                     "cover",
                     "--example_output_format=pytest",
                     "--per_condition_timeout=20",
-                    ".".join(
-                        [
-                            function_to_optimize.file_path.relative_to(args.project_root)
-                            .with_suffix("")
-                            .as_posix()
-                            .replace("/", "."),
-                            function_to_optimize.qualified_name,
-                        ]
-                    ),
+                    qualified_function_path,
                 ],
                 capture_output=True,
                 text=True,
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
new file mode 100644
index 000000000..c716ca739
--- /dev/null
+++ b/codeflash/verification/hypothesis_testing.py
@@ -0,0 +1,207 @@
+from __future__ import annotations
+
+import ast
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from codeflash.cli_cmds.console import console, logger
+from codeflash.code_utils.code_utils import get_qualified_function_path
+from codeflash.code_utils.formatter import format_code
+from codeflash.code_utils.static_analysis import has_typed_parameters
+from codeflash.discovery.discover_unit_tests import discover_unit_tests
+from codeflash.verification.verification_utils import TestConfig
+
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+    from codeflash.discovery.functions_to_optimize import FunctionToOptimize
+    from codeflash.models.models import FunctionCalledInTest
+
+
+def remove_functions_with_only_any_type(code_string: str) -> str:
+    """Remove functions that have only Any type annotations.
+
+    This filters out functions where all parameters are annotated with typing.Any,
+    as these don't provide useful type information for property-based testing.
+    """
+    tree = ast.parse(code_string)
+    new_body = []
+
+    for node in tree.body:
+        if isinstance(node, (ast.Import, ast.ImportFrom)):
+            new_body.append(node)
+        elif isinstance(node, ast.FunctionDef):
+            all_any = True
+            has_args = False
+
+            for arg in node.args.args:
+                has_args = True
+                if arg.annotation:
+                    if isinstance(arg.annotation, ast.Name):
+                        if arg.annotation.id != "Any":
+                            all_any = False
+                    elif isinstance(arg.annotation, ast.Attribute):
+                        if arg.annotation.attr != "Any":
+                            all_any = False
+                    elif isinstance(arg.annotation, ast.Subscript):
+                        all_any = False
+                    else:
+                        all_any = False
+                else:
+                    all_any = False
+
+            if (has_args and not all_any) or not has_args:
+                new_body.append(node)
+
+        else:
+            new_body.append(node)
+
+    new_tree = ast.Module(body=new_body, type_ignores=[])
+    return ast.unparse(new_tree)
+
+
+def make_hypothesis_tests_deterministic(code: str) -> str:
+    """Add @settings(derandomize=True) decorator to make Hypothesis tests deterministic."""
+    try:
+        tree = ast.parse(code)
+    except SyntaxError:
+        return code
+
+    settings_imported = any(
+        isinstance(node, ast.ImportFrom)
+        and node.module == "hypothesis"
+        and any(alias.name == "settings" for alias in node.names)
+        for node in tree.body
+    )
+
+    if not settings_imported:
+        tree.body.insert(0, ast.parse("from hypothesis import settings").body[0])
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef):
+            settings_decorator = next(
+                (
+                    d
+                    for d in node.decorator_list
+                    if isinstance(d, ast.Call) and isinstance(d.func, ast.Name) and d.func.id == "settings"
+                ),
+                None,
+            )
+
+            if settings_decorator:
+                if not any(k.arg == "derandomize" for k in settings_decorator.keywords):
+                    settings_decorator.keywords.append(ast.keyword(arg="derandomize", value=ast.Constant(value=True)))
+            else:
+                node.decorator_list.append(
+                    ast.Call(
+                        func=ast.Name(id="settings", ctx=ast.Load()),
+                        args=[],
+                        keywords=[ast.keyword(arg="derandomize", value=ast.Constant(value=True))],
+                    )
+                )
+
+    return ast.unparse(tree)
+
+
+def generate_hypothesis_tests(
+    test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
+) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
+    """Generate property-based tests using Hypothesis ghostwriter.
+
+    This function:
+    1. Uses Hypothesis CLI to generate property-based tests for the target function
+    2. Filters generated tests to only include the target function
+    3. Removes functions with only Any type annotations
+    4. Makes tests deterministic by adding @settings(derandomize=True)
+    5. Formats the tests with the project formatter
+
+    Returns:
+        Tuple of (function_to_tests_map, test_suite_code)
+
+    """
+    start_time = time.perf_counter()
+    function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {}
+    hypothesis_test_suite_code: str = ""
+
+    if (
+        test_cfg.project_root_path
+        and isinstance(function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef))
+        and has_typed_parameters(function_to_optimize_ast, function_to_optimize.parents)
+    ):
+        logger.info("Generating Hypothesis tests for the original code…")
+        console.rule()
+
+        try:
+            qualified_function_path = get_qualified_function_path(
+                function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name
+            )
+            logger.info(f"command: hypothesis write {function_to_optimize.file_path.stem}")
+
+            hypothesis_result = subprocess.run(
+                ["hypothesis", "write", qualified_function_path],
+                capture_output=True,
+                text=True,
+                cwd=args.project_root,
+                check=False,
+                timeout=60,
+            )
+        except subprocess.TimeoutExpired:
+            logger.debug("Hypothesis test generation timed out")
+            end_time = time.perf_counter()
+            logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
+            return function_to_hypothesis_tests, hypothesis_test_suite_code
+
+        if hypothesis_result.returncode == 0:
+            hypothesis_test_suite_code = hypothesis_result.stdout
+            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root))
+            hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py"
+            hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8")
+
+            hypothesis_config = TestConfig(
+                tests_root=hypothesis_test_suite_dir,
+                tests_project_rootdir=test_cfg.tests_project_rootdir,
+                project_root_path=args.project_root,
+                test_framework=args.test_framework,
+                pytest_cmd=args.pytest_cmd,
+            )
+            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config)
+            with hypothesis_path.open("r", encoding="utf-8") as f:
+                tree = ast.parse(f.read())
+
+            class TestFunctionRemover(ast.NodeTransformer):
+                def visit_FunctionDef(self, node):  # noqa: ANN001, ANN202
+                    if function_to_optimize.function_name not in node.name:
+                        return None
+                    return node
+
+            modified_tree = TestFunctionRemover().visit(tree)
+            ast.fix_missing_locations(modified_tree)
+            unparsed = ast.unparse(modified_tree)
+
+            hypothesis_test_suite_code = format_code(
+                make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
+                function_to_optimize.file_path,
+            )
+            with hypothesis_path.open("w", encoding="utf-8") as f:
+                f.write(hypothesis_test_suite_code)
+            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config)
+            logger.info(
+                f"Created {num_discovered_hypothesis_tests} "
+                f"hypothesis unit test case{'s' if num_discovered_hypothesis_tests != 1 else ''} "
+            )
+            console.rule()
+            end_time = time.perf_counter()
+            logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds")
+            return function_to_hypothesis_tests, hypothesis_test_suite_code
+
+        logger.debug(
+            f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}"
+        )
+        console.rule()
+
+    end_time = time.perf_counter()
+    logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
+    return function_to_hypothesis_tests, hypothesis_test_suite_code
diff --git a/pyproject.toml b/pyproject.toml
index 1186574c0..911b2728f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ dependencies = [
     "pygls>=2.0.0,<3.0.0",
     "codeflash-benchmark",
     "filelock",
+    "hypothesis>=6.141.1",
 ]
 
 [project.urls]
diff --git a/uv.lock b/uv.lock
index 0d99bdf15..6d3800cbd 100644
--- a/uv.lock
+++ b/uv.lock
@@ -309,6 +309,8 @@ dependencies = [
     { name = "filelock", version = "3.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "gitpython" },
     { name = "humanize" },
+    { name = "hypothesis", version = "6.141.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+    { name = "hypothesis", version = "6.142.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "inquirer", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9.2'" },
     { name = "inquirer", version = "3.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9.2'" },
     { name = "isort", version = "6.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
@@ -399,6 +401,7 @@ requires-dist = [
     { name = "filelock" },
     { name = "gitpython", specifier = ">=3.1.31" },
     { name = "humanize", specifier = ">=4.0.0" },
+    { name = "hypothesis", specifier = ">=6.141.1" },
     { name = "inquirer", specifier = ">=3.0.0" },
     { name = "isort", specifier = ">=5.11.0" },
     { name = "jedi", specifier = ">=0.19.1" },
@@ -791,6 +794,44 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1e/c7/316e7ca04d26695ef0635dc81683d628350810eb8e9b2299fc08ba49f366/humanize-4.13.0-py3-none-any.whl", hash = "sha256:b810820b31891813b1673e8fec7f1ed3312061eab2f26e3fa192c393d11ed25f", size = 128869, upload-time = "2025-08-25T09:39:18.54Z" },
 ]
 
+[[package]]
+name = "hypothesis"
+version = "6.141.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.9.2' and python_full_version < '3.10'",
+    "python_full_version < '3.9.2'",
+]
+dependencies = [
+    { name = "attrs", marker = "python_full_version < '3.10'" },
+    { name = "exceptiongroup", marker = "python_full_version < '3.10'" },
+    { name = "sortedcontainers", marker = "python_full_version < '3.10'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/85/20/8aa62b3e69fea68bb30d35d50be5395c98979013acd8152d64dc927e4cdb/hypothesis-6.141.1.tar.gz", hash = "sha256:8ef356e1e18fbeaa8015aab3c805303b7fe4b868e5b506e87ad83c0bf951f46f", size = 467389, upload-time = "2025-10-15T19:12:25.262Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/9a/f901858f139694dd669776983781b08a7c1717911025da6720e526bd8ce3/hypothesis-6.141.1-py3-none-any.whl", hash = "sha256:a5b3c39c16d98b7b4c3c5c8d4262e511e3b2255e6814ced8023af49087ad60b3", size = 535000, upload-time = "2025-10-15T19:12:21.659Z" },
+]
+
+[[package]]
+name = "hypothesis"
+version = "6.142.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.13'",
+    "python_full_version == '3.12.*'",
+    "python_full_version == '3.11.*'",
+    "python_full_version == '3.10.*'",
+]
+dependencies = [
+    { name = "attrs", marker = "python_full_version >= '3.10'" },
+    { name = "exceptiongroup", marker = "python_full_version == '3.10.*'" },
+    { name = "sortedcontainers", marker = "python_full_version >= '3.10'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/47/83/8f76d7c965beb4d3a65d188232c32db97b0799b0e893227d520d5d2a0144/hypothesis-6.142.2.tar.gz", hash = "sha256:c4204a2ce327e45fbaf83a2b58142a285135698dc1d08e368ae9901f06b49e64", size = 465987, upload-time = "2025-10-20T16:08:20.225Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9b/8f/194d63f715c7b0ace35b4f2a83b756d5bc703299b706c401b7ec593054fc/hypothesis-6.142.2-py3-none-any.whl", hash = "sha256:cc6c6e66c06aff695dd255501a767b528e00d84ce3572160425a9ba5e4a47845", size = 533375, upload-time = "2025-10-20T16:08:16.903Z" },
+]
+
 [[package]]
 name = "identify"
 version = "2.6.15"
@@ -3275,6 +3316,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" },
 ]
 
+[[package]]
+name = "sortedcontainers"
+version = "2.4.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
+]
+
 [[package]]
 name = "stack-data"
 version = "0.6.3"

From c71d2dac81f46f8f786376abc0d6450af81d0b9a Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Fri, 24 Oct 2025 21:32:50 -0500
Subject: [PATCH 02/16] tidy up: harden formatter check, fix format_code call

---
 codeflash/code_utils/env_utils.py            | 52 ++++++++++++++------
 codeflash/verification/hypothesis_testing.py |  5 +-
 2 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/codeflash/code_utils/env_utils.py b/codeflash/code_utils/env_utils.py
index 08b1fc0da..749c49676 100644
--- a/codeflash/code_utils/env_utils.py
+++ b/codeflash/code_utils/env_utils.py
@@ -2,6 +2,8 @@
 
 import json
 import os
+import shlex
+import shutil
 import tempfile
 from functools import lru_cache
 from pathlib import Path
@@ -14,21 +16,41 @@
 
 
 def check_formatter_installed(formatter_cmds: list[str], exit_on_failure: bool = True) -> bool:  # noqa
-    return_code = True
-    if formatter_cmds[0] == "disabled":
-        return return_code
+    if not formatter_cmds or formatter_cmds[0] == "disabled":
+        return True
+
+    first_cmd = formatter_cmds[0]
+    cmd_tokens = shlex.split(first_cmd) if isinstance(first_cmd, str) else [first_cmd]
+
+    if not cmd_tokens:
+        return True
+
+    exe_name = cmd_tokens[0]
+    command_str = " ".join(formatter_cmds).replace(" $file", "")
+
+    if shutil.which(exe_name) is None:
+        logger.error(
+            f"Could not find formatter: {command_str}\n"
+            f"Please install it or update 'formatter-cmds' in your codeflash configuration"
+        )
+        return False
+
     tmp_code = """print("hello world")"""
-    with tempfile.TemporaryDirectory() as tmpdir:
-        tmp_file = Path(tmpdir) / "test_codeflash_formatter.py"
-        tmp_file.write_text(tmp_code, encoding="utf-8")
-        try:
-            format_code(formatter_cmds, tmp_file, print_status=False, exit_on_failure=exit_on_failure)
-        except Exception:
-            exit_with_message(
-                "⚠️ Codeflash requires a code formatter to be installed in your environment, but none was found. Please install a supported formatter, verify the formatter-cmds in your codeflash pyproject.toml config and try again.",
-                error_on_exit=True,
-            )
-        return return_code
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_file = Path(tmpdir) / "test_codeflash_formatter.py"
+            tmp_file.write_text(tmp_code, encoding="utf-8")
+            format_code(formatter_cmds, tmp_file, print_status=False, exit_on_failure=False)
+            return True
+    except FileNotFoundError:
+        logger.error(
+            f"Could not find formatter: {command_str}\n"
+            f"Please install it or update 'formatter-cmds' in your codeflash configuration"
+        )
+        return False
+    except Exception as e:
+        logger.error(f"Formatter failed to run: {command_str}\nError: {e}")
+        return False
 
 
 @lru_cache(maxsize=1)
@@ -138,4 +160,4 @@ def is_ci() -> bool:
 def is_pr_draft() -> bool:
     """Check if the PR is draft. in the github action context."""
     event = get_cached_gh_event_data()
-    return bool(event.get("pull_request", {}).get("draft", False))
+    return bool(event.get("pull_request", {}).get("draft", False))
\ No newline at end of file
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index c716ca739..fe681a619 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -138,7 +138,7 @@ def generate_hypothesis_tests(
             qualified_function_path = get_qualified_function_path(
                 function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name
             )
-            logger.info(f"command: hypothesis write {function_to_optimize.file_path.stem}")
+            logger.info(f"command: hypothesis write {qualified_function_path}")
 
             hypothesis_result = subprocess.run(
                 ["hypothesis", "write", qualified_function_path],
@@ -182,8 +182,9 @@ def visit_FunctionDef(self, node):  # noqa: ANN001, ANN202
             unparsed = ast.unparse(modified_tree)
 
             hypothesis_test_suite_code = format_code(
-                make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
+                args.formatter_cmds,
                 function_to_optimize.file_path,
+                optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
             )
             with hypothesis_path.open("w", encoding="utf-8") as f:
                 f.write(hypothesis_test_suite_code)

From f6285263c4e57d22043f437f758f0e5e20a67cec Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sat, 25 Oct 2025 00:09:19 -0500
Subject: [PATCH 03/16] Update hypothesis_testing.py

---
 codeflash/verification/hypothesis_testing.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index fe681a619..0caf985c7 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -173,9 +173,9 @@ def generate_hypothesis_tests(
 
             class TestFunctionRemover(ast.NodeTransformer):
                 def visit_FunctionDef(self, node):  # noqa: ANN001, ANN202
-                    if function_to_optimize.function_name not in node.name:
-                        return None
-                    return node
+                    if node.name.startswith("test_") and function_to_optimize.function_name in node.name:
+                        return node
+                    return None
 
             modified_tree = TestFunctionRemover().visit(tree)
             ast.fix_missing_locations(modified_tree)

From bfe4179d0db14261f51d90ede2db487c75bbd71c Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sat, 25 Oct 2025 00:46:32 -0500
Subject: [PATCH 04/16] Update hypothesis_testing.py

---
 codeflash/verification/hypothesis_testing.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index 0caf985c7..986f37cd9 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -181,9 +181,11 @@ def visit_FunctionDef(self, node):  # noqa: ANN001, ANN202
             ast.fix_missing_locations(modified_tree)
             unparsed = ast.unparse(modified_tree)
 
+            console.print(f"modified src: {unparsed}")
+
             hypothesis_test_suite_code = format_code(
                 args.formatter_cmds,
-                function_to_optimize.file_path,
+                hypothesis_path,
                 optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
             )
             with hypothesis_path.open("w", encoding="utf-8") as f:

From 7ee1ab1d12b284d504f62a7874170bf012f44087 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sat, 25 Oct 2025 17:14:31 -0500
Subject: [PATCH 05/16] cleanup

---
 codeflash/verification/hypothesis_testing.py | 151 ++++++++++++++++---
 1 file changed, 128 insertions(+), 23 deletions(-)

diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index 986f37cd9..ca3f3131b 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -63,8 +63,43 @@ def remove_functions_with_only_any_type(code_string: str) -> str:
     return ast.unparse(new_tree)
 
 
+def filter_hypothesis_tests_by_function_name(code: str, function_name: str) -> str:
+    """Reduce a generated Hypothesis module to the tests that target one function.
+
+    Only top-level statements are examined. ``import`` / ``from ... import``
+    statements and plain assignments are always kept. A ``def`` is kept only when
+    its name starts with ``test_`` and contains ``function_name`` as a substring.
+    Any other top-level statement (classes, ``async def``, annotated assignments,
+    bare expressions) is dropped.
+
+    Args:
+        code: Source of the test module. Must be valid Python: a ``SyntaxError``
+            from ``ast.parse`` is not caught here.
+        function_name: Name of the function under test.
+
+    Returns:
+        The filtered module rendered by ``ast.unparse`` (comments and original
+        formatting are therefore not preserved).
+    """
+    always_kept = (ast.Import, ast.ImportFrom, ast.Assign)
+
+    def is_matching_test(stmt: ast.stmt) -> bool:
+        if not isinstance(stmt, ast.FunctionDef):
+            return False
+        return stmt.name.startswith("test_") and function_name in stmt.name
+
+    module = ast.parse(code)
+    # Only the module's top-level body is filtered; statements nested inside a
+    # kept test function are left exactly as parsed.
+    module.body = [
+        stmt for stmt in module.body if isinstance(stmt, always_kept) or is_matching_test(stmt)
+    ]
+    ast.fix_missing_locations(module)
+    return ast.unparse(module)
+
+
 def make_hypothesis_tests_deterministic(code: str) -> str:
-    """Add @settings(derandomize=True) decorator to make Hypothesis tests deterministic."""
+    """Add @settings(derandomize=True) decorator and constrain strategies to make Hypothesis tests deterministic."""
     try:
         tree = ast.parse(code)
     except SyntaxError:
@@ -80,26 +115,83 @@ def make_hypothesis_tests_deterministic(code: str) -> str:
     if not settings_imported:
         tree.body.insert(0, ast.parse("from hypothesis import settings").body[0])
 
+    class StrategyConstrainer(ast.NodeTransformer):
+        def visit_Call(self, node: ast.Call) -> ast.Call:
+            self.generic_visit(node)
+
+            # Check if this is a strategy call (st.floats(), st.integers(), etc.)
+            if (
+                isinstance(node.func, ast.Attribute)
+                and isinstance(node.func.value, ast.Name)
+                and node.func.value.id == "st"
+            ):
+                if node.func.attr == "floats" and not any(
+                    k.arg in ["min_value", "max_value", "allow_nan", "allow_infinity"]
+                    for k in node.keywords
+                ):
+                    # Constrain floats to reasonable bounds
+                    node.keywords.extend(
+                        [
+                            ast.keyword(
+                                arg="min_value",
+                                value=ast.UnaryOp(
+                                    op=ast.USub(), operand=ast.Constant(value=1e6)
+                                ),
+                            ),
+                            ast.keyword(arg="max_value", value=ast.Constant(value=1e6)),
+                            ast.keyword(
+                                arg="allow_nan", value=ast.Constant(value=False)
+                            ),
+                            ast.keyword(
+                                arg="allow_infinity", value=ast.Constant(value=False)
+                            ),
+                        ]
+                    )
+                elif node.func.attr == "integers" and not any(
+                    k.arg in ["min_value", "max_value"] for k in node.keywords
+                ):
+                    # Constrain integers to reasonable bounds
+                    node.keywords.extend(
+                        [
+                            ast.keyword(arg="min_value", value=ast.Constant(value=0)),
+                            ast.keyword(
+                                arg="max_value", value=ast.Constant(value=10000)
+                            ),
+                        ]
+                    )
+            return node
+
+    tree = StrategyConstrainer().visit(tree)
+    ast.fix_missing_locations(tree)
+
     for node in ast.walk(tree):
         if isinstance(node, ast.FunctionDef):
             settings_decorator = next(
                 (
                     d
                     for d in node.decorator_list
-                    if isinstance(d, ast.Call) and isinstance(d.func, ast.Name) and d.func.id == "settings"
+                    if isinstance(d, ast.Call)
+                    and isinstance(d.func, ast.Name)
+                    and d.func.id == "settings"
                 ),
                 None,
             )
 
             if settings_decorator:
                 if not any(k.arg == "derandomize" for k in settings_decorator.keywords):
-                    settings_decorator.keywords.append(ast.keyword(arg="derandomize", value=ast.Constant(value=True)))
+                    settings_decorator.keywords.append(
+                        ast.keyword(arg="derandomize", value=ast.Constant(value=True))
+                    )
             else:
                 node.decorator_list.append(
                     ast.Call(
                         func=ast.Name(id="settings", ctx=ast.Load()),
                         args=[],
-                        keywords=[ast.keyword(arg="derandomize", value=ast.Constant(value=True))],
+                        keywords=[
+                            ast.keyword(
+                                arg="derandomize", value=ast.Constant(value=True)
+                            )
+                        ],
                     )
                 )
 
@@ -107,7 +199,10 @@ def make_hypothesis_tests_deterministic(code: str) -> str:
 
 
 def generate_hypothesis_tests(
-    test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
+    test_cfg: TestConfig,
+    args: Namespace,
+    function_to_optimize: FunctionToOptimize,
+    function_to_optimize_ast: ast.AST,
 ) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
     """Generate property-based tests using Hypothesis ghostwriter.
 
@@ -128,7 +223,9 @@ def generate_hypothesis_tests(
 
     if (
         test_cfg.project_root_path
-        and isinstance(function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef))
+        and isinstance(
+            function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef)
+        )
         and has_typed_parameters(function_to_optimize_ast, function_to_optimize.parents)
     ):
         logger.info("Generating Hypothesis tests for the original code…")
@@ -136,7 +233,9 @@ def generate_hypothesis_tests(
 
         try:
             qualified_function_path = get_qualified_function_path(
-                function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name
+                function_to_optimize.file_path,
+                args.project_root,
+                function_to_optimize.qualified_name,
             )
             logger.info(f"command: hypothesis write {qualified_function_path}")
 
@@ -151,7 +250,9 @@ def generate_hypothesis_tests(
         except subprocess.TimeoutExpired:
             logger.debug("Hypothesis test generation timed out")
             end_time = time.perf_counter()
-            logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
+            logger.debug(
+                f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds"
+            )
             return function_to_hypothesis_tests, hypothesis_test_suite_code
 
         if hypothesis_result.returncode == 0:
@@ -167,37 +268,39 @@ def generate_hypothesis_tests(
                 test_framework=args.test_framework,
                 pytest_cmd=args.pytest_cmd,
             )
-            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config)
+            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = (
+                discover_unit_tests(hypothesis_config)
+            )
             with hypothesis_path.open("r", encoding="utf-8") as f:
-                tree = ast.parse(f.read())
-
-            class TestFunctionRemover(ast.NodeTransformer):
-                def visit_FunctionDef(self, node):  # noqa: ANN001, ANN202
-                    if node.name.startswith("test_") and function_to_optimize.function_name in node.name:
-                        return node
-                    return None
+                original_code = f.read()
 
-            modified_tree = TestFunctionRemover().visit(tree)
-            ast.fix_missing_locations(modified_tree)
-            unparsed = ast.unparse(modified_tree)
+            unparsed = filter_hypothesis_tests_by_function_name(
+                original_code, function_to_optimize.function_name
+            )
 
             console.print(f"modified src: {unparsed}")
 
             hypothesis_test_suite_code = format_code(
                 args.formatter_cmds,
                 hypothesis_path,
-                optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
+                optimized_code=make_hypothesis_tests_deterministic(
+                    remove_functions_with_only_any_type(unparsed)
+                ),
             )
             with hypothesis_path.open("w", encoding="utf-8") as f:
                 f.write(hypothesis_test_suite_code)
-            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config)
+            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = (
+                discover_unit_tests(hypothesis_config)
+            )
             logger.info(
                 f"Created {num_discovered_hypothesis_tests} "
                 f"hypothesis unit test case{'s' if num_discovered_hypothesis_tests != 1 else ''} "
             )
             console.rule()
             end_time = time.perf_counter()
-            logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds")
+            logger.debug(
+                f"Generated hypothesis tests in {end_time - start_time:.2f} seconds"
+            )
             return function_to_hypothesis_tests, hypothesis_test_suite_code
 
         logger.debug(
@@ -206,5 +309,7 @@ def visit_FunctionDef(self, node):  # noqa: ANN001, ANN202
         console.rule()
 
     end_time = time.perf_counter()
-    logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
+    logger.debug(
+        f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds"
+    )
     return function_to_hypothesis_tests, hypothesis_test_suite_code

From 99f095472e374aff659c8822bcd8c7d09e26b8f7 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sat, 25 Oct 2025 20:22:35 -0500
Subject: [PATCH 06/16] lazy impl

---
 codeflash/discovery/discover_unit_tests.py   | 150 ++++++++++++++++++-
 codeflash/verification/concolic_testing.py   |   5 +-
 codeflash/verification/hypothesis_testing.py |   5 +-
 3 files changed, 156 insertions(+), 4 deletions(-)

diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py
index d7c1c0a83..1f23eaf5b 100644
--- a/codeflash/discovery/discover_unit_tests.py
+++ b/codeflash/discovery/discover_unit_tests.py
@@ -9,6 +9,7 @@
 import re
 import sqlite3
 import subprocess
+import sys
 import unittest
 from collections import defaultdict
 from pathlib import Path
@@ -66,6 +67,75 @@ class TestFunction:
 FUNCTION_NAME_REGEX = re.compile(r"([^.]+)\.([a-zA-Z0-9_]+)$")
 
 
+def _extract_dotted_call_name(node: ast.expr) -> str | None:
+    """Return the dotted name of a call target (e.g. 'pkg.mod.func'), or None if it is not a Name/Attribute chain."""
+    reversed_parts: list[str] = []
+    current = node
+    while isinstance(current, ast.Attribute):
+        reversed_parts.append(current.attr)
+        current = current.value
+    if not isinstance(current, ast.Name):
+        return None  # chain is rooted in something else, e.g. a call or a subscript
+    reversed_parts.append(current.id)
+    return ".".join(reversed(reversed_parts))
+
+
+def _discover_calls_via_ast(
+    test_file: Path, test_functions: set[TestFunction], target_qualified_names: set[str]
+) -> dict[str, list[tuple[TestFunction, CodePosition]]]:
+    """Find, by static AST inspection, where the given tests call any of the target functions.
+
+    A call matches when its dotted name is in ``target_qualified_names`` either as
+    written or after its first component is replaced by what the file imports it
+    as. Test functions are looked up by name only (``test_class`` is not compared).
+
+    Returns:
+        A dict from matched qualified name to ``(test_function, call_position)``
+        pairs; empty when the file is missing or is not valid Python.
+    """
+    try:
+        tree = ast.parse(test_file.read_text(encoding="utf-8"), filename=str(test_file))
+    except (SyntaxError, FileNotFoundError) as e:
+        logger.debug(f"AST parsing failed for {test_file}: {e}")
+        return {}
+
+    # Local name bound by an import statement -> dotted path it stands for.
+    aliases: dict[str, str] = {}
+    for stmt in ast.walk(tree):
+        if isinstance(stmt, ast.Import):
+            aliases.update({(imp.asname or imp.name): imp.name for imp in stmt.names})
+        elif isinstance(stmt, ast.ImportFrom) and stmt.module:
+            aliases.update(
+                {(imp.asname or imp.name): f"{stmt.module}.{imp.name}" for imp in stmt.names if imp.name != "*"}
+            )
+
+    tests_by_name = {tf.function_name: tf for tf in test_functions}
+    calls_by_target = defaultdict(list)
+
+    for func_def in ast.walk(tree):
+        if not (isinstance(func_def, ast.FunctionDef) and func_def.name in tests_by_name):
+            continue
+        test_func = tests_by_name[func_def.name]
+
+        for call in ast.walk(func_def):
+            dotted = _extract_dotted_call_name(call.func) if isinstance(call, ast.Call) else None
+            if not dotted:
+                continue
+
+            matched = dotted
+            if dotted not in target_qualified_names:
+                # Retry with the leading name expanded through the import aliases.
+                head, _, rest = dotted.partition(".")
+                if head not in aliases:
+                    continue
+                matched = f"{aliases[head]}.{rest}" if rest else aliases[head]
+                if matched not in target_qualified_names:
+                    continue
+            calls_by_target[matched].append((test_func, CodePosition(line_no=call.lineno, col_no=call.col_offset)))
+
+    return dict(calls_by_target)
+
+
 class TestsCache:
     SCHEMA_VERSION = 1  # Increment this when schema changes
 
@@ -489,6 +559,7 @@ def discover_tests_pytest(
         console.rule()
     else:
         logger.debug(f"Pytest collection exit code: {exitcode}")
+
     if pytest_rootdir is not None:
         cfg.tests_project_rootdir = Path(pytest_rootdir)
     file_to_test_map: dict[Path, list[FunctionCalledInTest]] = defaultdict(list)
@@ -511,6 +582,7 @@ def discover_tests_pytest(
         if discover_only_these_tests and test_obj.test_file not in discover_only_these_tests:
             continue
         file_to_test_map[test_obj.test_file].append(test_obj)
+
     # Within these test files, find the project functions they are referring to and return their names/locations
     return process_test_files(file_to_test_map, cfg, functions_to_optimize)
 
@@ -592,7 +664,9 @@ def process_test_files(
     test_framework = cfg.test_framework
 
     if functions_to_optimize:
-        target_function_names = {func.qualified_name for func in functions_to_optimize}
+        target_function_names = {
+            func.qualified_name_with_modules_from_root(project_root_path) for func in functions_to_optimize
+        }
         file_to_test_map = filter_test_files_by_imports(file_to_test_map, target_function_names)
 
     function_to_test_map = defaultdict(set)
@@ -602,6 +676,7 @@ def process_test_files(
 
     tests_cache = TestsCache(project_root_path)
     logger.info("!lsp|Discovering tests and processing unit tests")
+
     with test_files_progress_bar(total=len(file_to_test_map), description="Processing test files") as (
         progress,
         task_id,
@@ -702,6 +777,79 @@ def process_test_files(
                 test_functions_by_name[func.function_name].append(func)
 
             test_function_names_set = set(test_functions_by_name.keys())
+
+            is_generated_test_file = (
+                any(
+                    tf.test_type in (TestType.HYPOTHESIS_TEST, TestType.CONCOLIC_COVERAGE_TEST) for tf in test_functions
+                )
+                if test_functions
+                else any(
+                    func.test_type in (TestType.HYPOTHESIS_TEST, TestType.CONCOLIC_COVERAGE_TEST) for func in functions
+                )
+            )
+
+            # For generated tests, use AST-based discovery since Jedi often fails
+            if is_generated_test_file and functions_to_optimize:
+                logger.debug(f"Using AST-based discovery for generated test file: {test_file.name}")
+                target_qualified_names = {
+                    func.qualified_name_with_modules_from_root(project_root_path) for func in functions_to_optimize
+                }
+
+                if not test_functions:
+                    logger.debug("Jedi found no functions, building test_functions from collected functions")
+                    test_functions = {
+                        TestFunction(
+                            function_name=func.test_function,
+                            test_class=func.test_class,
+                            parameters=None,
+                            test_type=func.test_type,
+                        )
+                        for func in functions
+                    }
+
+                ast_results = _discover_calls_via_ast(test_file, test_functions, target_qualified_names)
+
+                for qualified_name, matches in ast_results.items():
+                    for test_func, position in matches:
+                        if test_func.parameters is not None:
+                            if test_framework == "pytest":
+                                scope_test_function = f"{test_func.function_name}[{test_func.parameters}]"
+                            else:  # unittest
+                                scope_test_function = f"{test_func.function_name}_{test_func.parameters}"
+                        else:
+                            scope_test_function = test_func.function_name
+
+                        function_to_test_map[qualified_name].add(
+                            FunctionCalledInTest(
+                                tests_in_file=TestsInFile(
+                                    test_file=test_file,
+                                    test_class=test_func.test_class,
+                                    test_function=scope_test_function,
+                                    test_type=test_func.test_type,
+                                ),
+                                position=position,
+                            )
+                        )
+                        tests_cache.insert_test(
+                            file_path=str(test_file),
+                            file_hash=file_hash,
+                            qualified_name_with_modules_from_root=qualified_name,
+                            function_name=test_func.function_name,
+                            test_class=test_func.test_class or "",
+                            test_function=scope_test_function,
+                            test_type=test_func.test_type,
+                            line_number=position.line_no,
+                            col_number=position.col_no,
+                        )
+
+                        if test_func.test_type == TestType.REPLAY_TEST:
+                            num_discovered_replay_tests += 1
+
+                        num_discovered_tests += 1
+
+                progress.advance(task_id)
+                continue
+
             relevant_names = []
 
             names_with_full_name = [name for name in all_names if name.full_name is not None]
diff --git a/codeflash/verification/concolic_testing.py b/codeflash/verification/concolic_testing.py
index e17f5c01f..2190ba6f9 100644
--- a/codeflash/verification/concolic_testing.py
+++ b/codeflash/verification/concolic_testing.py
@@ -80,7 +80,10 @@ def generate_concolic_tests(
                 test_framework=args.test_framework,
                 pytest_cmd=args.pytest_cmd,
             )
-            function_to_concolic_tests, num_discovered_concolic_tests, _ = discover_unit_tests(concolic_test_cfg)
+            file_to_funcs = {function_to_optimize.file_path: [function_to_optimize]}
+            function_to_concolic_tests, num_discovered_concolic_tests, _ = discover_unit_tests(
+                concolic_test_cfg, file_to_funcs_to_optimize=file_to_funcs
+            )
             logger.info(
                 f"Created {num_discovered_concolic_tests} "
                 f"concolic unit test case{'s' if num_discovered_concolic_tests != 1 else ''} "
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index ca3f3131b..5e6a7863c 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -268,8 +268,9 @@ def generate_hypothesis_tests(
                 test_framework=args.test_framework,
                 pytest_cmd=args.pytest_cmd,
             )
+            file_to_funcs = {function_to_optimize.file_path: [function_to_optimize]}
             function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = (
-                discover_unit_tests(hypothesis_config)
+                discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs)
             )
             with hypothesis_path.open("r", encoding="utf-8") as f:
                 original_code = f.read()
@@ -290,7 +291,7 @@ def generate_hypothesis_tests(
             with hypothesis_path.open("w", encoding="utf-8") as f:
                 f.write(hypothesis_test_suite_code)
             function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = (
-                discover_unit_tests(hypothesis_config)
+                discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs)
             )
             logger.info(
                 f"Created {num_discovered_hypothesis_tests} "

From 572ac0e3133f98475ac3be4d9e52269c0a880e84 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sat, 25 Oct 2025 20:58:22 -0500
Subject: [PATCH 07/16] check

---
 codeflash/verification/hypothesis_testing.py |   4 +-
 tests/test_hypothesis_testing.py             | 158 +++++++++++++++++++
 2 files changed, 160 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_hypothesis_testing.py

diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index 5e6a7863c..a39e3999e 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -150,10 +150,10 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
                 elif node.func.attr == "integers" and not any(
                     k.arg in ["min_value", "max_value"] for k in node.keywords
                 ):
-                    # Constrain integers to reasonable bounds
+                    # Constrain integers to reasonable bounds (including negatives)
                     node.keywords.extend(
                         [
-                            ast.keyword(arg="min_value", value=ast.Constant(value=0)),
+                            ast.keyword(arg="min_value", value=ast.Constant(value=-10000)),
                             ast.keyword(
                                 arg="max_value", value=ast.Constant(value=10000)
                             ),
diff --git a/tests/test_hypothesis_testing.py b/tests/test_hypothesis_testing.py
new file mode 100644
index 000000000..49fff9515
--- /dev/null
+++ b/tests/test_hypothesis_testing.py
@@ -0,0 +1,158 @@
+"""Tests for hypothesis_testing.py functions."""
+
+from codeflash.verification.hypothesis_testing import make_hypothesis_tests_deterministic
+
+
+def test_adds_derandomize_decorator():
+    """A test that has no @settings decorator gains settings(derandomize=True)."""
+    undecorated_source = """
+from hypothesis import given, strategies as st
+
+@given(x=st.integers())
+def test_x(x):
+    assert isinstance(x, int)
+"""
+    transformed = make_hypothesis_tests_deterministic(undecorated_source)
+    assert "settings(derandomize=True)" in transformed  # also matches the "@settings(...)" decorator form
+
+
+def test_integers_constrained_with_negatives():
+    """An unbounded st.integers() is given the symmetric range [-10000, 10000]."""
+    source = """from hypothesis import given, strategies as st
+@given(x=st.integers())
+def t(x):
+    pass
+"""
+    transformed = make_hypothesis_tests_deterministic(source)
+    # Drop spaces and newlines so the checks do not depend on how the code was re-rendered
+    compact = transformed.replace("\n", "").replace(" ", "")
+    for expected_bound in ("min_value=-10000", "max_value=10000"):
+        assert expected_bound in compact
+
+
+def test_floats_constrained_to_finite():
+    """An unconstrained st.floats() gets explicit bounds and excludes NaN and infinity."""
+    source = """from hypothesis import given, strategies as st
+@given(x=st.floats())
+def t(x):
+    pass
+"""
+    compact = make_hypothesis_tests_deterministic(source).replace("\n", "").replace(" ", "")
+    assert "allow_nan=False" in compact
+    assert "allow_infinity=False" in compact
+    assert "min_value=" in compact
+    assert "max_value=" in compact
+
+
+def test_existing_constraints_not_overridden():
+    """Explicit integer bounds and an existing @settings decorator are left alone."""
+    source = """from hypothesis import given, strategies as st, settings
+
+@settings(derandomize=True, max_examples=5)
+@given(x=st.integers(min_value=-5, max_value=5))
+def t(x):
+    pass
+"""
+    transformed = make_hypothesis_tests_deterministic(source)
+    # The default -10000/10000 bounds must not have been injected
+    assert "-10000" not in transformed
+    # The bounds written in the source must survive (with or without a space after "=")
+    assert any(form in transformed for form in ("min_value=-5", "min_value= -5"))
+    assert any(form in transformed for form in ("max_value=5", "max_value= 5"))
+    # No second settings decorator may be stacked on top of the existing one
+    assert transformed.count("@settings") == 1
+
+
+def test_existing_float_constraints_preserved():
+    """A fully specified st.floats() call keeps its own bounds and gains no defaults."""
+    source = """from hypothesis import given, strategies as st
+
+@given(y=st.floats(min_value=-1.0, max_value=1.0, allow_nan=False, allow_infinity=False))
+def t(y):
+    pass
+"""
+    transformed = make_hypothesis_tests_deterministic(source)
+    assert any(form in transformed for form in ("min_value=-1.0", "min_value= -1.0"))
+    assert any(form in transformed for form in ("max_value=1.0", "max_value= 1.0"))
+    # Neither spelling of the default 1e6 bound may appear
+    assert not any(default in transformed for default in ("1e6", "1000000"))
+
+
+def test_idempotency():
+    """Transforming already-transformed code changes nothing further."""
+    source = """from hypothesis import given, strategies as st
+
+@given(x=st.integers(), y=st.floats())
+def test_func(x, y):
+    pass
+"""
+    first_pass = make_hypothesis_tests_deterministic(source)
+    # A second application must be a no-op on the first pass's output
+    assert make_hypothesis_tests_deterministic(first_pass) == first_pass
+
+
+def test_multiple_strategies_handled():
+    """Every strategy call inside a single @given is constrained, not just the first."""
+    source = """from hypothesis import given, strategies as st
+
+@given(a=st.integers(), b=st.integers(), c=st.floats())
+def test_multi(a, b, c):
+    pass
+"""
+    transformed = make_hypothesis_tests_deterministic(source)
+    compact = transformed.replace("\n", "").replace(" ", "")
+    # The float strategy excludes NaN and infinity
+    assert "allow_nan=False" in compact
+    assert "allow_infinity=False" in compact
+    # Each of the two integer strategies carries its own pair of bounds
+    assert compact.count("min_value=-10000") >= 2
+    assert compact.count("max_value=10000") >= 2
+
+
+def test_settings_import_added_if_missing():
+    """The output mentions ``settings`` when the input never imported it."""
+    source = """from hypothesis import given, strategies as st
+
+@given(x=st.integers())
+def test_x(x):
+    pass
+"""
+    transformed = make_hypothesis_tests_deterministic(source)
+    # NOTE: a plain substring check; it does not tell an import apart from a decorator use
+    assert "settings" in transformed
+
+
+def test_partial_constraints_completed():
+    """A strategy with only min_value set keeps it and does not receive the default -10000 lower bound."""
+    src = """from hypothesis import given, strategies as st
+
+@given(x=st.integers(min_value=100))
+def test_x(x):
+    pass
+"""
+    out = make_hypothesis_tests_deterministic(src)
+    # The explicit lower bound must survive (with or without a space after "=")
+    assert "min_value=100" in out or "min_value= 100" in out
+    # Despite the test's name, nothing is "completed": no default lower bound may be injected
+    assert "-10000" not in out
+
+
+def test_syntax_error_returns_original():
+    """Input that cannot be parsed as Python comes back exactly as given."""
+    unparsable = "this is not valid python @#$%"
+    # The very same text must be returned, not an empty or partially rewritten string
+    assert make_hypothesis_tests_deterministic(unparsable) == unparsable
+
+
+def test_no_hypothesis_code_unchanged():
+    """Code that does not use Hypothesis still contains both original definitions afterwards."""
+    source = """def regular_function(x):
+    return x * 2
+
+def test_regular():
+    assert regular_function(2) == 4
+"""
+    transformed = make_hypothesis_tests_deterministic(source)
+    # Only the presence of the two ``def`` headers is checked, not that nothing else was added
+    for header in ("def regular_function", "def test_regular"):
+        assert header in transformed

From 4866d82b1507b99b693b9eb4b5457319f926af47 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sat, 25 Oct 2025 23:45:19 -0500
Subject: [PATCH 08/16] cleanup

---
 codeflash/verification/equivalence.py | 123 ++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
index 9d7f5ba2c..fe28cfaa6 100644
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@@ -1,4 +1,5 @@
 import sys
+from collections import defaultdict
 
 from codeflash.cli_cmds.console import logger
 from codeflash.models.models import TestResults, TestType, VerificationType
@@ -14,14 +15,47 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
     original_recursion_limit = sys.getrecursionlimit()
     if original_recursion_limit < INCREASED_RECURSION_LIMIT:
         sys.setrecursionlimit(INCREASED_RECURSION_LIMIT)  # Increase recursion limit to avoid RecursionError
+
+    # Separate Hypothesis tests from other test types for semantic comparison
+    # Hypothesis tests are always compared semantically (by test function, not example count)
+    original_hypothesis = [
+        r for r in original_results.test_results if r.test_type == TestType.HYPOTHESIS_TEST and r.loop_index == 1
+    ]
+    candidate_hypothesis = [
+        r for r in candidate_results.test_results if r.test_type == TestType.HYPOTHESIS_TEST and r.loop_index == 1
+    ]
+
+    # Compare Hypothesis tests semantically if any are present
+    if original_hypothesis or candidate_hypothesis:
+        logger.debug(
+            f"Comparing Hypothesis tests: original={len(original_hypothesis)} examples, "
+            f"candidate={len(candidate_hypothesis)} examples"
+        )
+        hypothesis_equal = _compare_hypothesis_tests_semantic(original_hypothesis, candidate_hypothesis)
+        if not hypothesis_equal:
+            logger.info("Hypothesis comparison failed")
+            sys.setrecursionlimit(original_recursion_limit)
+            return False
+        logger.debug("Hypothesis comparison passed")
+
     test_ids_superset = original_results.get_all_unique_invocation_loop_ids().union(
         set(candidate_results.get_all_unique_invocation_loop_ids())
     )
+    logger.debug(f"Total test IDs in superset: {len(test_ids_superset)}")
     are_equal: bool = True
     did_all_timeout: bool = True
     for test_id in test_ids_superset:
         original_test_result = original_results.get_by_unique_invocation_loop_id(test_id)
         cdd_test_result = candidate_results.get_by_unique_invocation_loop_id(test_id)
+
+        # Skip Hypothesis tests - already compared semantically above
+        if original_test_result and original_test_result.test_type == TestType.HYPOTHESIS_TEST:
+            did_all_timeout = False  # Hypothesis tests are checked separately, not timed out
+            continue
+        if cdd_test_result and cdd_test_result.test_type == TestType.HYPOTHESIS_TEST:
+            did_all_timeout = False
+            continue
+
         if cdd_test_result is not None and original_test_result is None:
             continue
         # If helper function instance_state verification is not present, that's ok. continue
@@ -33,6 +67,11 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
             continue
         if original_test_result is None or cdd_test_result is None:
             are_equal = False
+            logger.debug(
+                f"Test result mismatch: test_id={test_id}, "
+                f"original_present={original_test_result is not None}, "
+                f"candidate_present={cdd_test_result is not None}"
+            )
             break
         did_all_timeout = did_all_timeout and original_test_result.timed_out
         if original_test_result.timed_out:
@@ -80,5 +119,89 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
             break
     sys.setrecursionlimit(original_recursion_limit)
     if did_all_timeout:
+        logger.debug("Comparison failed: all tests timed out")
         return False
+    logger.debug(f"Final comparison result: are_equal={are_equal}")
     return are_equal
+
+
+def _compare_hypothesis_tests_semantic(original_hypothesis: list, candidate_hypothesis: list) -> bool:
+    """Compare Hypothesis tests by test function, not by example count.
+
+    Hypothesis can generate different numbers of examples between runs due to:
+    - Timing differences
+    - Early stopping
+    - Shrinking behavior
+    - Performance differences
+
+    What matters is whether the test functions themselves pass or fail,
+    not how many examples Hypothesis generated.
+    """
+
+    # Group by test function (excluding loop index and iteration_id from comparison)
+    def get_test_key(test_result):
+        """Get unique key for a Hypothesis test function."""
+        return (
+            test_result.id.test_module_path,
+            test_result.id.test_class_name,
+            test_result.id.test_function_name,
+            test_result.id.function_getting_tested,
+        )
+
+    # Group original results by test function
+    original_by_func = defaultdict(list)
+    for result in original_hypothesis:
+        original_by_func[get_test_key(result)].append(result)
+
+    # Group candidate results by test function
+    candidate_by_func = defaultdict(list)
+    for result in candidate_hypothesis:
+        candidate_by_func[get_test_key(result)].append(result)
+
+    # Log summary statistics
+    orig_total_examples = sum(len(examples) for examples in original_by_func.values())
+    cand_total_examples = sum(len(examples) for examples in candidate_by_func.values())
+
+    logger.debug(
+        f"Hypothesis comparison: Original={len(original_by_func)} test functions ({orig_total_examples} examples), "
+        f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)"
+    )
+
+    # Check if all test functions in original are present in candidate
+    missing_funcs = set(original_by_func.keys()) - set(candidate_by_func.keys())
+    if missing_funcs:
+        logger.warning(
+            f"Hypothesis test functions missing in candidate: {len(missing_funcs)} functions. "
+            f"First missing: {missing_funcs.__iter__().__next__()}"
+        )
+        return False
+
+    # Compare each test function's results
+    for test_key in original_by_func:
+        if test_key not in candidate_by_func:
+            continue  # Already handled above
+
+        orig_examples = original_by_func[test_key]
+        cand_examples = candidate_by_func[test_key]
+
+        # Check whether any example failed on each side (original and candidate)
+        orig_had_failure = any(not ex.did_pass for ex in orig_examples)
+        cand_had_failure = any(not ex.did_pass for ex in cand_examples)
+
+        # If original had failures, candidate must also have failures (or be missing, already handled)
+        # If original passed, candidate must pass (but can have different example counts)
+        if orig_had_failure != cand_had_failure:
+            logger.debug(
+                f"Hypothesis test function behavior mismatch: {test_key} "
+                f"(original_failed={orig_had_failure}, candidate_failed={cand_had_failure})"
+            )
+            return False
+
+        if abs(len(orig_examples) - len(cand_examples)) > 10:
+            logger.info(
+                f"Hypothesis test '{test_key[2]}': example counts differ "
+                f"(original={len(orig_examples)}, candidate={len(cand_examples)}). "
+                f"This is expected when code performance changes."
+            )
+
+    return True

From b7faf816f31a266ceebd01568073a8a60a44c39d Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sun, 26 Oct 2025 01:05:22 -0500
Subject: [PATCH 09/16] modify equivalence for hypothesis tests

---
 codeflash/verification/equivalence.py        |  4 +-
 codeflash/verification/hypothesis_testing.py | 81 ++++++--------------
 2 files changed, 25 insertions(+), 60 deletions(-)

diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
index fe28cfaa6..efc222acb 100644
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@@ -2,7 +2,7 @@
 from collections import defaultdict
 
 from codeflash.cli_cmds.console import logger
-from codeflash.models.models import TestResults, TestType, VerificationType
+from codeflash.models.models import FunctionTestInvocation, TestResults, TestType, VerificationType
 from codeflash.verification.comparator import comparator
 
 INCREASED_RECURSION_LIMIT = 5000
@@ -139,7 +139,7 @@ def _compare_hypothesis_tests_semantic(original_hypothesis: list, candidate_hypo
     """
 
     # Group by test function (excluding loop index and iteration_id from comparison)
-    def get_test_key(test_result):
+    def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, str]:
         """Get unique key for a Hypothesis test function."""
         return (
             test_result.id.test_module_path,
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index a39e3999e..13007f7af 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -75,6 +75,7 @@ def filter_hypothesis_tests_by_function_name(code: str, function_name: str) -> s
 
     Returns:
         Filtered code with only matching tests
+
     """
     tree = ast.parse(code)
 
@@ -86,10 +87,9 @@ def visit_Module(self, node):  # noqa: ANN001, ANN202
                 if isinstance(item, (ast.Import, ast.ImportFrom, ast.Assign)):
                     # Keep all imports and module-level assignments
                     new_body.append(item)
-                elif isinstance(item, ast.FunctionDef):
+                elif isinstance(item, ast.FunctionDef) and item.name.startswith("test_") and function_name in item.name:
                     # Only keep test functions that match the function name
-                    if item.name.startswith("test_") and function_name in item.name:
-                        new_body.append(item)
+                    new_body.append(item)
             node.body = new_body
             return node
 
@@ -126,25 +126,17 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
                 and node.func.value.id == "st"
             ):
                 if node.func.attr == "floats" and not any(
-                    k.arg in ["min_value", "max_value", "allow_nan", "allow_infinity"]
-                    for k in node.keywords
+                    k.arg in ["min_value", "max_value", "allow_nan", "allow_infinity"] for k in node.keywords
                 ):
                     # Constrain floats to reasonable bounds
                     node.keywords.extend(
                         [
                             ast.keyword(
-                                arg="min_value",
-                                value=ast.UnaryOp(
-                                    op=ast.USub(), operand=ast.Constant(value=1e6)
-                                ),
+                                arg="min_value", value=ast.UnaryOp(op=ast.USub(), operand=ast.Constant(value=1e6))
                             ),
                             ast.keyword(arg="max_value", value=ast.Constant(value=1e6)),
-                            ast.keyword(
-                                arg="allow_nan", value=ast.Constant(value=False)
-                            ),
-                            ast.keyword(
-                                arg="allow_infinity", value=ast.Constant(value=False)
-                            ),
+                            ast.keyword(arg="allow_nan", value=ast.Constant(value=False)),
+                            ast.keyword(arg="allow_infinity", value=ast.Constant(value=False)),
                         ]
                     )
                 elif node.func.attr == "integers" and not any(
@@ -154,9 +146,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
                     node.keywords.extend(
                         [
                             ast.keyword(arg="min_value", value=ast.Constant(value=-10000)),
-                            ast.keyword(
-                                arg="max_value", value=ast.Constant(value=10000)
-                            ),
+                            ast.keyword(arg="max_value", value=ast.Constant(value=10000)),
                         ]
                     )
             return node
@@ -170,28 +160,20 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
                 (
                     d
                     for d in node.decorator_list
-                    if isinstance(d, ast.Call)
-                    and isinstance(d.func, ast.Name)
-                    and d.func.id == "settings"
+                    if isinstance(d, ast.Call) and isinstance(d.func, ast.Name) and d.func.id == "settings"
                 ),
                 None,
             )
 
             if settings_decorator:
                 if not any(k.arg == "derandomize" for k in settings_decorator.keywords):
-                    settings_decorator.keywords.append(
-                        ast.keyword(arg="derandomize", value=ast.Constant(value=True))
-                    )
+                    settings_decorator.keywords.append(ast.keyword(arg="derandomize", value=ast.Constant(value=True)))
             else:
                 node.decorator_list.append(
                     ast.Call(
                         func=ast.Name(id="settings", ctx=ast.Load()),
                         args=[],
-                        keywords=[
-                            ast.keyword(
-                                arg="derandomize", value=ast.Constant(value=True)
-                            )
-                        ],
+                        keywords=[ast.keyword(arg="derandomize", value=ast.Constant(value=True))],
                     )
                 )
 
@@ -199,10 +181,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
 
 
 def generate_hypothesis_tests(
-    test_cfg: TestConfig,
-    args: Namespace,
-    function_to_optimize: FunctionToOptimize,
-    function_to_optimize_ast: ast.AST,
+    test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
 ) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
     """Generate property-based tests using Hypothesis ghostwriter.
 
@@ -223,9 +202,7 @@ def generate_hypothesis_tests(
 
     if (
         test_cfg.project_root_path
-        and isinstance(
-            function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef)
-        )
+        and isinstance(function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef))
         and has_typed_parameters(function_to_optimize_ast, function_to_optimize.parents)
     ):
         logger.info("Generating Hypothesis tests for the original code…")
@@ -233,9 +210,7 @@ def generate_hypothesis_tests(
 
         try:
             qualified_function_path = get_qualified_function_path(
-                function_to_optimize.file_path,
-                args.project_root,
-                function_to_optimize.qualified_name,
+                function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name
             )
             logger.info(f"command: hypothesis write {qualified_function_path}")
 
@@ -250,9 +225,7 @@ def generate_hypothesis_tests(
         except subprocess.TimeoutExpired:
             logger.debug("Hypothesis test generation timed out")
             end_time = time.perf_counter()
-            logger.debug(
-                f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds"
-            )
+            logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
             return function_to_hypothesis_tests, hypothesis_test_suite_code
 
         if hypothesis_result.returncode == 0:
@@ -269,29 +242,25 @@ def generate_hypothesis_tests(
                 pytest_cmd=args.pytest_cmd,
             )
             file_to_funcs = {function_to_optimize.file_path: [function_to_optimize]}
-            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = (
-                discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs)
+            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(
+                hypothesis_config, file_to_funcs_to_optimize=file_to_funcs
             )
             with hypothesis_path.open("r", encoding="utf-8") as f:
                 original_code = f.read()
 
-            unparsed = filter_hypothesis_tests_by_function_name(
-                original_code, function_to_optimize.function_name
-            )
+            unparsed = filter_hypothesis_tests_by_function_name(original_code, function_to_optimize.function_name)
 
             console.print(f"modified src: {unparsed}")
 
             hypothesis_test_suite_code = format_code(
                 args.formatter_cmds,
                 hypothesis_path,
-                optimized_code=make_hypothesis_tests_deterministic(
-                    remove_functions_with_only_any_type(unparsed)
-                ),
+                optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
             )
             with hypothesis_path.open("w", encoding="utf-8") as f:
                 f.write(hypothesis_test_suite_code)
-            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = (
-                discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs)
+            function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(
+                hypothesis_config, file_to_funcs_to_optimize=file_to_funcs
             )
             logger.info(
                 f"Created {num_discovered_hypothesis_tests} "
@@ -299,9 +268,7 @@ def generate_hypothesis_tests(
             )
             console.rule()
             end_time = time.perf_counter()
-            logger.debug(
-                f"Generated hypothesis tests in {end_time - start_time:.2f} seconds"
-            )
+            logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds")
             return function_to_hypothesis_tests, hypothesis_test_suite_code
 
         logger.debug(
@@ -310,7 +277,5 @@ def generate_hypothesis_tests(
         console.rule()
 
     end_time = time.perf_counter()
-    logger.debug(
-        f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds"
-    )
+    logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
     return function_to_hypothesis_tests, hypothesis_test_suite_code

From dfb3927c9c81fc551e75c1c1101685e17e8fb504 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sun, 26 Oct 2025 01:44:38 -0500
Subject: [PATCH 10/16] fix: track and clean up hypothesis test temp directories

- Modified generate_hypothesis_tests() to return the temp directory Path
- Added hypothesis_tests_dir tracking in FunctionOptimizer
- Extended cleanup_generated_files() to remove hypothesis test directories
- Added hypothesis_tests_dirs list in Optimizer to track all directories
- Updated cleanup_temporary_paths() to clean up hypothesis test directories
- Ensures cleanup on success, errors, and KeyboardInterrupt
- Changed temp dir prefix to 'codeflash_hypothesis_' for clarity
---
 codeflash/optimization/function_optimizer.py | 15 +++++++++++++--
 codeflash/optimization/optimizer.py          | 11 ++++++++++-
 codeflash/verification/hypothesis_testing.py | 14 ++++++++------
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 99aeed6d2..9955778ec 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -240,6 +240,7 @@ def __init__(
         self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {}
         self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {}
         self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None
+        self.hypothesis_tests_dir: Path | None = None
         self.generate_and_instrument_tests_results: (
             tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet] | None
         ) = None
@@ -1147,7 +1148,11 @@ def generate_tests_and_optimizations(
             generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
         )
         future_hypothesis_tests = self.executor.submit(
-            generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
+            generate_hypothesis_tests,
+            self.test_cfg,
+            self.args,
+            self.function_to_optimize,
+            self.function_to_optimize_ast,
         )
         futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests]
         if run_experiment:
@@ -1201,7 +1206,8 @@ def generate_tests_and_optimizations(
             logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
             return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
         function_to_concolic_tests, concolic_test_str = future_concolic_tests.result()
-        function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result()
+        function_to_hypothesis_tests, hypothesis_test_str, hypothesis_test_suite_dir = future_hypothesis_tests.result()
+        self.hypothesis_tests_dir = hypothesis_test_suite_dir
 
         count_tests = len(tests)
         if concolic_test_str:
@@ -2051,7 +2057,12 @@ def cleanup_generated_files(self) -> None:
             paths_to_cleanup.append(test_file.instrumented_behavior_file_path)
             paths_to_cleanup.append(test_file.benchmarking_file_path)
 
+        # Add hypothesis test directory to cleanup
+        if self.hypothesis_tests_dir and self.hypothesis_tests_dir.exists():
+            paths_to_cleanup.append(self.hypothesis_tests_dir)
+
         cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dir = None
 
     def get_test_env(
         self, codeflash_loop_index: int, codeflash_test_iteration: int, codeflash_tracer_disable: int = 1
diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py
index c0e0b014b..a83d604d2 100644
--- a/codeflash/optimization/optimizer.py
+++ b/codeflash/optimization/optimizer.py
@@ -53,6 +53,7 @@ def __init__(self, args: Namespace) -> None:
         self.experiment_id = os.getenv("CODEFLASH_EXPERIMENT_ID", None)
         self.local_aiservice_client = LocalAiServiceClient() if self.experiment_id else None
         self.replay_tests_dir = None
+        self.hypothesis_tests_dirs: list[Path] = []  # Track all hypothesis test directories
         self.functions_checkpoint: CodeflashRunCheckpoint | None = None
         self.current_function_being_optimized: FunctionToOptimize | None = None  # current only for the LSP
         self.current_function_optimizer: FunctionOptimizer | None = None
@@ -337,6 +338,9 @@ def run(self) -> None:
                             function_optimizer  # needed to clean up from the outside of this function
                         )
                         best_optimization = function_optimizer.optimize_function()
+                        # Track hypothesis test directory for cleanup
+                        if function_optimizer.hypothesis_tests_dir:
+                            self.hypothesis_tests_dirs.append(function_optimizer.hypothesis_tests_dir)
                         if self.functions_checkpoint:
                             self.functions_checkpoint.add_function_to_checkpoint(
                                 function_to_optimize.qualified_name_with_modules_from_root(self.args.project_root)
@@ -430,7 +434,12 @@ def cleanup_temporary_paths(self) -> None:
 
         if self.current_function_optimizer:
             self.current_function_optimizer.cleanup_generated_files()
-        cleanup_paths([self.test_cfg.concolic_test_root_dir, self.replay_tests_dir])
+
+        # Cleanup all temporary test directories
+        paths_to_cleanup = [self.test_cfg.concolic_test_root_dir, self.replay_tests_dir]
+        paths_to_cleanup.extend(self.hypothesis_tests_dirs)
+        cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dirs.clear()
 
     def worktree_mode(self) -> None:
         if self.current_worktree:
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index 13007f7af..e36c130ec 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -182,7 +182,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
 
 def generate_hypothesis_tests(
     test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
-) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], str, Path | None]:
     """Generate property-based tests using Hypothesis ghostwriter.
 
     This function:
@@ -193,12 +193,14 @@ def generate_hypothesis_tests(
     5. Formats the tests with the project formatter
 
     Returns:
-        Tuple of (function_to_tests_map, test_suite_code)
+        Tuple of (function_to_tests_map, test_suite_code, hypothesis_test_suite_dir)
+        The hypothesis_test_suite_dir is None if no tests were generated.
 
     """
     start_time = time.perf_counter()
     function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {}
     hypothesis_test_suite_code: str = ""
+    hypothesis_test_suite_dir: Path | None = None
 
     if (
         test_cfg.project_root_path
@@ -226,11 +228,11 @@ def generate_hypothesis_tests(
             logger.debug("Hypothesis test generation timed out")
             end_time = time.perf_counter()
             logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         if hypothesis_result.returncode == 0:
             hypothesis_test_suite_code = hypothesis_result.stdout
-            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root))
+            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(prefix="codeflash_hypothesis_", dir=test_cfg.tests_root))
             hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py"
             hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8")
 
@@ -269,7 +271,7 @@ def generate_hypothesis_tests(
             console.rule()
             end_time = time.perf_counter()
             logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         logger.debug(
             f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}"
@@ -278,4 +280,4 @@ def generate_hypothesis_tests(
 
     end_time = time.perf_counter()
     logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-    return function_to_hypothesis_tests, hypothesis_test_suite_code
+    return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir

From 19fc55750933aaec8e1710b804fe462ef329eaa2 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sun, 26 Oct 2025 02:08:40 -0500
Subject: [PATCH 11/16] cleanup strategies

---
 codeflash/optimization/function_optimizer.py | 14 ++++++++++++--
 codeflash/optimization/optimizer.py          |  9 ++++++++-
 codeflash/verification/equivalence.py        | 18 ------------------
 codeflash/verification/hypothesis_testing.py | 19 +++++++++----------
 4 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 99aeed6d2..61e0cddb9 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -240,6 +240,7 @@ def __init__(
         self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {}
         self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {}
         self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None
+        self.hypothesis_tests_dir: Path | None = None
         self.generate_and_instrument_tests_results: (
             tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet] | None
         ) = None
@@ -1147,7 +1148,11 @@ def generate_tests_and_optimizations(
             generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
         )
         future_hypothesis_tests = self.executor.submit(
-            generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
+            generate_hypothesis_tests,
+            self.test_cfg,
+            self.args,
+            self.function_to_optimize,
+            self.function_to_optimize_ast,
         )
         futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests]
         if run_experiment:
@@ -1201,7 +1206,8 @@ def generate_tests_and_optimizations(
             logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
             return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
         function_to_concolic_tests, concolic_test_str = future_concolic_tests.result()
-        function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result()
+        function_to_hypothesis_tests, hypothesis_test_str, hypothesis_test_suite_dir = future_hypothesis_tests.result()
+        self.hypothesis_tests_dir = hypothesis_test_suite_dir
 
         count_tests = len(tests)
         if concolic_test_str:
@@ -2051,7 +2057,11 @@ def cleanup_generated_files(self) -> None:
             paths_to_cleanup.append(test_file.instrumented_behavior_file_path)
             paths_to_cleanup.append(test_file.benchmarking_file_path)
 
+        if self.hypothesis_tests_dir and self.hypothesis_tests_dir.exists():
+            paths_to_cleanup.append(self.hypothesis_tests_dir)
+
         cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dir = None
 
     def get_test_env(
         self, codeflash_loop_index: int, codeflash_test_iteration: int, codeflash_tracer_disable: int = 1
diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py
index c0e0b014b..398db1a47 100644
--- a/codeflash/optimization/optimizer.py
+++ b/codeflash/optimization/optimizer.py
@@ -53,6 +53,7 @@ def __init__(self, args: Namespace) -> None:
         self.experiment_id = os.getenv("CODEFLASH_EXPERIMENT_ID", None)
         self.local_aiservice_client = LocalAiServiceClient() if self.experiment_id else None
         self.replay_tests_dir = None
+        self.hypothesis_tests_dirs: list[Path] = []  # Track all hypothesis test directories
         self.functions_checkpoint: CodeflashRunCheckpoint | None = None
         self.current_function_being_optimized: FunctionToOptimize | None = None  # current only for the LSP
         self.current_function_optimizer: FunctionOptimizer | None = None
@@ -337,6 +338,8 @@ def run(self) -> None:
                             function_optimizer  # needed to clean up from the outside of this function
                         )
                         best_optimization = function_optimizer.optimize_function()
+                        if function_optimizer.hypothesis_tests_dir:
+                            self.hypothesis_tests_dirs.append(function_optimizer.hypothesis_tests_dir)
                         if self.functions_checkpoint:
                             self.functions_checkpoint.add_function_to_checkpoint(
                                 function_to_optimize.qualified_name_with_modules_from_root(self.args.project_root)
@@ -430,7 +433,11 @@ def cleanup_temporary_paths(self) -> None:
 
         if self.current_function_optimizer:
             self.current_function_optimizer.cleanup_generated_files()
-        cleanup_paths([self.test_cfg.concolic_test_root_dir, self.replay_tests_dir])
+
+        paths_to_cleanup = [self.test_cfg.concolic_test_root_dir, self.replay_tests_dir]
+        paths_to_cleanup.extend(self.hypothesis_tests_dirs)
+        cleanup_paths(paths_to_cleanup)
+        self.hypothesis_tests_dirs.clear()
 
     def worktree_mode(self) -> None:
         if self.current_worktree:
diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
index efc222acb..66ed7e2b4 100644
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@@ -167,16 +167,6 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st
         f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)"
     )
 
-    # Check if all test functions in original are present in candidate
-    missing_funcs = set(original_by_func.keys()) - set(candidate_by_func.keys())
-    if missing_funcs:
-        logger.warning(
-            f"Hypothesis test functions missing in candidate: {len(missing_funcs)} functions. "
-            f"First missing: {missing_funcs.__iter__().__next__()}"
-        )
-        return False
-
-    # Compare each test function's results
     for test_key in original_by_func:
         if test_key not in candidate_by_func:
             continue  # Already handled above
@@ -196,12 +186,4 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st
                 f"(original_failed={orig_had_failure}, candidate_failed={cand_had_failure})"
             )
             return False
-
-        if abs(len(orig_examples) - len(cand_examples)) > 10:
-            logger.info(
-                f"Hypothesis test '{test_key[2]}': example counts differ "
-                f"(original={len(orig_examples)}, candidate={len(cand_examples)}). "
-                f"This is expected when code performance changes."
-            )
-
     return True
diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py
index 13007f7af..9d213c4b4 100644
--- a/codeflash/verification/hypothesis_testing.py
+++ b/codeflash/verification/hypothesis_testing.py
@@ -182,7 +182,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call:
 
 def generate_hypothesis_tests(
     test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
-) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], str, Path | None]:
     """Generate property-based tests using Hypothesis ghostwriter.
 
     This function:
@@ -193,12 +193,14 @@ def generate_hypothesis_tests(
     5. Formats the tests with the project formatter
 
     Returns:
-        Tuple of (function_to_tests_map, test_suite_code)
+        Tuple of (function_to_tests_map, test_suite_code, hypothesis_test_suite_dir)
+        The hypothesis_test_suite_dir is None if no tests were generated.
 
     """
     start_time = time.perf_counter()
     function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {}
     hypothesis_test_suite_code: str = ""
+    hypothesis_test_suite_dir: Path | None = None
 
     if (
         test_cfg.project_root_path
@@ -212,8 +214,6 @@ def generate_hypothesis_tests(
             qualified_function_path = get_qualified_function_path(
                 function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name
             )
-            logger.info(f"command: hypothesis write {qualified_function_path}")
-
             hypothesis_result = subprocess.run(
                 ["hypothesis", "write", qualified_function_path],
                 capture_output=True,
@@ -226,11 +226,11 @@ def generate_hypothesis_tests(
             logger.debug("Hypothesis test generation timed out")
             end_time = time.perf_counter()
             logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         if hypothesis_result.returncode == 0:
             hypothesis_test_suite_code = hypothesis_result.stdout
-            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root))
+            hypothesis_test_suite_dir = Path(tempfile.mkdtemp(prefix="codeflash_hypothesis_", dir=test_cfg.tests_root))
             hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py"
             hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8")
 
@@ -250,12 +250,11 @@ def generate_hypothesis_tests(
 
             unparsed = filter_hypothesis_tests_by_function_name(original_code, function_to_optimize.function_name)
 
-            console.print(f"modified src: {unparsed}")
-
             hypothesis_test_suite_code = format_code(
                 args.formatter_cmds,
                 hypothesis_path,
                 optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)),
+                print_status=False,
             )
             with hypothesis_path.open("w", encoding="utf-8") as f:
                 f.write(hypothesis_test_suite_code)
@@ -269,7 +268,7 @@ def generate_hypothesis_tests(
             console.rule()
             end_time = time.perf_counter()
             logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds")
-            return function_to_hypothesis_tests, hypothesis_test_suite_code
+            return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir
 
         logger.debug(
             f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}"
@@ -278,4 +277,4 @@ def generate_hypothesis_tests(
 
     end_time = time.perf_counter()
     logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds")
-    return function_to_hypothesis_tests, hypothesis_test_suite_code
+    return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir

From 51cfe7caca584ede6300fbf888e84f82bd492f10 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sun, 26 Oct 2025 15:06:57 -0500
Subject: [PATCH 12/16] formatting

---
 codeflash/code_utils/env_utils.py          | 2 +-
 codeflash/discovery/discover_unit_tests.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/codeflash/code_utils/env_utils.py b/codeflash/code_utils/env_utils.py
index 749c49676..4200edb7d 100644
--- a/codeflash/code_utils/env_utils.py
+++ b/codeflash/code_utils/env_utils.py
@@ -160,4 +160,4 @@ def is_ci() -> bool:
 def is_pr_draft() -> bool:
     """Check if the PR is draft. in the github action context."""
     event = get_cached_gh_event_data()
-    return bool(event.get("pull_request", {}).get("draft", False))
\ No newline at end of file
+    return bool(event.get("pull_request", {}).get("draft", False))
diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py
index 1f23eaf5b..ffd66c6b9 100644
--- a/codeflash/discovery/discover_unit_tests.py
+++ b/codeflash/discovery/discover_unit_tests.py
@@ -9,7 +9,6 @@
 import re
 import sqlite3
 import subprocess
-import sys
 import unittest
 from collections import defaultdict
 from pathlib import Path

From 6968ab391aa64b2d6721d67d552a0bb7f9b4ec00 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sun, 26 Oct 2025 15:12:08 -0500
Subject: [PATCH 13/16] exact tests

---
 tests/test_hypothesis_testing.py | 54 +++++++++++---------------------
 1 file changed, 18 insertions(+), 36 deletions(-)

diff --git a/tests/test_hypothesis_testing.py b/tests/test_hypothesis_testing.py
index 49fff9515..d44b2413a 100644
--- a/tests/test_hypothesis_testing.py
+++ b/tests/test_hypothesis_testing.py
@@ -12,8 +12,9 @@ def test_adds_derandomize_decorator():
 def test_x(x):
     assert isinstance(x, int)
 """
+    expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=-10000, max_value=10000))\n@settings(derandomize=True)\ndef test_x(x):\n    assert isinstance(x, int)"""
     out = make_hypothesis_tests_deterministic(src)
-    assert "@settings(derandomize=True)" in out or "settings(derandomize=True)" in out
+    assert out == expected
 
 
 def test_integers_constrained_with_negatives():
@@ -23,11 +24,9 @@ def test_integers_constrained_with_negatives():
 def t(x):
     pass
 """
+    expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=-10000, max_value=10000))\n@settings(derandomize=True)\ndef t(x):\n    pass"""
     out = make_hypothesis_tests_deterministic(src)
-    # Remove spaces for easier checking
-    normalized = out.replace(" ", "").replace("\n", "")
-    assert "min_value=-10000" in normalized
-    assert "max_value=10000" in normalized
+    assert out == expected
 
 
 def test_floats_constrained_to_finite():
@@ -37,11 +36,9 @@ def test_floats_constrained_to_finite():
 def t(x):
     pass
 """
+    expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.floats(min_value=-1000000.0, max_value=1000000.0, allow_nan=False, allow_infinity=False))\n@settings(derandomize=True)\ndef t(x):\n    pass"""
     out = make_hypothesis_tests_deterministic(src)
-    normalized = out.replace(" ", "").replace("\n", "")
-    assert "allow_nan=False" in normalized
-    assert "allow_infinity=False" in normalized
-    assert "min_value=" in normalized and "max_value=" in normalized
+    assert out == expected
 
 
 def test_existing_constraints_not_overridden():
@@ -53,14 +50,9 @@ def test_existing_constraints_not_overridden():
 def t(x):
     pass
 """
+    expected = """from hypothesis import given, strategies as st, settings\n\n@settings(derandomize=True, max_examples=5)\n@given(x=st.integers(min_value=-5, max_value=5))\ndef t(x):\n    pass"""
     out = make_hypothesis_tests_deterministic(src)
-    # Should not add duplicate settings decorator
-    assert out.count("@settings") == 1
-    # Should preserve original constraints
-    assert "min_value=-5" in out or "min_value= -5" in out
-    assert "max_value=5" in out or "max_value= 5" in out
-    # Should not add the default -10000/10000 bounds
-    assert "-10000" not in out
+    assert out == expected
 
 
 def test_existing_float_constraints_preserved():
@@ -71,11 +63,9 @@ def test_existing_float_constraints_preserved():
 def t(y):
     pass
 """
+    expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(y=st.floats(min_value=-1.0, max_value=1.0, allow_nan=False, allow_infinity=False))\n@settings(derandomize=True)\ndef t(y):\n    pass"""
     out = make_hypothesis_tests_deterministic(src)
-    assert "min_value=-1.0" in out or "min_value= -1.0" in out
-    assert "max_value=1.0" in out or "max_value= 1.0" in out
-    # Should not add the default 1e6 bounds
-    assert "1e6" not in out and "1000000" not in out
+    assert out == expected
 
 
 def test_idempotency():
@@ -99,14 +89,9 @@ def test_multiple_strategies_handled():
 def test_multi(a, b, c):
     pass
 """
+    expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(a=st.integers(min_value=-10000, max_value=10000), b=st.integers(min_value=-10000, max_value=10000), c=st.floats(min_value=-1000000.0, max_value=1000000.0, allow_nan=False, allow_infinity=False))\n@settings(derandomize=True)\ndef test_multi(a, b, c):\n    pass"""
     out = make_hypothesis_tests_deterministic(src)
-    normalized = out.replace(" ", "").replace("\n", "")
-    # All integers should be constrained
-    assert normalized.count("min_value=-10000") >= 2
-    assert normalized.count("max_value=10000") >= 2
-    # Float should be constrained
-    assert "allow_nan=False" in normalized
-    assert "allow_infinity=False" in normalized
+    assert out == expected
 
 
 def test_settings_import_added_if_missing():
@@ -117,9 +102,9 @@ def test_settings_import_added_if_missing():
 def test_x(x):
     pass
 """
+    expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=-10000, max_value=10000))\n@settings(derandomize=True)\ndef test_x(x):\n    pass"""
     out = make_hypothesis_tests_deterministic(src)
-    # Should have settings import or settings in existing import
-    assert "settings" in out
+    assert out == expected
 
 
 def test_partial_constraints_completed():
@@ -130,11 +115,9 @@ def test_partial_constraints_completed():
 def test_x(x):
     pass
 """
+    expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=100))\n@settings(derandomize=True)\ndef test_x(x):\n    pass"""
     out = make_hypothesis_tests_deterministic(src)
-    # Should keep the min_value=100 and not override
-    assert "min_value=100" in out or "min_value= 100" in out
-    # Should not add default bounds since min_value exists
-    assert "-10000" not in out
+    assert out == expected
 
 
 def test_syntax_error_returns_original():
@@ -152,7 +135,6 @@ def test_no_hypothesis_code_unchanged():
 def test_regular():
     assert regular_function(2) == 4
 """
+    expected = """from hypothesis import settings\n\n@settings(derandomize=True)\ndef regular_function(x):\n    return x * 2\n\n@settings(derandomize=True)\ndef test_regular():\n    assert regular_function(2) == 4"""
     out = make_hypothesis_tests_deterministic(src)
-    # Should still parse and return valid code
-    assert "def regular_function" in out
-    assert "def test_regular" in out
+    assert out == expected

From c9f64830c394fb6b6fe831a2d916d2340919ac9d Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]"
 <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Sun, 26 Oct 2025 20:37:44 +0000
Subject: [PATCH 14/16] Optimize _compare_hypothesis_tests_semantic

The optimized code achieves a **32% speedup** by eliminating redundant data structures and reducing iteration overhead through two key optimizations:

**1. Single-pass aggregation instead of list accumulation:**
- **Original**: Uses `defaultdict(list)` to collect all `FunctionTestInvocation` objects per test function, then later iterates through these lists to compute failure flags with `any(not ex.did_pass for ex in orig_examples)`
- **Optimized**: Uses plain dicts with 2-element lists `[count, had_failure]` to track both example count and failure status in a single pass, eliminating the need to store individual test objects or re-scan them

**2. Reduced memory allocation and access patterns:**
- **Original**: Creates and stores complete lists of test objects (up to 9,458 objects in large test cases), then performs expensive `any()` operations over these lists
- **Optimized**: Uses compact 2-item lists per test function, avoiding object accumulation and expensive linear scans

The line profiler shows the key performance gains:
- Lines with `any(not ex.did_pass...)` in original (10.1% and 10.2% of total time) are completely eliminated
- The `setdefault()` operations replace the more expensive `defaultdict(list).append()` calls
- Overall reduction from storing ~9,458 objects to just tracking summary statistics

**Best performance gains** occur in test cases with:
- **Large numbers of examples per test function** (up to 105% faster for `test_large_scale_all_fail`)
- **Many distinct test functions** (up to 75% faster for `test_large_scale_some_failures`)
- **Mixed pass/fail scenarios** where the original's `any()` operations were most expensive

The optimization maintains identical behavior while reducing per-group memory from O(examples) to O(1) and eliminating the second scan over each group's examples; the overall work remains a single O(examples) pass over the original and candidate results.
---
 codeflash/verification/equivalence.py | 45 +++++++++++++--------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
index 66ed7e2b4..1bcf4e47e 100644
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@@ -1,5 +1,4 @@
 import sys
-from collections import defaultdict
 
 from codeflash.cli_cmds.console import logger
 from codeflash.models.models import FunctionTestInvocation, TestResults, TestType, VerificationType
@@ -138,7 +137,6 @@ def _compare_hypothesis_tests_semantic(original_hypothesis: list, candidate_hypo
     not how many examples Hypothesis generated.
     """
 
-    # Group by test function (excluding loop index and iteration_id from comparison)
     def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, str]:
         """Get unique key for a Hypothesis test function."""
         return (
@@ -148,38 +146,39 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st
             test_result.id.function_getting_tested,
         )
 
-    # Group original results by test function
-    original_by_func = defaultdict(list)
+    # Group by test function and simultaneously collect failure flag and example count
+    orig_by_func = {}
     for result in original_hypothesis:
-        original_by_func[get_test_key(result)].append(result)
+        test_key = get_test_key(result)
+        group = orig_by_func.setdefault(test_key, [0, False])  # [count, had_failure]
+        group[0] += 1
+        if not result.did_pass:
+            group[1] = True
 
-    # Group candidate results by test function
-    candidate_by_func = defaultdict(list)
+    cand_by_func = {}
     for result in candidate_hypothesis:
-        candidate_by_func[get_test_key(result)].append(result)
+        test_key = get_test_key(result)
+        group = cand_by_func.setdefault(test_key, [0, False])  # [count, had_failure]
+        group[0] += 1
+        if not result.did_pass:
+            group[1] = True
 
-    # Log summary statistics
-    orig_total_examples = sum(len(examples) for examples in original_by_func.values())
-    cand_total_examples = sum(len(examples) for examples in candidate_by_func.values())
+    orig_total_examples = sum(group[0] for group in orig_by_func.values())
+    cand_total_examples = sum(group[0] for group in cand_by_func.values())
 
     logger.debug(
-        f"Hypothesis comparison: Original={len(original_by_func)} test functions ({orig_total_examples} examples), "
-        f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)"
+        f"Hypothesis comparison: Original={len(orig_by_func)} test functions ({orig_total_examples} examples), "
+        f"Candidate={len(cand_by_func)} test functions ({cand_total_examples} examples)"
     )
 
-    for test_key in original_by_func:
-        if test_key not in candidate_by_func:
+    # Compare only for test_keys present in original
+    for test_key, (orig_count, orig_had_failure) in orig_by_func.items():
+        cand_group = cand_by_func.get(test_key)
+        if cand_group is None:
             continue  # Already handled above
 
-        orig_examples = original_by_func[test_key]
-        cand_examples = candidate_by_func[test_key]
+        cand_had_failure = cand_group[1]
 
-        # Check if any original example failed
-        orig_had_failure = any(not ex.did_pass for ex in orig_examples)
-        cand_had_failure = any(not ex.did_pass for ex in cand_examples)
-
-        # If original had failures, candidate must also have failures (or be missing, already handled)
-        # If original passed, candidate must pass (but can have different example counts)
         if orig_had_failure != cand_had_failure:
             logger.debug(
                 f"Hypothesis test function behavior mismatch: {test_key} "

From 8fb7c1e56547de98826928b00e1f5b857f5ee2b9 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <106575910+KRRT7@users.noreply.github.com>
Date: Sun, 26 Oct 2025 17:49:54 -0500
Subject: [PATCH 15/16] Update codeflash/discovery/discover_unit_tests.py

Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
---
 codeflash/discovery/discover_unit_tests.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py
index ffd66c6b9..55cc9f33c 100644
--- a/codeflash/discovery/discover_unit_tests.py
+++ b/codeflash/discovery/discover_unit_tests.py
@@ -71,10 +71,11 @@ def _extract_dotted_call_name(node: ast.expr) -> str | None:
     parts = []
     current = node
     while isinstance(current, ast.Attribute):
-        parts.insert(0, current.attr)
+        parts.append(current.attr)
         current = current.value
     if isinstance(current, ast.Name):
-        parts.insert(0, current.id)
+        parts.append(current.id)
+        parts.reverse()
         return ".".join(parts) if parts else None
     return None
 

From 01a189aab18fce2323a4b82d4d5a226b27159c49 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Sun, 26 Oct 2025 17:51:24 -0500
Subject: [PATCH 16/16] fix linter

---
 codeflash/verification/equivalence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
index 1bcf4e47e..89b0d9b6a 100644
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@@ -172,7 +172,7 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st
     )
 
     # Compare only for test_keys present in original
-    for test_key, (orig_count, orig_had_failure) in orig_by_func.items():
+    for test_key, (_orig_count, orig_had_failure) in orig_by_func.items():
         cand_group = cand_by_func.get(test_key)
         if cand_group is None:
             continue  # Already handled above