From 389b32c9b2cfc944568c7022b5fab8332cdd6990 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Wed, 22 Oct 2025 02:30:17 -0500 Subject: [PATCH 01/16] first pass at hypothesis integration --- codeflash/code_utils/code_utils.py | 5 + codeflash/discovery/discover_unit_tests.py | 4 + codeflash/models/test_type.py | 2 + codeflash/optimization/function_optimizer.py | 77 +++++-- codeflash/verification/concolic_testing.py | 14 +- codeflash/verification/hypothesis_testing.py | 207 +++++++++++++++++++ pyproject.toml | 1 + uv.lock | 50 +++++ 8 files changed, 336 insertions(+), 24 deletions(-) create mode 100644 codeflash/verification/hypothesis_testing.py diff --git a/codeflash/code_utils/code_utils.py b/codeflash/code_utils/code_utils.py index 82e122d0e..8f1cca998 100644 --- a/codeflash/code_utils/code_utils.py +++ b/codeflash/code_utils/code_utils.py @@ -254,6 +254,11 @@ def module_name_from_file_path(file_path: Path, project_root_path: Path, *, trav raise ValueError(msg) # noqa: B904 +def get_qualified_function_path(file_path: Path, project_root_path: Path, qualified_name: str) -> str: + module_path = file_path.relative_to(project_root_path).with_suffix("").as_posix().replace("/", ".") + return f"{module_path}.{qualified_name}" + + def file_path_from_module_name(module_name: str, project_root_path: Path) -> Path: """Get file path from module path.""" return project_root_path / (module_name.replace(".", os.sep) + ".py") diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py index 398efe461..d7c1c0a83 100644 --- a/codeflash/discovery/discover_unit_tests.py +++ b/codeflash/discovery/discover_unit_tests.py @@ -497,6 +497,8 @@ def discover_tests_pytest( test_type = TestType.REPLAY_TEST elif "test_concolic_coverage" in test["test_file"]: test_type = TestType.CONCOLIC_COVERAGE_TEST + elif "test_hypothesis" in test["test_file"]: + test_type = TestType.HYPOTHESIS_TEST else: test_type = TestType.EXISTING_UNIT_TEST @@ -540,6 +542,8 @@ 
def get_test_details(_test: unittest.TestCase) -> TestsInFile | None: test_type = TestType.REPLAY_TEST elif "test_concolic_coverage" in str(_test_module_path): test_type = TestType.CONCOLIC_COVERAGE_TEST + elif "test_hypothesis" in str(_test_module_path): + test_type = TestType.HYPOTHESIS_TEST else: test_type = TestType.EXISTING_UNIT_TEST return TestsInFile( diff --git a/codeflash/models/test_type.py b/codeflash/models/test_type.py index 103a3bc4d..f30089967 100644 --- a/codeflash/models/test_type.py +++ b/codeflash/models/test_type.py @@ -8,6 +8,7 @@ class TestType(Enum): REPLAY_TEST = 4 CONCOLIC_COVERAGE_TEST = 5 INIT_STATE_TEST = 6 + HYPOTHESIS_TEST = 7 def to_name(self) -> str: if self is TestType.INIT_STATE_TEST: @@ -18,5 +19,6 @@ def to_name(self) -> str: TestType.GENERATED_REGRESSION: "šŸŒ€ Generated Regression Tests", TestType.REPLAY_TEST: "āŖ Replay Tests", TestType.CONCOLIC_COVERAGE_TEST: "šŸ”Ž Concolic Coverage Tests", + TestType.HYPOTHESIS_TEST: "šŸ”® Hypothesis Tests", } return names[self] diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 86e9bf33f..99aeed6d2 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -95,6 +95,7 @@ from codeflash.telemetry.posthog_cf import ph from codeflash.verification.concolic_testing import generate_concolic_tests from codeflash.verification.equivalence import compare_test_results +from codeflash.verification.hypothesis_testing import generate_hypothesis_tests from codeflash.verification.instrument_codeflash_capture import instrument_codeflash_capture from codeflash.verification.parse_line_profile_test_output import parse_line_profile_results from codeflash.verification.parse_test_output import calculate_function_throughput_from_test_results, parse_test_results @@ -281,6 +282,8 @@ def generate_and_instrument_tests( GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str, + dict[str, 
set[FunctionCalledInTest]], + str, OptimizationSet, list[Path], list[Path], @@ -323,9 +326,15 @@ def generate_and_instrument_tests( generated_tests: GeneratedTestsList optimizations_set: OptimizationSet - count_tests, generated_tests, function_to_concolic_tests, concolic_test_str, optimizations_set = ( - generated_results.unwrap() - ) + ( + count_tests, + generated_tests, + function_to_concolic_tests, + concolic_test_str, + function_to_hypothesis_tests, + hypothesis_test_str, + optimizations_set, + ) = generated_results.unwrap() for i, generated_test in enumerate(generated_tests.generated_tests): with generated_test.behavior_file_path.open("w", encoding="utf8") as f: @@ -345,12 +354,19 @@ def generate_and_instrument_tests( logger.info(f"Generated test {i + 1}/{count_tests}:") code_print(generated_test.generated_original_test_source, file_name=f"test_{i + 1}.py") if concolic_test_str: - logger.info(f"Generated test {count_tests}/{count_tests}:") + logger.info(f"Generated test {count_tests - (1 if hypothesis_test_str else 0)}/{count_tests}:") code_print(concolic_test_str) + if hypothesis_test_str: + logger.info(f"Generated test {count_tests}/{count_tests}:") + code_print(hypothesis_test_str) function_to_all_tests = { - key: self.function_to_tests.get(key, set()) | function_to_concolic_tests.get(key, set()) - for key in set(self.function_to_tests) | set(function_to_concolic_tests) + key: ( + self.function_to_tests.get(key, set()) + | function_to_concolic_tests.get(key, set()) + | function_to_hypothesis_tests.get(key, set()) + ) + for key in set(self.function_to_tests) | set(function_to_concolic_tests) | set(function_to_hypothesis_tests) } instrumented_unittests_created_for_function = self.instrument_existing_tests(function_to_all_tests) @@ -366,6 +382,8 @@ def generate_and_instrument_tests( generated_tests, function_to_concolic_tests, concolic_test_str, + function_to_hypothesis_tests, + hypothesis_test_str, optimizations_set, generated_test_paths, 
generated_perf_test_paths, @@ -398,6 +416,8 @@ def optimize_function(self) -> Result[BestOptimization, str]: generated_tests, function_to_concolic_tests, concolic_test_str, + function_to_hypothesis_tests, + _hypothesis_test_str, optimizations_set, generated_test_paths, generated_perf_test_paths, @@ -409,6 +429,7 @@ def optimize_function(self) -> Result[BestOptimization, str]: code_context=code_context, original_helper_code=original_helper_code, function_to_concolic_tests=function_to_concolic_tests, + function_to_hypothesis_tests=function_to_hypothesis_tests, generated_test_paths=generated_test_paths, generated_perf_test_paths=generated_perf_test_paths, instrumented_unittests_created_for_function=instrumented_unittests_created_for_function, @@ -991,6 +1012,7 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio existing_test_files_count = 0 replay_test_files_count = 0 concolic_coverage_test_files_count = 0 + hypothesis_test_files_count = 0 unique_instrumented_test_files = set() func_qualname = self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root) @@ -1011,6 +1033,8 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio replay_test_files_count += 1 elif test_type == TestType.CONCOLIC_COVERAGE_TEST: concolic_coverage_test_files_count += 1 + elif test_type == TestType.HYPOTHESIS_TEST: + hypothesis_test_files_count += 1 else: msg = f"Unexpected test type: {test_type}" raise ValueError(msg) @@ -1069,9 +1093,11 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio logger.info( f"Discovered {existing_test_files_count} existing unit test file" f"{'s' if existing_test_files_count != 1 else ''}, {replay_test_files_count} replay test file" - f"{'s' if replay_test_files_count != 1 else ''}, and " + f"{'s' if replay_test_files_count != 1 else ''}, " f"{concolic_coverage_test_files_count} concolic coverage test file" - f"{'s' if concolic_coverage_test_files_count 
!= 1 else ''} for {func_qualname}" + f"{'s' if concolic_coverage_test_files_count != 1 else ''}, and " + f"{hypothesis_test_files_count} hypothesis test file" + f"{'s' if hypothesis_test_files_count != 1 else ''} for {func_qualname}" ) console.rule() return unique_instrumented_test_files @@ -1085,7 +1111,15 @@ def generate_tests_and_optimizations( generated_test_paths: list[Path], generated_perf_test_paths: list[Path], run_experiment: bool = False, # noqa: FBT001, FBT002 - ) -> Result[tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet], str]: + ) -> Result[ + tuple[ + GeneratedTestsList, + dict[str, set[FunctionCalledInTest]], + dict[str, set[FunctionCalledInTest]], + OptimizationSet, + ], + str, + ]: n_tests = N_TESTS_TO_GENERATE_EFFECTIVE assert len(generated_test_paths) == n_tests console.rule() @@ -1112,7 +1146,10 @@ def generate_tests_and_optimizations( future_concolic_tests = self.executor.submit( generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast ) - futures = [*future_tests, future_optimization_candidates, future_concolic_tests] + future_hypothesis_tests = self.executor.submit( + generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast + ) + futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests] if run_experiment: future_candidates_exp = self.executor.submit( self.local_aiservice_client.optimize_python_code, @@ -1164,29 +1201,35 @@ def generate_tests_and_optimizations( logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}") return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}") function_to_concolic_tests, concolic_test_str = future_concolic_tests.result() + function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result() count_tests = len(tests) if concolic_test_str: 
count_tests += 1 + if hypothesis_test_str: + count_tests += 1 logger.info(f"Generated '{count_tests}' tests for {self.function_to_optimize.function_name}") console.rule() generated_tests = GeneratedTestsList(generated_tests=tests) - result = ( + + self.generate_and_instrument_tests_results = ( count_tests, generated_tests, function_to_concolic_tests, concolic_test_str, + function_to_hypothesis_tests, + hypothesis_test_str, OptimizationSet(control=candidates, experiment=candidates_experiment), ) - self.generate_and_instrument_tests_results = result - return Success(result) + return Success(self.generate_and_instrument_tests_results) def setup_and_establish_baseline( self, code_context: CodeOptimizationContext, original_helper_code: dict[Path, str], function_to_concolic_tests: dict[str, set[FunctionCalledInTest]], + function_to_hypothesis_tests: dict[str, set[FunctionCalledInTest]], generated_test_paths: list[Path], generated_perf_test_paths: list[Path], instrumented_unittests_created_for_function: set[Path], @@ -1197,8 +1240,12 @@ def setup_and_establish_baseline( """Set up baseline context and establish original code baseline.""" function_to_optimize_qualified_name = self.function_to_optimize.qualified_name function_to_all_tests = { - key: self.function_to_tests.get(key, set()) | function_to_concolic_tests.get(key, set()) - for key in set(self.function_to_tests) | set(function_to_concolic_tests) + key: ( + self.function_to_tests.get(key, set()) + | function_to_concolic_tests.get(key, set()) + | function_to_hypothesis_tests.get(key, set()) + ) + for key in set(self.function_to_tests) | set(function_to_concolic_tests) | set(function_to_hypothesis_tests) } # Get a dict of file_path_to_classes of fto and helpers_of_fto diff --git a/codeflash/verification/concolic_testing.py b/codeflash/verification/concolic_testing.py index 8f30a1562..e17f5c01f 100644 --- a/codeflash/verification/concolic_testing.py +++ b/codeflash/verification/concolic_testing.py @@ -8,6 +8,7 @@ from 
typing import TYPE_CHECKING from codeflash.cli_cmds.console import console, logger +from codeflash.code_utils.code_utils import get_qualified_function_path from codeflash.code_utils.compat import SAFE_SYS_EXECUTABLE from codeflash.code_utils.concolic_utils import clean_concolic_tests from codeflash.code_utils.static_analysis import has_typed_parameters @@ -42,6 +43,9 @@ def generate_concolic_tests( logger.info("Generating concolic opcode coverage tests for the original code…") console.rule() try: + qualified_function_path = get_qualified_function_path( + function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name + ) cover_result = subprocess.run( [ SAFE_SYS_EXECUTABLE, @@ -50,15 +54,7 @@ def generate_concolic_tests( "cover", "--example_output_format=pytest", "--per_condition_timeout=20", - ".".join( - [ - function_to_optimize.file_path.relative_to(args.project_root) - .with_suffix("") - .as_posix() - .replace("/", "."), - function_to_optimize.qualified_name, - ] - ), + qualified_function_path, ], capture_output=True, text=True, diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py new file mode 100644 index 000000000..c716ca739 --- /dev/null +++ b/codeflash/verification/hypothesis_testing.py @@ -0,0 +1,207 @@ +from __future__ import annotations + +import ast +import subprocess +import tempfile +import time +from pathlib import Path +from typing import TYPE_CHECKING + +from codeflash.cli_cmds.console import console, logger +from codeflash.code_utils.code_utils import get_qualified_function_path +from codeflash.code_utils.formatter import format_code +from codeflash.code_utils.static_analysis import has_typed_parameters +from codeflash.discovery.discover_unit_tests import discover_unit_tests +from codeflash.verification.verification_utils import TestConfig + +if TYPE_CHECKING: + from argparse import Namespace + + from codeflash.discovery.functions_to_optimize import FunctionToOptimize + 
from codeflash.models.models import FunctionCalledInTest + + +def remove_functions_with_only_any_type(code_string: str) -> str: + """Remove functions that have only Any type annotations. + + This filters out functions where all parameters are annotated with typing.Any, + as these don't provide useful type information for property-based testing. + """ + tree = ast.parse(code_string) + new_body = [] + + for node in tree.body: + if isinstance(node, (ast.Import, ast.ImportFrom)): + new_body.append(node) + elif isinstance(node, ast.FunctionDef): + all_any = True + has_args = False + + for arg in node.args.args: + has_args = True + if arg.annotation: + if isinstance(arg.annotation, ast.Name): + if arg.annotation.id != "Any": + all_any = False + elif isinstance(arg.annotation, ast.Attribute): + if arg.annotation.attr != "Any": + all_any = False + elif isinstance(arg.annotation, ast.Subscript): + all_any = False + else: + all_any = False + else: + all_any = False + + if (has_args and not all_any) or not has_args: + new_body.append(node) + + else: + new_body.append(node) + + new_tree = ast.Module(body=new_body, type_ignores=[]) + return ast.unparse(new_tree) + + +def make_hypothesis_tests_deterministic(code: str) -> str: + """Add @settings(derandomize=True) decorator to make Hypothesis tests deterministic.""" + try: + tree = ast.parse(code) + except SyntaxError: + return code + + settings_imported = any( + isinstance(node, ast.ImportFrom) + and node.module == "hypothesis" + and any(alias.name == "settings" for alias in node.names) + for node in tree.body + ) + + if not settings_imported: + tree.body.insert(0, ast.parse("from hypothesis import settings").body[0]) + + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + settings_decorator = next( + ( + d + for d in node.decorator_list + if isinstance(d, ast.Call) and isinstance(d.func, ast.Name) and d.func.id == "settings" + ), + None, + ) + + if settings_decorator: + if not any(k.arg == "derandomize" for k 
in settings_decorator.keywords): + settings_decorator.keywords.append(ast.keyword(arg="derandomize", value=ast.Constant(value=True))) + else: + node.decorator_list.append( + ast.Call( + func=ast.Name(id="settings", ctx=ast.Load()), + args=[], + keywords=[ast.keyword(arg="derandomize", value=ast.Constant(value=True))], + ) + ) + + return ast.unparse(tree) + + +def generate_hypothesis_tests( + test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST +) -> tuple[dict[str, list[FunctionCalledInTest]], str]: + """Generate property-based tests using Hypothesis ghostwriter. + + This function: + 1. Uses Hypothesis CLI to generate property-based tests for the target function + 2. Filters generated tests to only include the target function + 3. Removes functions with only Any type annotations + 4. Makes tests deterministic by adding @settings(derandomize=True) + 5. Formats the tests with the project formatter + + Returns: + Tuple of (function_to_tests_map, test_suite_code) + + """ + start_time = time.perf_counter() + function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {} + hypothesis_test_suite_code: str = "" + + if ( + test_cfg.project_root_path + and isinstance(function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef)) + and has_typed_parameters(function_to_optimize_ast, function_to_optimize.parents) + ): + logger.info("Generating Hypothesis tests for the original code…") + console.rule() + + try: + qualified_function_path = get_qualified_function_path( + function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name + ) + logger.info(f"command: hypothesis write {function_to_optimize.file_path.stem}") + + hypothesis_result = subprocess.run( + ["hypothesis", "write", qualified_function_path], + capture_output=True, + text=True, + cwd=args.project_root, + check=False, + timeout=60, + ) + except subprocess.TimeoutExpired: + logger.debug("Hypothesis test generation timed 
out") + end_time = time.perf_counter() + logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") + return function_to_hypothesis_tests, hypothesis_test_suite_code + + if hypothesis_result.returncode == 0: + hypothesis_test_suite_code = hypothesis_result.stdout + hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root)) + hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py" + hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8") + + hypothesis_config = TestConfig( + tests_root=hypothesis_test_suite_dir, + tests_project_rootdir=test_cfg.tests_project_rootdir, + project_root_path=args.project_root, + test_framework=args.test_framework, + pytest_cmd=args.pytest_cmd, + ) + function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config) + with hypothesis_path.open("r", encoding="utf-8") as f: + tree = ast.parse(f.read()) + + class TestFunctionRemover(ast.NodeTransformer): + def visit_FunctionDef(self, node): # noqa: ANN001, ANN202 + if function_to_optimize.function_name not in node.name: + return None + return node + + modified_tree = TestFunctionRemover().visit(tree) + ast.fix_missing_locations(modified_tree) + unparsed = ast.unparse(modified_tree) + + hypothesis_test_suite_code = format_code( + make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)), + function_to_optimize.file_path, + ) + with hypothesis_path.open("w", encoding="utf-8") as f: + f.write(hypothesis_test_suite_code) + function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config) + logger.info( + f"Created {num_discovered_hypothesis_tests} " + f"hypothesis unit test case{'s' if num_discovered_hypothesis_tests != 1 else ''} " + ) + console.rule() + end_time = time.perf_counter() + logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds") + return function_to_hypothesis_tests, 
hypothesis_test_suite_code + + logger.debug( + f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}" + ) + console.rule() + + end_time = time.perf_counter() + logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") + return function_to_hypothesis_tests, hypothesis_test_suite_code diff --git a/pyproject.toml b/pyproject.toml index 1186574c0..911b2728f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "pygls>=2.0.0,<3.0.0", "codeflash-benchmark", "filelock", + "hypothesis>=6.141.1", ] [project.urls] diff --git a/uv.lock b/uv.lock index 0d99bdf15..6d3800cbd 100644 --- a/uv.lock +++ b/uv.lock @@ -309,6 +309,8 @@ dependencies = [ { name = "filelock", version = "3.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "gitpython" }, { name = "humanize" }, + { name = "hypothesis", version = "6.141.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "hypothesis", version = "6.142.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "inquirer", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9.2'" }, { name = "inquirer", version = "3.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9.2'" }, { name = "isort", version = "6.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -399,6 +401,7 @@ requires-dist = [ { name = "filelock" }, { name = "gitpython", specifier = ">=3.1.31" }, { name = "humanize", specifier = ">=4.0.0" }, + { name = "hypothesis", specifier = ">=6.141.1" }, { name = "inquirer", specifier = ">=3.0.0" }, { name = "isort", specifier = ">=5.11.0" }, { name = "jedi", specifier = ">=0.19.1" }, @@ -791,6 +794,44 @@ wheels = [ { url 
= "https://files.pythonhosted.org/packages/1e/c7/316e7ca04d26695ef0635dc81683d628350810eb8e9b2299fc08ba49f366/humanize-4.13.0-py3-none-any.whl", hash = "sha256:b810820b31891813b1673e8fec7f1ed3312061eab2f26e3fa192c393d11ed25f", size = 128869, upload-time = "2025-08-25T09:39:18.54Z" }, ] +[[package]] +name = "hypothesis" +version = "6.141.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.9.2' and python_full_version < '3.10'", + "python_full_version < '3.9.2'", +] +dependencies = [ + { name = "attrs", marker = "python_full_version < '3.10'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.10'" }, + { name = "sortedcontainers", marker = "python_full_version < '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/85/20/8aa62b3e69fea68bb30d35d50be5395c98979013acd8152d64dc927e4cdb/hypothesis-6.141.1.tar.gz", hash = "sha256:8ef356e1e18fbeaa8015aab3c805303b7fe4b868e5b506e87ad83c0bf951f46f", size = 467389, upload-time = "2025-10-15T19:12:25.262Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/9a/f901858f139694dd669776983781b08a7c1717911025da6720e526bd8ce3/hypothesis-6.141.1-py3-none-any.whl", hash = "sha256:a5b3c39c16d98b7b4c3c5c8d4262e511e3b2255e6814ced8023af49087ad60b3", size = 535000, upload-time = "2025-10-15T19:12:21.659Z" }, +] + +[[package]] +name = "hypothesis" +version = "6.142.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.13'", + "python_full_version == '3.12.*'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "attrs", marker = "python_full_version >= '3.10'" }, + { name = "exceptiongroup", marker = "python_full_version == '3.10.*'" }, + { name = "sortedcontainers", marker = "python_full_version >= '3.10'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/47/83/8f76d7c965beb4d3a65d188232c32db97b0799b0e893227d520d5d2a0144/hypothesis-6.142.2.tar.gz", hash = "sha256:c4204a2ce327e45fbaf83a2b58142a285135698dc1d08e368ae9901f06b49e64", size = 465987, upload-time = "2025-10-20T16:08:20.225Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/8f/194d63f715c7b0ace35b4f2a83b756d5bc703299b706c401b7ec593054fc/hypothesis-6.142.2-py3-none-any.whl", hash = "sha256:cc6c6e66c06aff695dd255501a767b528e00d84ce3572160425a9ba5e4a47845", size = 533375, upload-time = "2025-10-20T16:08:16.903Z" }, +] + [[package]] name = "identify" version = "2.6.15" @@ -3275,6 +3316,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "stack-data" version = "0.6.3" From c71d2dac81f46f8f786376abc0d6450af81d0b9a Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Fri, 24 Oct 2025 21:32:50 -0500 Subject: [PATCH 02/16] tidy up --- codeflash/code_utils/env_utils.py | 52 ++++++++++++++------ 
codeflash/verification/hypothesis_testing.py | 5 +- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/codeflash/code_utils/env_utils.py b/codeflash/code_utils/env_utils.py index 08b1fc0da..749c49676 100644 --- a/codeflash/code_utils/env_utils.py +++ b/codeflash/code_utils/env_utils.py @@ -2,6 +2,8 @@ import json import os +import shlex +import shutil import tempfile from functools import lru_cache from pathlib import Path @@ -14,21 +16,41 @@ def check_formatter_installed(formatter_cmds: list[str], exit_on_failure: bool = True) -> bool: # noqa - return_code = True - if formatter_cmds[0] == "disabled": - return return_code + if not formatter_cmds or formatter_cmds[0] == "disabled": + return True + + first_cmd = formatter_cmds[0] + cmd_tokens = shlex.split(first_cmd) if isinstance(first_cmd, str) else [first_cmd] + + if not cmd_tokens: + return True + + exe_name = cmd_tokens[0] + command_str = " ".join(formatter_cmds).replace(" $file", "") + + if shutil.which(exe_name) is None: + logger.error( + f"Could not find formatter: {command_str}\n" + f"Please install it or update 'formatter-cmds' in your codeflash configuration" + ) + return False + tmp_code = """print("hello world")""" - with tempfile.TemporaryDirectory() as tmpdir: - tmp_file = Path(tmpdir) / "test_codeflash_formatter.py" - tmp_file.write_text(tmp_code, encoding="utf-8") - try: - format_code(formatter_cmds, tmp_file, print_status=False, exit_on_failure=exit_on_failure) - except Exception: - exit_with_message( - "āš ļø Codeflash requires a code formatter to be installed in your environment, but none was found. 
Please install a supported formatter, verify the formatter-cmds in your codeflash pyproject.toml config and try again.", - error_on_exit=True, - ) - return return_code + try: + with tempfile.TemporaryDirectory() as tmpdir: + tmp_file = Path(tmpdir) / "test_codeflash_formatter.py" + tmp_file.write_text(tmp_code, encoding="utf-8") + format_code(formatter_cmds, tmp_file, print_status=False, exit_on_failure=False) + return True + except FileNotFoundError: + logger.error( + f"Could not find formatter: {command_str}\n" + f"Please install it or update 'formatter-cmds' in your codeflash configuration" + ) + return False + except Exception as e: + logger.error(f"Formatter failed to run: {command_str}\nError: {e}") + return False @lru_cache(maxsize=1) @@ -138,4 +160,4 @@ def is_ci() -> bool: def is_pr_draft() -> bool: """Check if the PR is draft. in the github action context.""" event = get_cached_gh_event_data() - return bool(event.get("pull_request", {}).get("draft", False)) + return bool(event.get("pull_request", {}).get("draft", False)) \ No newline at end of file diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index c716ca739..fe681a619 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -138,7 +138,7 @@ def generate_hypothesis_tests( qualified_function_path = get_qualified_function_path( function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name ) - logger.info(f"command: hypothesis write {function_to_optimize.file_path.stem}") + logger.info(f"command: hypothesis write {qualified_function_path}") hypothesis_result = subprocess.run( ["hypothesis", "write", qualified_function_path], @@ -182,8 +182,9 @@ def visit_FunctionDef(self, node): # noqa: ANN001, ANN202 unparsed = ast.unparse(modified_tree) hypothesis_test_suite_code = format_code( - make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)), + 
args.formatter_cmds, function_to_optimize.file_path, + optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)), ) with hypothesis_path.open("w", encoding="utf-8") as f: f.write(hypothesis_test_suite_code) From f6285263c4e57d22043f437f758f0e5e20a67cec Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 25 Oct 2025 00:09:19 -0500 Subject: [PATCH 03/16] Update hypothesis_testing.py --- codeflash/verification/hypothesis_testing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index fe681a619..0caf985c7 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -173,9 +173,9 @@ def generate_hypothesis_tests( class TestFunctionRemover(ast.NodeTransformer): def visit_FunctionDef(self, node): # noqa: ANN001, ANN202 - if function_to_optimize.function_name not in node.name: - return None - return node + if node.name.startswith("test_") and function_to_optimize.function_name in node.name: + return node + return None modified_tree = TestFunctionRemover().visit(tree) ast.fix_missing_locations(modified_tree) From bfe4179d0db14261f51d90ede2db487c75bbd71c Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 25 Oct 2025 00:46:32 -0500 Subject: [PATCH 04/16] Update hypothesis_testing.py --- codeflash/verification/hypothesis_testing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index 0caf985c7..986f37cd9 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -181,9 +181,11 @@ def visit_FunctionDef(self, node): # noqa: ANN001, ANN202 ast.fix_missing_locations(modified_tree) unparsed = ast.unparse(modified_tree) + console.print(f"modified src: {unparsed}") + hypothesis_test_suite_code = 
format_code( args.formatter_cmds, - function_to_optimize.file_path, + hypothesis_path, optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)), ) with hypothesis_path.open("w", encoding="utf-8") as f: From 7ee1ab1d12b284d504f62a7874170bf012f44087 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 25 Oct 2025 17:14:31 -0500 Subject: [PATCH 05/16] cleanup --- codeflash/verification/hypothesis_testing.py | 151 ++++++++++++++++--- 1 file changed, 128 insertions(+), 23 deletions(-) diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index 986f37cd9..ca3f3131b 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -63,8 +63,43 @@ def remove_functions_with_only_any_type(code_string: str) -> str: return ast.unparse(new_tree) +def filter_hypothesis_tests_by_function_name(code: str, function_name: str) -> str: + """Filter hypothesis tests to only include tests matching the function name. + + Preserves all imports, module-level assignments, and only test functions + that contain the target function name. 
+ + Args: + code: The hypothesis test code to filter + function_name: The name of the function being tested + + Returns: + Filtered code with only matching tests + """ + tree = ast.parse(code) + + class TestFunctionRemover(ast.NodeTransformer): + def visit_Module(self, node): # noqa: ANN001, ANN202 + # Filter body to keep imports, module-level assignments, and matching test functions + new_body = [] + for item in node.body: + if isinstance(item, (ast.Import, ast.ImportFrom, ast.Assign)): + # Keep all imports and module-level assignments + new_body.append(item) + elif isinstance(item, ast.FunctionDef): + # Only keep test functions that match the function name + if item.name.startswith("test_") and function_name in item.name: + new_body.append(item) + node.body = new_body + return node + + modified_tree = TestFunctionRemover().visit(tree) + ast.fix_missing_locations(modified_tree) + return ast.unparse(modified_tree) + + def make_hypothesis_tests_deterministic(code: str) -> str: - """Add @settings(derandomize=True) decorator to make Hypothesis tests deterministic.""" + """Add @settings(derandomize=True) decorator and constrain strategies to make Hypothesis tests deterministic.""" try: tree = ast.parse(code) except SyntaxError: @@ -80,26 +115,83 @@ def make_hypothesis_tests_deterministic(code: str) -> str: if not settings_imported: tree.body.insert(0, ast.parse("from hypothesis import settings").body[0]) + class StrategyConstrainer(ast.NodeTransformer): + def visit_Call(self, node: ast.Call) -> ast.Call: + self.generic_visit(node) + + # Check if this is a strategy call (st.floats(), st.integers(), etc.) 
+ if ( + isinstance(node.func, ast.Attribute) + and isinstance(node.func.value, ast.Name) + and node.func.value.id == "st" + ): + if node.func.attr == "floats" and not any( + k.arg in ["min_value", "max_value", "allow_nan", "allow_infinity"] + for k in node.keywords + ): + # Constrain floats to reasonable bounds + node.keywords.extend( + [ + ast.keyword( + arg="min_value", + value=ast.UnaryOp( + op=ast.USub(), operand=ast.Constant(value=1e6) + ), + ), + ast.keyword(arg="max_value", value=ast.Constant(value=1e6)), + ast.keyword( + arg="allow_nan", value=ast.Constant(value=False) + ), + ast.keyword( + arg="allow_infinity", value=ast.Constant(value=False) + ), + ] + ) + elif node.func.attr == "integers" and not any( + k.arg in ["min_value", "max_value"] for k in node.keywords + ): + # Constrain integers to reasonable bounds + node.keywords.extend( + [ + ast.keyword(arg="min_value", value=ast.Constant(value=0)), + ast.keyword( + arg="max_value", value=ast.Constant(value=10000) + ), + ] + ) + return node + + tree = StrategyConstrainer().visit(tree) + ast.fix_missing_locations(tree) + for node in ast.walk(tree): if isinstance(node, ast.FunctionDef): settings_decorator = next( ( d for d in node.decorator_list - if isinstance(d, ast.Call) and isinstance(d.func, ast.Name) and d.func.id == "settings" + if isinstance(d, ast.Call) + and isinstance(d.func, ast.Name) + and d.func.id == "settings" ), None, ) if settings_decorator: if not any(k.arg == "derandomize" for k in settings_decorator.keywords): - settings_decorator.keywords.append(ast.keyword(arg="derandomize", value=ast.Constant(value=True))) + settings_decorator.keywords.append( + ast.keyword(arg="derandomize", value=ast.Constant(value=True)) + ) else: node.decorator_list.append( ast.Call( func=ast.Name(id="settings", ctx=ast.Load()), args=[], - keywords=[ast.keyword(arg="derandomize", value=ast.Constant(value=True))], + keywords=[ + ast.keyword( + arg="derandomize", value=ast.Constant(value=True) + ) + ], ) ) @@ -107,7 
+199,10 @@ def make_hypothesis_tests_deterministic(code: str) -> str: def generate_hypothesis_tests( - test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST + test_cfg: TestConfig, + args: Namespace, + function_to_optimize: FunctionToOptimize, + function_to_optimize_ast: ast.AST, ) -> tuple[dict[str, list[FunctionCalledInTest]], str]: """Generate property-based tests using Hypothesis ghostwriter. @@ -128,7 +223,9 @@ def generate_hypothesis_tests( if ( test_cfg.project_root_path - and isinstance(function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef)) + and isinstance( + function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef) + ) and has_typed_parameters(function_to_optimize_ast, function_to_optimize.parents) ): logger.info("Generating Hypothesis tests for the original code…") @@ -136,7 +233,9 @@ def generate_hypothesis_tests( try: qualified_function_path = get_qualified_function_path( - function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name + function_to_optimize.file_path, + args.project_root, + function_to_optimize.qualified_name, ) logger.info(f"command: hypothesis write {qualified_function_path}") @@ -151,7 +250,9 @@ def generate_hypothesis_tests( except subprocess.TimeoutExpired: logger.debug("Hypothesis test generation timed out") end_time = time.perf_counter() - logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") + logger.debug( + f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds" + ) return function_to_hypothesis_tests, hypothesis_test_suite_code if hypothesis_result.returncode == 0: @@ -167,37 +268,39 @@ def generate_hypothesis_tests( test_framework=args.test_framework, pytest_cmd=args.pytest_cmd, ) - function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config) + function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = ( + 
discover_unit_tests(hypothesis_config) + ) with hypothesis_path.open("r", encoding="utf-8") as f: - tree = ast.parse(f.read()) - - class TestFunctionRemover(ast.NodeTransformer): - def visit_FunctionDef(self, node): # noqa: ANN001, ANN202 - if node.name.startswith("test_") and function_to_optimize.function_name in node.name: - return node - return None + original_code = f.read() - modified_tree = TestFunctionRemover().visit(tree) - ast.fix_missing_locations(modified_tree) - unparsed = ast.unparse(modified_tree) + unparsed = filter_hypothesis_tests_by_function_name( + original_code, function_to_optimize.function_name + ) console.print(f"modified src: {unparsed}") hypothesis_test_suite_code = format_code( args.formatter_cmds, hypothesis_path, - optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)), + optimized_code=make_hypothesis_tests_deterministic( + remove_functions_with_only_any_type(unparsed) + ), ) with hypothesis_path.open("w", encoding="utf-8") as f: f.write(hypothesis_test_suite_code) - function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests(hypothesis_config) + function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = ( + discover_unit_tests(hypothesis_config) + ) logger.info( f"Created {num_discovered_hypothesis_tests} " f"hypothesis unit test case{'s' if num_discovered_hypothesis_tests != 1 else ''} " ) console.rule() end_time = time.perf_counter() - logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds") + logger.debug( + f"Generated hypothesis tests in {end_time - start_time:.2f} seconds" + ) return function_to_hypothesis_tests, hypothesis_test_suite_code logger.debug( @@ -206,5 +309,7 @@ def visit_FunctionDef(self, node): # noqa: ANN001, ANN202 console.rule() end_time = time.perf_counter() - logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") + logger.debug( + f"Hypothesis test generation completed in 
{end_time - start_time:.2f} seconds" + ) return function_to_hypothesis_tests, hypothesis_test_suite_code From 99f095472e374aff659c8822bcd8c7d09e26b8f7 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 25 Oct 2025 20:22:35 -0500 Subject: [PATCH 06/16] lazy impl --- codeflash/discovery/discover_unit_tests.py | 150 ++++++++++++++++++- codeflash/verification/concolic_testing.py | 5 +- codeflash/verification/hypothesis_testing.py | 5 +- 3 files changed, 156 insertions(+), 4 deletions(-) diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py index d7c1c0a83..1f23eaf5b 100644 --- a/codeflash/discovery/discover_unit_tests.py +++ b/codeflash/discovery/discover_unit_tests.py @@ -9,6 +9,7 @@ import re import sqlite3 import subprocess +import sys import unittest from collections import defaultdict from pathlib import Path @@ -66,6 +67,75 @@ class TestFunction: FUNCTION_NAME_REGEX = re.compile(r"([^.]+)\.([a-zA-Z0-9_]+)$") +def _extract_dotted_call_name(node: ast.expr) -> str | None: + """Extract full dotted name from function call (e.g., 'src.math.computation.gcd_recursive').""" + parts = [] + current = node + while isinstance(current, ast.Attribute): + parts.insert(0, current.attr) + current = current.value + if isinstance(current, ast.Name): + parts.insert(0, current.id) + return ".".join(parts) if parts else None + return None + + +def _discover_calls_via_ast( + test_file: Path, test_functions: set[TestFunction], target_qualified_names: set[str] +) -> dict[str, list[tuple[TestFunction, CodePosition]]]: + try: + with test_file.open("r", encoding="utf-8") as f: + source = f.read() + tree = ast.parse(source, filename=str(test_file)) + except (SyntaxError, FileNotFoundError) as e: + logger.debug(f"AST parsing failed for {test_file}: {e}") + return {} + + import_map = {} # alias -> full_qualified_path + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + name = alias.asname or alias.name 
+ import_map[name] = alias.name + elif isinstance(node, ast.ImportFrom) and node.module: + for alias in node.names: + if alias.name != "*": + full_name = f"{node.module}.{alias.name}" + name = alias.asname or alias.name + import_map[name] = full_name + + test_funcs_by_name = {tf.function_name: tf for tf in test_functions} + + result = defaultdict(list) + + for node in ast.walk(tree): + if not isinstance(node, ast.FunctionDef) or node.name not in test_funcs_by_name: + continue + + test_func = test_funcs_by_name[node.name] + + for child in ast.walk(node): + if not isinstance(child, ast.Call): + continue + + call_name = _extract_dotted_call_name(child.func) + if not call_name: + continue + + if call_name in target_qualified_names: + result[call_name].append((test_func, CodePosition(line_no=child.lineno, col_no=child.col_offset))) + continue + + parts = call_name.split(".", 1) + if parts[0] in import_map: + resolved = f"{import_map[parts[0]]}.{parts[1]}" if len(parts) == 2 else import_map[parts[0]] + + if resolved in target_qualified_names: + result[resolved].append((test_func, CodePosition(line_no=child.lineno, col_no=child.col_offset))) + + return dict(result) + + class TestsCache: SCHEMA_VERSION = 1 # Increment this when schema changes @@ -489,6 +559,7 @@ def discover_tests_pytest( console.rule() else: logger.debug(f"Pytest collection exit code: {exitcode}") + if pytest_rootdir is not None: cfg.tests_project_rootdir = Path(pytest_rootdir) file_to_test_map: dict[Path, list[FunctionCalledInTest]] = defaultdict(list) @@ -511,6 +582,7 @@ def discover_tests_pytest( if discover_only_these_tests and test_obj.test_file not in discover_only_these_tests: continue file_to_test_map[test_obj.test_file].append(test_obj) + # Within these test files, find the project functions they are referring to and return their names/locations return process_test_files(file_to_test_map, cfg, functions_to_optimize) @@ -592,7 +664,9 @@ def process_test_files( test_framework = cfg.test_framework 
if functions_to_optimize: - target_function_names = {func.qualified_name for func in functions_to_optimize} + target_function_names = { + func.qualified_name_with_modules_from_root(project_root_path) for func in functions_to_optimize + } file_to_test_map = filter_test_files_by_imports(file_to_test_map, target_function_names) function_to_test_map = defaultdict(set) @@ -602,6 +676,7 @@ def process_test_files( tests_cache = TestsCache(project_root_path) logger.info("!lsp|Discovering tests and processing unit tests") + with test_files_progress_bar(total=len(file_to_test_map), description="Processing test files") as ( progress, task_id, @@ -702,6 +777,79 @@ def process_test_files( test_functions_by_name[func.function_name].append(func) test_function_names_set = set(test_functions_by_name.keys()) + + is_generated_test_file = ( + any( + tf.test_type in (TestType.HYPOTHESIS_TEST, TestType.CONCOLIC_COVERAGE_TEST) for tf in test_functions + ) + if test_functions + else any( + func.test_type in (TestType.HYPOTHESIS_TEST, TestType.CONCOLIC_COVERAGE_TEST) for func in functions + ) + ) + + # For generated tests, use AST-based discovery since Jedi often fails + if is_generated_test_file and functions_to_optimize: + logger.debug(f"Using AST-based discovery for generated test file: {test_file.name}") + target_qualified_names = { + func.qualified_name_with_modules_from_root(project_root_path) for func in functions_to_optimize + } + + if not test_functions: + logger.debug("Jedi found no functions, building test_functions from collected functions") + test_functions = { + TestFunction( + function_name=func.test_function, + test_class=func.test_class, + parameters=None, + test_type=func.test_type, + ) + for func in functions + } + + ast_results = _discover_calls_via_ast(test_file, test_functions, target_qualified_names) + + for qualified_name, matches in ast_results.items(): + for test_func, position in matches: + if test_func.parameters is not None: + if test_framework == "pytest": + 
scope_test_function = f"{test_func.function_name}[{test_func.parameters}]" + else: # unittest + scope_test_function = f"{test_func.function_name}_{test_func.parameters}" + else: + scope_test_function = test_func.function_name + + function_to_test_map[qualified_name].add( + FunctionCalledInTest( + tests_in_file=TestsInFile( + test_file=test_file, + test_class=test_func.test_class, + test_function=scope_test_function, + test_type=test_func.test_type, + ), + position=position, + ) + ) + tests_cache.insert_test( + file_path=str(test_file), + file_hash=file_hash, + qualified_name_with_modules_from_root=qualified_name, + function_name=test_func.function_name, + test_class=test_func.test_class or "", + test_function=scope_test_function, + test_type=test_func.test_type, + line_number=position.line_no, + col_number=position.col_no, + ) + + if test_func.test_type == TestType.REPLAY_TEST: + num_discovered_replay_tests += 1 + + num_discovered_tests += 1 + + progress.advance(task_id) + continue + relevant_names = [] names_with_full_name = [name for name in all_names if name.full_name is not None] diff --git a/codeflash/verification/concolic_testing.py b/codeflash/verification/concolic_testing.py index e17f5c01f..2190ba6f9 100644 --- a/codeflash/verification/concolic_testing.py +++ b/codeflash/verification/concolic_testing.py @@ -80,7 +80,10 @@ def generate_concolic_tests( test_framework=args.test_framework, pytest_cmd=args.pytest_cmd, ) - function_to_concolic_tests, num_discovered_concolic_tests, _ = discover_unit_tests(concolic_test_cfg) + file_to_funcs = {function_to_optimize.file_path: [function_to_optimize]} + function_to_concolic_tests, num_discovered_concolic_tests, _ = discover_unit_tests( + concolic_test_cfg, file_to_funcs_to_optimize=file_to_funcs + ) logger.info( f"Created {num_discovered_concolic_tests} " f"concolic unit test case{'s' if num_discovered_concolic_tests != 1 else ''} " diff --git a/codeflash/verification/hypothesis_testing.py 
b/codeflash/verification/hypothesis_testing.py index ca3f3131b..5e6a7863c 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -268,8 +268,9 @@ def generate_hypothesis_tests( test_framework=args.test_framework, pytest_cmd=args.pytest_cmd, ) + file_to_funcs = {function_to_optimize.file_path: [function_to_optimize]} function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = ( - discover_unit_tests(hypothesis_config) + discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs) ) with hypothesis_path.open("r", encoding="utf-8") as f: original_code = f.read() @@ -290,7 +291,7 @@ def generate_hypothesis_tests( with hypothesis_path.open("w", encoding="utf-8") as f: f.write(hypothesis_test_suite_code) function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = ( - discover_unit_tests(hypothesis_config) + discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs) ) logger.info( f"Created {num_discovered_hypothesis_tests} " From 572ac0e3133f98475ac3be4d9e52269c0a880e84 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 25 Oct 2025 20:58:22 -0500 Subject: [PATCH 07/16] check --- codeflash/verification/hypothesis_testing.py | 4 +- tests/test_hypothesis_testing.py | 158 +++++++++++++++++++ 2 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 tests/test_hypothesis_testing.py diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index 5e6a7863c..a39e3999e 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -150,10 +150,10 @@ def visit_Call(self, node: ast.Call) -> ast.Call: elif node.func.attr == "integers" and not any( k.arg in ["min_value", "max_value"] for k in node.keywords ): - # Constrain integers to reasonable bounds + # Constrain integers to reasonable bounds (including negatives) node.keywords.extend( [ - 
ast.keyword(arg="min_value", value=ast.Constant(value=0)), + ast.keyword(arg="min_value", value=ast.Constant(value=-10000)), ast.keyword( arg="max_value", value=ast.Constant(value=10000) ), diff --git a/tests/test_hypothesis_testing.py b/tests/test_hypothesis_testing.py new file mode 100644 index 000000000..49fff9515 --- /dev/null +++ b/tests/test_hypothesis_testing.py @@ -0,0 +1,158 @@ +"""Tests for hypothesis_testing.py functions.""" + +from codeflash.verification.hypothesis_testing import make_hypothesis_tests_deterministic + + +def test_adds_derandomize_decorator(): + """Test that @settings(derandomize=True) is added when missing.""" + src = """ +from hypothesis import given, strategies as st + +@given(x=st.integers()) +def test_x(x): + assert isinstance(x, int) +""" + out = make_hypothesis_tests_deterministic(src) + assert "@settings(derandomize=True)" in out or "settings(derandomize=True)" in out + + +def test_integers_constrained_with_negatives(): + """Test that st.integers() gets bounded to [-10000, 10000].""" + src = """from hypothesis import given, strategies as st +@given(x=st.integers()) +def t(x): + pass +""" + out = make_hypothesis_tests_deterministic(src) + # Remove spaces for easier checking + normalized = out.replace(" ", "").replace("\n", "") + assert "min_value=-10000" in normalized + assert "max_value=10000" in normalized + + +def test_floats_constrained_to_finite(): + """Test that st.floats() is constrained to finite values with bounds.""" + src = """from hypothesis import given, strategies as st +@given(x=st.floats()) +def t(x): + pass +""" + out = make_hypothesis_tests_deterministic(src) + normalized = out.replace(" ", "").replace("\n", "") + assert "allow_nan=False" in normalized + assert "allow_infinity=False" in normalized + assert "min_value=" in normalized and "max_value=" in normalized + + +def test_existing_constraints_not_overridden(): + """Test that existing constraints on strategies are preserved.""" + src = """from hypothesis 
import given, strategies as st, settings + +@settings(derandomize=True, max_examples=5) +@given(x=st.integers(min_value=-5, max_value=5)) +def t(x): + pass +""" + out = make_hypothesis_tests_deterministic(src) + # Should not add duplicate settings decorator + assert out.count("@settings") == 1 + # Should preserve original constraints + assert "min_value=-5" in out or "min_value= -5" in out + assert "max_value=5" in out or "max_value= 5" in out + # Should not add the default -10000/10000 bounds + assert "-10000" not in out + + +def test_existing_float_constraints_preserved(): + """Test that existing float constraints are not overridden.""" + src = """from hypothesis import given, strategies as st + +@given(y=st.floats(min_value=-1.0, max_value=1.0, allow_nan=False, allow_infinity=False)) +def t(y): + pass +""" + out = make_hypothesis_tests_deterministic(src) + assert "min_value=-1.0" in out or "min_value= -1.0" in out + assert "max_value=1.0" in out or "max_value= 1.0" in out + # Should not add the default 1e6 bounds + assert "1e6" not in out and "1000000" not in out + + +def test_idempotency(): + """Test that running the function twice produces the same result.""" + src = """from hypothesis import given, strategies as st + +@given(x=st.integers(), y=st.floats()) +def test_func(x, y): + pass +""" + out1 = make_hypothesis_tests_deterministic(src) + out2 = make_hypothesis_tests_deterministic(out1) + assert out1 == out2 + + +def test_multiple_strategies_handled(): + """Test that multiple strategies in one test are all constrained.""" + src = """from hypothesis import given, strategies as st + +@given(a=st.integers(), b=st.integers(), c=st.floats()) +def test_multi(a, b, c): + pass +""" + out = make_hypothesis_tests_deterministic(src) + normalized = out.replace(" ", "").replace("\n", "") + # All integers should be constrained + assert normalized.count("min_value=-10000") >= 2 + assert normalized.count("max_value=10000") >= 2 + # Float should be constrained + assert 
"allow_nan=False" in normalized + assert "allow_infinity=False" in normalized + + +def test_settings_import_added_if_missing(): + """Test that 'from hypothesis import settings' is added when needed.""" + src = """from hypothesis import given, strategies as st + +@given(x=st.integers()) +def test_x(x): + pass +""" + out = make_hypothesis_tests_deterministic(src) + # Should have settings import or settings in existing import + assert "settings" in out + + +def test_partial_constraints_completed(): + """Test that partial constraints are completed.""" + src = """from hypothesis import given, strategies as st + +@given(x=st.integers(min_value=100)) +def test_x(x): + pass +""" + out = make_hypothesis_tests_deterministic(src) + # Should keep the min_value=100 and not override + assert "min_value=100" in out or "min_value= 100" in out + # Should not add default bounds since min_value exists + assert "-10000" not in out + + +def test_syntax_error_returns_original(): + """Test that invalid Python syntax returns original code unchanged.""" + invalid_src = "this is not valid python @#$%" + out = make_hypothesis_tests_deterministic(invalid_src) + assert out == invalid_src + + +def test_no_hypothesis_code_unchanged(): + """Test that code without hypothesis is returned mostly unchanged.""" + src = """def regular_function(x): + return x * 2 + +def test_regular(): + assert regular_function(2) == 4 +""" + out = make_hypothesis_tests_deterministic(src) + # Should still parse and return valid code + assert "def regular_function" in out + assert "def test_regular" in out From 4866d82b1507b99b693b9eb4b5457319f926af47 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 25 Oct 2025 23:45:19 -0500 Subject: [PATCH 08/16] cleanup --- codeflash/verification/equivalence.py | 123 ++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py index 9d7f5ba2c..fe28cfaa6 100644 --- 
a/codeflash/verification/equivalence.py +++ b/codeflash/verification/equivalence.py @@ -1,4 +1,5 @@ import sys +from collections import defaultdict from codeflash.cli_cmds.console import logger from codeflash.models.models import TestResults, TestType, VerificationType @@ -14,14 +15,47 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR original_recursion_limit = sys.getrecursionlimit() if original_recursion_limit < INCREASED_RECURSION_LIMIT: sys.setrecursionlimit(INCREASED_RECURSION_LIMIT) # Increase recursion limit to avoid RecursionError + + # Separate Hypothesis tests from other test types for semantic comparison + # Hypothesis tests are always compared semantically (by test function, not example count) + original_hypothesis = [ + r for r in original_results.test_results if r.test_type == TestType.HYPOTHESIS_TEST and r.loop_index == 1 + ] + candidate_hypothesis = [ + r for r in candidate_results.test_results if r.test_type == TestType.HYPOTHESIS_TEST and r.loop_index == 1 + ] + + # Compare Hypothesis tests semantically if any are present + if original_hypothesis or candidate_hypothesis: + logger.debug( + f"Comparing Hypothesis tests: original={len(original_hypothesis)} examples, " + f"candidate={len(candidate_hypothesis)} examples" + ) + hypothesis_equal = _compare_hypothesis_tests_semantic(original_hypothesis, candidate_hypothesis) + if not hypothesis_equal: + logger.info("Hypothesis comparison failed") + sys.setrecursionlimit(original_recursion_limit) + return False + logger.debug("Hypothesis comparison passed") + test_ids_superset = original_results.get_all_unique_invocation_loop_ids().union( set(candidate_results.get_all_unique_invocation_loop_ids()) ) + logger.debug(f"Total test IDs in superset: {len(test_ids_superset)}") are_equal: bool = True did_all_timeout: bool = True for test_id in test_ids_superset: original_test_result = original_results.get_by_unique_invocation_loop_id(test_id) cdd_test_result = 
candidate_results.get_by_unique_invocation_loop_id(test_id) + + # Skip Hypothesis tests - already compared semantically above + if original_test_result and original_test_result.test_type == TestType.HYPOTHESIS_TEST: + did_all_timeout = False # Hypothesis tests are checked separately, not timed out + continue + if cdd_test_result and cdd_test_result.test_type == TestType.HYPOTHESIS_TEST: + did_all_timeout = False + continue + if cdd_test_result is not None and original_test_result is None: continue # If helper function instance_state verification is not present, that's ok. continue @@ -33,6 +67,11 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR continue if original_test_result is None or cdd_test_result is None: are_equal = False + logger.debug( + f"Test result mismatch: test_id={test_id}, " + f"original_present={original_test_result is not None}, " + f"candidate_present={cdd_test_result is not None}" + ) break did_all_timeout = did_all_timeout and original_test_result.timed_out if original_test_result.timed_out: @@ -80,5 +119,89 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR break sys.setrecursionlimit(original_recursion_limit) if did_all_timeout: + logger.debug("Comparison failed: all tests timed out") return False + logger.debug(f"Final comparison result: are_equal={are_equal}") return are_equal + + +def _compare_hypothesis_tests_semantic(original_hypothesis: list, candidate_hypothesis: list) -> bool: + """Compare Hypothesis tests by test function, not by example count. + + Hypothesis can generate different numbers of examples between runs due to: + - Timing differences + - Early stopping + - Shrinking behavior + - Performance differences + + What matters is whether the test functions themselves pass or fail, + not how many examples Hypothesis generated. 
+ """ + + # Group by test function (excluding loop index and iteration_id from comparison) + def get_test_key(test_result): + """Get unique key for a Hypothesis test function.""" + return ( + test_result.id.test_module_path, + test_result.id.test_class_name, + test_result.id.test_function_name, + test_result.id.function_getting_tested, + ) + + # Group original results by test function + original_by_func = defaultdict(list) + for result in original_hypothesis: + original_by_func[get_test_key(result)].append(result) + + # Group candidate results by test function + candidate_by_func = defaultdict(list) + for result in candidate_hypothesis: + candidate_by_func[get_test_key(result)].append(result) + + # Log summary statistics + orig_total_examples = sum(len(examples) for examples in original_by_func.values()) + cand_total_examples = sum(len(examples) for examples in candidate_by_func.values()) + + logger.debug( + f"Hypothesis comparison: Original={len(original_by_func)} test functions ({orig_total_examples} examples), " + f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)" + ) + + # Check if all test functions in original are present in candidate + missing_funcs = set(original_by_func.keys()) - set(candidate_by_func.keys()) + if missing_funcs: + logger.warning( + f"Hypothesis test functions missing in candidate: {len(missing_funcs)} functions. 
" + f"First missing: {missing_funcs.__iter__().__next__()}" + ) + return False + + # Compare each test function's results + for test_key in original_by_func: + if test_key not in candidate_by_func: + continue # Already handled above + + orig_examples = original_by_func[test_key] + cand_examples = candidate_by_func[test_key] + + # Check if any original example failed + orig_had_failure = any(not ex.did_pass for ex in orig_examples) + cand_had_failure = any(not ex.did_pass for ex in cand_examples) + + # If original had failures, candidate must also have failures (or be missing, already handled) + # If original passed, candidate must pass (but can have different example counts) + if orig_had_failure != cand_had_failure: + logger.debug( + f"Hypothesis test function behavior mismatch: {test_key} " + f"(original_failed={orig_had_failure}, candidate_failed={cand_had_failure})" + ) + return False + + if abs(len(orig_examples) - len(cand_examples)) > 10: + logger.info( + f"Hypothesis test '{test_key[2]}': example counts differ " + f"(original={len(orig_examples)}, candidate={len(cand_examples)}). " + f"This is expected when code performance changes." 
+ ) + + return True From b7faf816f31a266ceebd01568073a8a60a44c39d Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sun, 26 Oct 2025 01:05:22 -0500 Subject: [PATCH 09/16] modify equivalence for hypothesis tests --- codeflash/verification/equivalence.py | 4 +- codeflash/verification/hypothesis_testing.py | 81 ++++++-------------- 2 files changed, 25 insertions(+), 60 deletions(-) diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py index fe28cfaa6..efc222acb 100644 --- a/codeflash/verification/equivalence.py +++ b/codeflash/verification/equivalence.py @@ -2,7 +2,7 @@ from collections import defaultdict from codeflash.cli_cmds.console import logger -from codeflash.models.models import TestResults, TestType, VerificationType +from codeflash.models.models import FunctionTestInvocation, TestResults, TestType, VerificationType from codeflash.verification.comparator import comparator INCREASED_RECURSION_LIMIT = 5000 @@ -139,7 +139,7 @@ def _compare_hypothesis_tests_semantic(original_hypothesis: list, candidate_hypo """ # Group by test function (excluding loop index and iteration_id from comparison) - def get_test_key(test_result): + def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, str]: """Get unique key for a Hypothesis test function.""" return ( test_result.id.test_module_path, diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index a39e3999e..13007f7af 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -75,6 +75,7 @@ def filter_hypothesis_tests_by_function_name(code: str, function_name: str) -> s Returns: Filtered code with only matching tests + """ tree = ast.parse(code) @@ -86,10 +87,9 @@ def visit_Module(self, node): # noqa: ANN001, ANN202 if isinstance(item, (ast.Import, ast.ImportFrom, ast.Assign)): # Keep all imports and module-level assignments new_body.append(item) - elif 
isinstance(item, ast.FunctionDef): + elif isinstance(item, ast.FunctionDef) and item.name.startswith("test_") and function_name in item.name: # Only keep test functions that match the function name - if item.name.startswith("test_") and function_name in item.name: - new_body.append(item) + new_body.append(item) node.body = new_body return node @@ -126,25 +126,17 @@ def visit_Call(self, node: ast.Call) -> ast.Call: and node.func.value.id == "st" ): if node.func.attr == "floats" and not any( - k.arg in ["min_value", "max_value", "allow_nan", "allow_infinity"] - for k in node.keywords + k.arg in ["min_value", "max_value", "allow_nan", "allow_infinity"] for k in node.keywords ): # Constrain floats to reasonable bounds node.keywords.extend( [ ast.keyword( - arg="min_value", - value=ast.UnaryOp( - op=ast.USub(), operand=ast.Constant(value=1e6) - ), + arg="min_value", value=ast.UnaryOp(op=ast.USub(), operand=ast.Constant(value=1e6)) ), ast.keyword(arg="max_value", value=ast.Constant(value=1e6)), - ast.keyword( - arg="allow_nan", value=ast.Constant(value=False) - ), - ast.keyword( - arg="allow_infinity", value=ast.Constant(value=False) - ), + ast.keyword(arg="allow_nan", value=ast.Constant(value=False)), + ast.keyword(arg="allow_infinity", value=ast.Constant(value=False)), ] ) elif node.func.attr == "integers" and not any( @@ -154,9 +146,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call: node.keywords.extend( [ ast.keyword(arg="min_value", value=ast.Constant(value=-10000)), - ast.keyword( - arg="max_value", value=ast.Constant(value=10000) - ), + ast.keyword(arg="max_value", value=ast.Constant(value=10000)), ] ) return node @@ -170,28 +160,20 @@ def visit_Call(self, node: ast.Call) -> ast.Call: ( d for d in node.decorator_list - if isinstance(d, ast.Call) - and isinstance(d.func, ast.Name) - and d.func.id == "settings" + if isinstance(d, ast.Call) and isinstance(d.func, ast.Name) and d.func.id == "settings" ), None, ) if settings_decorator: if not any(k.arg ==
"derandomize" for k in settings_decorator.keywords): - settings_decorator.keywords.append( - ast.keyword(arg="derandomize", value=ast.Constant(value=True)) - ) + settings_decorator.keywords.append(ast.keyword(arg="derandomize", value=ast.Constant(value=True))) else: node.decorator_list.append( ast.Call( func=ast.Name(id="settings", ctx=ast.Load()), args=[], - keywords=[ - ast.keyword( - arg="derandomize", value=ast.Constant(value=True) - ) - ], + keywords=[ast.keyword(arg="derandomize", value=ast.Constant(value=True))], ) ) @@ -199,10 +181,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call: def generate_hypothesis_tests( - test_cfg: TestConfig, - args: Namespace, - function_to_optimize: FunctionToOptimize, - function_to_optimize_ast: ast.AST, + test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST ) -> tuple[dict[str, list[FunctionCalledInTest]], str]: """Generate property-based tests using Hypothesis ghostwriter. @@ -223,9 +202,7 @@ def generate_hypothesis_tests( if ( test_cfg.project_root_path - and isinstance( - function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef) - ) + and isinstance(function_to_optimize_ast, (ast.FunctionDef, ast.AsyncFunctionDef)) and has_typed_parameters(function_to_optimize_ast, function_to_optimize.parents) ): logger.info("Generating Hypothesis tests for the original code…") @@ -233,9 +210,7 @@ def generate_hypothesis_tests( try: qualified_function_path = get_qualified_function_path( - function_to_optimize.file_path, - args.project_root, - function_to_optimize.qualified_name, + function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name ) logger.info(f"command: hypothesis write {qualified_function_path}") @@ -250,9 +225,7 @@ def generate_hypothesis_tests( except subprocess.TimeoutExpired: logger.debug("Hypothesis test generation timed out") end_time = time.perf_counter() - logger.debug( - f"Hypothesis test generation completed in
{end_time - start_time:.2f} seconds" - ) + logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") return function_to_hypothesis_tests, hypothesis_test_suite_code if hypothesis_result.returncode == 0: @@ -269,29 +242,25 @@ def generate_hypothesis_tests( pytest_cmd=args.pytest_cmd, ) file_to_funcs = {function_to_optimize.file_path: [function_to_optimize]} - function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = ( - discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs) + function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests( + hypothesis_config, file_to_funcs_to_optimize=file_to_funcs ) with hypothesis_path.open("r", encoding="utf-8") as f: original_code = f.read() - unparsed = filter_hypothesis_tests_by_function_name( - original_code, function_to_optimize.function_name - ) + unparsed = filter_hypothesis_tests_by_function_name(original_code, function_to_optimize.function_name) console.print(f"modified src: {unparsed}") hypothesis_test_suite_code = format_code( args.formatter_cmds, hypothesis_path, - optimized_code=make_hypothesis_tests_deterministic( - remove_functions_with_only_any_type(unparsed) - ), + optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)), ) with hypothesis_path.open("w", encoding="utf-8") as f: f.write(hypothesis_test_suite_code) - function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = ( - discover_unit_tests(hypothesis_config, file_to_funcs_to_optimize=file_to_funcs) + function_to_hypothesis_tests, num_discovered_hypothesis_tests, _ = discover_unit_tests( + hypothesis_config, file_to_funcs_to_optimize=file_to_funcs ) logger.info( f"Created {num_discovered_hypothesis_tests} " @@ -299,9 +268,7 @@ def generate_hypothesis_tests( ) console.rule() end_time = time.perf_counter() - logger.debug( - f"Generated hypothesis tests in {end_time - start_time:.2f} seconds" - ) + 
logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds") return function_to_hypothesis_tests, hypothesis_test_suite_code logger.debug( @@ -310,7 +277,5 @@ def generate_hypothesis_tests( console.rule() end_time = time.perf_counter() - logger.debug( - f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds" - ) + logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") return function_to_hypothesis_tests, hypothesis_test_suite_code From dfb3927c9c81fc551e75c1c1101685e17e8fb504 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sun, 26 Oct 2025 01:44:38 -0500 Subject: [PATCH 10/16] fix: track and cleanup hypothesis test temp directories - Modified generate_hypothesis_tests() to return the temp directory Path - Added hypothesis_tests_dir tracking in FunctionOptimizer - Extended cleanup_generated_files() to remove hypothesis test directories - Added hypothesis_tests_dirs list in Optimizer to track all directories - Updated cleanup_temporary_paths() to cleanup hypothesis test directories - Ensures cleanup on success, errors, and KeyboardInterrupt - Changed temp dir prefix to 'codeflash_hypothesis_' for clarity --- codeflash/optimization/function_optimizer.py | 15 +++++++++++++-- codeflash/optimization/optimizer.py | 11 ++++++++++- codeflash/verification/hypothesis_testing.py | 14 ++++++++------ 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 99aeed6d2..9955778ec 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -240,6 +240,7 @@ def __init__( self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {} self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {} self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None + 
self.hypothesis_tests_dir: Path | None = None self.generate_and_instrument_tests_results: ( tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet] | None ) = None @@ -1147,7 +1148,11 @@ def generate_tests_and_optimizations( generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast ) future_hypothesis_tests = self.executor.submit( - generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast + generate_hypothesis_tests, + self.test_cfg, + self.args, + self.function_to_optimize, + self.function_to_optimize_ast, ) futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests] if run_experiment: @@ -1201,7 +1206,8 @@ def generate_tests_and_optimizations( logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}") return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}") function_to_concolic_tests, concolic_test_str = future_concolic_tests.result() - function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result() + function_to_hypothesis_tests, hypothesis_test_str, hypothesis_test_suite_dir = future_hypothesis_tests.result() + self.hypothesis_tests_dir = hypothesis_test_suite_dir count_tests = len(tests) if concolic_test_str: @@ -2051,7 +2057,12 @@ def cleanup_generated_files(self) -> None: paths_to_cleanup.append(test_file.instrumented_behavior_file_path) paths_to_cleanup.append(test_file.benchmarking_file_path) + # Add hypothesis test directory to cleanup + if self.hypothesis_tests_dir and self.hypothesis_tests_dir.exists(): + paths_to_cleanup.append(self.hypothesis_tests_dir) + cleanup_paths(paths_to_cleanup) + self.hypothesis_tests_dir = None def get_test_env( self, codeflash_loop_index: int, codeflash_test_iteration: int, codeflash_tracer_disable: int = 1 diff --git a/codeflash/optimization/optimizer.py 
b/codeflash/optimization/optimizer.py index c0e0b014b..a83d604d2 100644 --- a/codeflash/optimization/optimizer.py +++ b/codeflash/optimization/optimizer.py @@ -53,6 +53,7 @@ def __init__(self, args: Namespace) -> None: self.experiment_id = os.getenv("CODEFLASH_EXPERIMENT_ID", None) self.local_aiservice_client = LocalAiServiceClient() if self.experiment_id else None self.replay_tests_dir = None + self.hypothesis_tests_dirs: list[Path] = [] # Track all hypothesis test directories self.functions_checkpoint: CodeflashRunCheckpoint | None = None self.current_function_being_optimized: FunctionToOptimize | None = None # current only for the LSP self.current_function_optimizer: FunctionOptimizer | None = None @@ -337,6 +338,9 @@ def run(self) -> None: function_optimizer # needed to clean up from the outside of this function ) best_optimization = function_optimizer.optimize_function() + # Track hypothesis test directory for cleanup + if function_optimizer.hypothesis_tests_dir: + self.hypothesis_tests_dirs.append(function_optimizer.hypothesis_tests_dir) if self.functions_checkpoint: self.functions_checkpoint.add_function_to_checkpoint( function_to_optimize.qualified_name_with_modules_from_root(self.args.project_root) @@ -430,7 +434,12 @@ def cleanup_temporary_paths(self) -> None: if self.current_function_optimizer: self.current_function_optimizer.cleanup_generated_files() - cleanup_paths([self.test_cfg.concolic_test_root_dir, self.replay_tests_dir]) + + # Cleanup all temporary test directories + paths_to_cleanup = [self.test_cfg.concolic_test_root_dir, self.replay_tests_dir] + paths_to_cleanup.extend(self.hypothesis_tests_dirs) + cleanup_paths(paths_to_cleanup) + self.hypothesis_tests_dirs.clear() def worktree_mode(self) -> None: if self.current_worktree: diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index 13007f7af..e36c130ec 100644 --- a/codeflash/verification/hypothesis_testing.py +++ 
b/codeflash/verification/hypothesis_testing.py @@ -182,7 +182,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call: def generate_hypothesis_tests( test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST -) -> tuple[dict[str, list[FunctionCalledInTest]], str]: +) -> tuple[dict[str, list[FunctionCalledInTest]], str, Path | None]: """Generate property-based tests using Hypothesis ghostwriter. This function: @@ -193,12 +193,14 @@ def generate_hypothesis_tests( 5. Formats the tests with the project formatter Returns: - Tuple of (function_to_tests_map, test_suite_code) + Tuple of (function_to_tests_map, test_suite_code, hypothesis_test_suite_dir) + The hypothesis_test_suite_dir is None if no tests were generated. """ start_time = time.perf_counter() function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {} hypothesis_test_suite_code: str = "" + hypothesis_test_suite_dir: Path | None = None if ( test_cfg.project_root_path @@ -226,11 +228,11 @@ def generate_hypothesis_tests( logger.debug("Hypothesis test generation timed out") end_time = time.perf_counter() logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") - return function_to_hypothesis_tests, hypothesis_test_suite_code + return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir if hypothesis_result.returncode == 0: hypothesis_test_suite_code = hypothesis_result.stdout - hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root)) + hypothesis_test_suite_dir = Path(tempfile.mkdtemp(prefix="codeflash_hypothesis_", dir=test_cfg.tests_root)) hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py" hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8") @@ -269,7 +271,7 @@ def generate_hypothesis_tests( console.rule() end_time = time.perf_counter() logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds") - 
return function_to_hypothesis_tests, hypothesis_test_suite_code + return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir logger.debug( f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}" @@ -278,4 +280,4 @@ def generate_hypothesis_tests( end_time = time.perf_counter() logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") - return function_to_hypothesis_tests, hypothesis_test_suite_code + return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir From 19fc55750933aaec8e1710b804fe462ef329eaa2 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sun, 26 Oct 2025 02:08:40 -0500 Subject: [PATCH 11/16] cleanup strategies --- codeflash/optimization/function_optimizer.py | 14 ++++++++++++-- codeflash/optimization/optimizer.py | 9 ++++++++- codeflash/verification/equivalence.py | 18 ------------------ codeflash/verification/hypothesis_testing.py | 19 +++++++++---------- 4 files changed, 29 insertions(+), 31 deletions(-) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 99aeed6d2..61e0cddb9 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -240,6 +240,7 @@ def __init__( self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {} self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {} self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None + self.hypothesis_tests_dir: Path | None = None self.generate_and_instrument_tests_results: ( tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet] | None ) = None @@ -1147,7 +1148,11 @@ def generate_tests_and_optimizations( generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast ) 
future_hypothesis_tests = self.executor.submit( - generate_hypothesis_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast + generate_hypothesis_tests, + self.test_cfg, + self.args, + self.function_to_optimize, + self.function_to_optimize_ast, ) futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_hypothesis_tests] if run_experiment: @@ -1201,7 +1206,8 @@ def generate_tests_and_optimizations( logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}") return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}") function_to_concolic_tests, concolic_test_str = future_concolic_tests.result() - function_to_hypothesis_tests, hypothesis_test_str = future_hypothesis_tests.result() + function_to_hypothesis_tests, hypothesis_test_str, hypothesis_test_suite_dir = future_hypothesis_tests.result() + self.hypothesis_tests_dir = hypothesis_test_suite_dir count_tests = len(tests) if concolic_test_str: @@ -2051,7 +2057,11 @@ def cleanup_generated_files(self) -> None: paths_to_cleanup.append(test_file.instrumented_behavior_file_path) paths_to_cleanup.append(test_file.benchmarking_file_path) + if self.hypothesis_tests_dir and self.hypothesis_tests_dir.exists(): + paths_to_cleanup.append(self.hypothesis_tests_dir) + cleanup_paths(paths_to_cleanup) + self.hypothesis_tests_dir = None def get_test_env( self, codeflash_loop_index: int, codeflash_test_iteration: int, codeflash_tracer_disable: int = 1 diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py index c0e0b014b..398db1a47 100644 --- a/codeflash/optimization/optimizer.py +++ b/codeflash/optimization/optimizer.py @@ -53,6 +53,7 @@ def __init__(self, args: Namespace) -> None: self.experiment_id = os.getenv("CODEFLASH_EXPERIMENT_ID", None) self.local_aiservice_client = LocalAiServiceClient() if self.experiment_id else None self.replay_tests_dir = None + 
self.hypothesis_tests_dirs: list[Path] = [] # Track all hypothesis test directories self.functions_checkpoint: CodeflashRunCheckpoint | None = None self.current_function_being_optimized: FunctionToOptimize | None = None # current only for the LSP self.current_function_optimizer: FunctionOptimizer | None = None @@ -337,6 +338,8 @@ def run(self) -> None: function_optimizer # needed to clean up from the outside of this function ) best_optimization = function_optimizer.optimize_function() + if function_optimizer.hypothesis_tests_dir: + self.hypothesis_tests_dirs.append(function_optimizer.hypothesis_tests_dir) if self.functions_checkpoint: self.functions_checkpoint.add_function_to_checkpoint( function_to_optimize.qualified_name_with_modules_from_root(self.args.project_root) @@ -430,7 +433,11 @@ def cleanup_temporary_paths(self) -> None: if self.current_function_optimizer: self.current_function_optimizer.cleanup_generated_files() - cleanup_paths([self.test_cfg.concolic_test_root_dir, self.replay_tests_dir]) + + paths_to_cleanup = [self.test_cfg.concolic_test_root_dir, self.replay_tests_dir] + paths_to_cleanup.extend(self.hypothesis_tests_dirs) + cleanup_paths(paths_to_cleanup) + self.hypothesis_tests_dirs.clear() def worktree_mode(self) -> None: if self.current_worktree: diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py index efc222acb..66ed7e2b4 100644 --- a/codeflash/verification/equivalence.py +++ b/codeflash/verification/equivalence.py @@ -167,16 +167,6 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)" ) - # Check if all test functions in original are present in candidate - missing_funcs = set(original_by_func.keys()) - set(candidate_by_func.keys()) - if missing_funcs: - logger.warning( - f"Hypothesis test functions missing in candidate: {len(missing_funcs)} functions. 
" - f"First missing: {missing_funcs.__iter__().__next__()}" - ) - return False - - # Compare each test function's results for test_key in original_by_func: if test_key not in candidate_by_func: continue # Already handled above @@ -196,12 +186,4 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st f"(original_failed={orig_had_failure}, candidate_failed={cand_had_failure})" ) return False - - if abs(len(orig_examples) - len(cand_examples)) > 10: - logger.info( - f"Hypothesis test '{test_key[2]}': example counts differ " - f"(original={len(orig_examples)}, candidate={len(cand_examples)}). " - f"This is expected when code performance changes." - ) - return True diff --git a/codeflash/verification/hypothesis_testing.py b/codeflash/verification/hypothesis_testing.py index 13007f7af..9d213c4b4 100644 --- a/codeflash/verification/hypothesis_testing.py +++ b/codeflash/verification/hypothesis_testing.py @@ -182,7 +182,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call: def generate_hypothesis_tests( test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST -) -> tuple[dict[str, list[FunctionCalledInTest]], str]: +) -> tuple[dict[str, list[FunctionCalledInTest]], str, Path | None]: """Generate property-based tests using Hypothesis ghostwriter. This function: @@ -193,12 +193,14 @@ def generate_hypothesis_tests( 5. Formats the tests with the project formatter Returns: - Tuple of (function_to_tests_map, test_suite_code) + Tuple of (function_to_tests_map, test_suite_code, hypothesis_test_suite_dir) + The hypothesis_test_suite_dir is None if no tests were generated. 
""" start_time = time.perf_counter() function_to_hypothesis_tests: dict[str, list[FunctionCalledInTest]] = {} hypothesis_test_suite_code: str = "" + hypothesis_test_suite_dir: Path | None = None if ( test_cfg.project_root_path @@ -212,8 +214,6 @@ def generate_hypothesis_tests( qualified_function_path = get_qualified_function_path( function_to_optimize.file_path, args.project_root, function_to_optimize.qualified_name ) - logger.info(f"command: hypothesis write {qualified_function_path}") - hypothesis_result = subprocess.run( ["hypothesis", "write", qualified_function_path], capture_output=True, @@ -226,11 +226,11 @@ def generate_hypothesis_tests( logger.debug("Hypothesis test generation timed out") end_time = time.perf_counter() logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") - return function_to_hypothesis_tests, hypothesis_test_suite_code + return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir if hypothesis_result.returncode == 0: hypothesis_test_suite_code = hypothesis_result.stdout - hypothesis_test_suite_dir = Path(tempfile.mkdtemp(dir=test_cfg.tests_root)) + hypothesis_test_suite_dir = Path(tempfile.mkdtemp(prefix="codeflash_hypothesis_", dir=test_cfg.tests_root)) hypothesis_path = hypothesis_test_suite_dir / "test_hypothesis.py" hypothesis_path.write_text(hypothesis_test_suite_code, encoding="utf8") @@ -250,12 +250,11 @@ def generate_hypothesis_tests( unparsed = filter_hypothesis_tests_by_function_name(original_code, function_to_optimize.function_name) - console.print(f"modified src: {unparsed}") - hypothesis_test_suite_code = format_code( args.formatter_cmds, hypothesis_path, optimized_code=make_hypothesis_tests_deterministic(remove_functions_with_only_any_type(unparsed)), + print_status=False, ) with hypothesis_path.open("w", encoding="utf-8") as f: f.write(hypothesis_test_suite_code) @@ -269,7 +268,7 @@ def generate_hypothesis_tests( console.rule() end_time = 
time.perf_counter() logger.debug(f"Generated hypothesis tests in {end_time - start_time:.2f} seconds") - return function_to_hypothesis_tests, hypothesis_test_suite_code + return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir logger.debug( f"Error running hypothesis write {': ' + hypothesis_result.stderr if hypothesis_result.stderr else '.'}" @@ -278,4 +277,4 @@ def generate_hypothesis_tests( end_time = time.perf_counter() logger.debug(f"Hypothesis test generation completed in {end_time - start_time:.2f} seconds") - return function_to_hypothesis_tests, hypothesis_test_suite_code + return function_to_hypothesis_tests, hypothesis_test_suite_code, hypothesis_test_suite_dir From 51cfe7caca584ede6300fbf888e84f82bd492f10 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sun, 26 Oct 2025 15:06:57 -0500 Subject: [PATCH 12/16] formatting --- codeflash/code_utils/env_utils.py | 2 +- codeflash/discovery/discover_unit_tests.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/codeflash/code_utils/env_utils.py b/codeflash/code_utils/env_utils.py index 749c49676..4200edb7d 100644 --- a/codeflash/code_utils/env_utils.py +++ b/codeflash/code_utils/env_utils.py @@ -160,4 +160,4 @@ def is_ci() -> bool: def is_pr_draft() -> bool: """Check if the PR is draft. 
in the github action context.""" event = get_cached_gh_event_data() - return bool(event.get("pull_request", {}).get("draft", False)) \ No newline at end of file + return bool(event.get("pull_request", {}).get("draft", False)) diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py index 1f23eaf5b..ffd66c6b9 100644 --- a/codeflash/discovery/discover_unit_tests.py +++ b/codeflash/discovery/discover_unit_tests.py @@ -9,7 +9,6 @@ import re import sqlite3 import subprocess -import sys import unittest from collections import defaultdict from pathlib import Path From 6968ab391aa64b2d6721d67d552a0bb7f9b4ec00 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sun, 26 Oct 2025 15:12:08 -0500 Subject: [PATCH 13/16] exact tests --- tests/test_hypothesis_testing.py | 54 +++++++++++--------------------- 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/tests/test_hypothesis_testing.py b/tests/test_hypothesis_testing.py index 49fff9515..d44b2413a 100644 --- a/tests/test_hypothesis_testing.py +++ b/tests/test_hypothesis_testing.py @@ -12,8 +12,9 @@ def test_adds_derandomize_decorator(): def test_x(x): assert isinstance(x, int) """ + expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=-10000, max_value=10000))\n@settings(derandomize=True)\ndef test_x(x):\n assert isinstance(x, int)""" out = make_hypothesis_tests_deterministic(src) - assert "@settings(derandomize=True)" in out or "settings(derandomize=True)" in out + assert out == expected def test_integers_constrained_with_negatives(): @@ -23,11 +24,9 @@ def test_integers_constrained_with_negatives(): def t(x): pass """ + expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=-10000, max_value=10000))\n@settings(derandomize=True)\ndef t(x):\n pass""" out = make_hypothesis_tests_deterministic(src) - # Remove spaces for easier 
checking - normalized = out.replace(" ", "").replace("\n", "") - assert "min_value=-10000" in normalized - assert "max_value=10000" in normalized + assert out == expected def test_floats_constrained_to_finite(): @@ -37,11 +36,9 @@ def test_floats_constrained_to_finite(): def t(x): pass """ + expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.floats(min_value=-1000000.0, max_value=1000000.0, allow_nan=False, allow_infinity=False))\n@settings(derandomize=True)\ndef t(x):\n pass""" out = make_hypothesis_tests_deterministic(src) - normalized = out.replace(" ", "").replace("\n", "") - assert "allow_nan=False" in normalized - assert "allow_infinity=False" in normalized - assert "min_value=" in normalized and "max_value=" in normalized + assert out == expected def test_existing_constraints_not_overridden(): @@ -53,14 +50,9 @@ def test_existing_constraints_not_overridden(): def t(x): pass """ + expected = """from hypothesis import given, strategies as st, settings\n\n@settings(derandomize=True, max_examples=5)\n@given(x=st.integers(min_value=-5, max_value=5))\ndef t(x):\n pass""" out = make_hypothesis_tests_deterministic(src) - # Should not add duplicate settings decorator - assert out.count("@settings") == 1 - # Should preserve original constraints - assert "min_value=-5" in out or "min_value= -5" in out - assert "max_value=5" in out or "max_value= 5" in out - # Should not add the default -10000/10000 bounds - assert "-10000" not in out + assert out == expected def test_existing_float_constraints_preserved(): @@ -71,11 +63,9 @@ def test_existing_float_constraints_preserved(): def t(y): pass """ + expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(y=st.floats(min_value=-1.0, max_value=1.0, allow_nan=False, allow_infinity=False))\n@settings(derandomize=True)\ndef t(y):\n pass""" out = make_hypothesis_tests_deterministic(src) - assert "min_value=-1.0" in out or 
"min_value= -1.0" in out - assert "max_value=1.0" in out or "max_value= 1.0" in out - # Should not add the default 1e6 bounds - assert "1e6" not in out and "1000000" not in out + assert out == expected def test_idempotency(): @@ -99,14 +89,9 @@ def test_multiple_strategies_handled(): def test_multi(a, b, c): pass """ + expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(a=st.integers(min_value=-10000, max_value=10000), b=st.integers(min_value=-10000, max_value=10000), c=st.floats(min_value=-1000000.0, max_value=1000000.0, allow_nan=False, allow_infinity=False))\n@settings(derandomize=True)\ndef test_multi(a, b, c):\n pass""" out = make_hypothesis_tests_deterministic(src) - normalized = out.replace(" ", "").replace("\n", "") - # All integers should be constrained - assert normalized.count("min_value=-10000") >= 2 - assert normalized.count("max_value=10000") >= 2 - # Float should be constrained - assert "allow_nan=False" in normalized - assert "allow_infinity=False" in normalized + assert out == expected def test_settings_import_added_if_missing(): @@ -117,9 +102,9 @@ def test_settings_import_added_if_missing(): def test_x(x): pass """ + expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=-10000, max_value=10000))\n@settings(derandomize=True)\ndef test_x(x):\n pass""" out = make_hypothesis_tests_deterministic(src) - # Should have settings import or settings in existing import - assert "settings" in out + assert out == expected def test_partial_constraints_completed(): @@ -130,11 +115,9 @@ def test_partial_constraints_completed(): def test_x(x): pass """ + expected = """from hypothesis import settings\nfrom hypothesis import given, strategies as st\n\n@given(x=st.integers(min_value=100))\n@settings(derandomize=True)\ndef test_x(x):\n pass""" out = make_hypothesis_tests_deterministic(src) - # Should keep the min_value=100 and not override - 
assert "min_value=100" in out or "min_value= 100" in out - # Should not add default bounds since min_value exists - assert "-10000" not in out + assert out == expected def test_syntax_error_returns_original(): @@ -152,7 +135,6 @@ def test_no_hypothesis_code_unchanged(): def test_regular(): assert regular_function(2) == 4 """ + expected = """from hypothesis import settings\n\n@settings(derandomize=True)\ndef regular_function(x):\n return x * 2\n\n@settings(derandomize=True)\ndef test_regular():\n assert regular_function(2) == 4""" out = make_hypothesis_tests_deterministic(src) - # Should still parse and return valid code - assert "def regular_function" in out - assert "def test_regular" in out + assert out == expected From c9f64830c394fb6b6fe831a2d916d2340919ac9d Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sun, 26 Oct 2025 20:37:44 +0000 Subject: [PATCH 14/16] Optimize _compare_hypothesis_tests_semantic The optimized code achieves a **32% speedup** by eliminating redundant data structures and reducing iteration overhead through two key optimizations: **1. Single-pass aggregation instead of list accumulation:** - **Original**: Uses `defaultdict(list)` to collect all `FunctionTestInvocation` objects per test function, then later iterates through these lists to compute failure flags with `any(not ex.did_pass for ex in orig_examples)` - **Optimized**: Uses plain dicts with 2-element lists `[count, had_failure]` to track both example count and failure status in a single pass, eliminating the need to store individual test objects or re-scan them **2. 
Reduced memory allocation and access patterns:** - **Original**: Creates and stores complete lists of test objects (up to 9,458 objects in large test cases), then performs expensive `any()` operations over these lists - **Optimized**: Uses compact 2-item lists per test function, avoiding object accumulation and expensive linear scans The line profiler shows the key performance gains: - Lines with `any(not ex.did_pass...)` in original (10.1% and 10.2% of total time) are completely eliminated - The `setdefault()` operations replace the more expensive `defaultdict(list).append()` calls - Overall reduction from storing ~9,458 objects to just tracking summary statistics **Best performance gains** occur in test cases with: - **Large numbers of examples per test function** (up to 105% faster for `test_large_scale_all_fail`) - **Many distinct test functions** (up to 75% faster for `test_large_scale_some_failures`) - **Mixed pass/fail scenarios** where the original's `any()` operations were most expensive The optimization maintains identical behavior while dramatically reducing both memory usage and computational complexity from O(examples) to O(1) per test function group. --- codeflash/verification/equivalence.py | 45 +++++++++++++-------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py index 66ed7e2b4..1bcf4e47e 100644 --- a/codeflash/verification/equivalence.py +++ b/codeflash/verification/equivalence.py @@ -1,5 +1,4 @@ import sys -from collections import defaultdict from codeflash.cli_cmds.console import logger from codeflash.models.models import FunctionTestInvocation, TestResults, TestType, VerificationType @@ -138,7 +137,6 @@ def _compare_hypothesis_tests_semantic(original_hypothesis: list, candidate_hypo not how many examples Hypothesis generated. 
""" - # Group by test function (excluding loop index and iteration_id from comparison) def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, str]: """Get unique key for a Hypothesis test function.""" return ( @@ -148,38 +146,39 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st test_result.id.function_getting_tested, ) - # Group original results by test function - original_by_func = defaultdict(list) + # Group by test function and simultaneously collect failure flag and example count + orig_by_func = {} for result in original_hypothesis: - original_by_func[get_test_key(result)].append(result) + test_key = get_test_key(result) + group = orig_by_func.setdefault(test_key, [0, False]) # [count, had_failure] + group[0] += 1 + if not result.did_pass: + group[1] = True - # Group candidate results by test function - candidate_by_func = defaultdict(list) + cand_by_func = {} for result in candidate_hypothesis: - candidate_by_func[get_test_key(result)].append(result) + test_key = get_test_key(result) + group = cand_by_func.setdefault(test_key, [0, False]) # [count, had_failure] + group[0] += 1 + if not result.did_pass: + group[1] = True - # Log summary statistics - orig_total_examples = sum(len(examples) for examples in original_by_func.values()) - cand_total_examples = sum(len(examples) for examples in candidate_by_func.values()) + orig_total_examples = sum(group[0] for group in orig_by_func.values()) + cand_total_examples = sum(group[0] for group in cand_by_func.values()) logger.debug( - f"Hypothesis comparison: Original={len(original_by_func)} test functions ({orig_total_examples} examples), " - f"Candidate={len(candidate_by_func)} test functions ({cand_total_examples} examples)" + f"Hypothesis comparison: Original={len(orig_by_func)} test functions ({orig_total_examples} examples), " + f"Candidate={len(cand_by_func)} test functions ({cand_total_examples} examples)" ) - for test_key in original_by_func: - if test_key 
not in candidate_by_func: + # Compare only for test_keys present in original + for test_key, (orig_count, orig_had_failure) in orig_by_func.items(): + cand_group = cand_by_func.get(test_key) + if cand_group is None: continue # Already handled above - orig_examples = original_by_func[test_key] - cand_examples = candidate_by_func[test_key] + cand_had_failure = cand_group[1] - # Check if any original example failed - orig_had_failure = any(not ex.did_pass for ex in orig_examples) - cand_had_failure = any(not ex.did_pass for ex in cand_examples) - - # If original had failures, candidate must also have failures (or be missing, already handled) - # If original passed, candidate must pass (but can have different example counts) if orig_had_failure != cand_had_failure: logger.debug( f"Hypothesis test function behavior mismatch: {test_key} " From 8fb7c1e56547de98826928b00e1f5b857f5ee2b9 Mon Sep 17 00:00:00 2001 From: Kevin Turcios <106575910+KRRT7@users.noreply.github.com> Date: Sun, 26 Oct 2025 17:49:54 -0500 Subject: [PATCH 15/16] Update codeflash/discovery/discover_unit_tests.py Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com> --- codeflash/discovery/discover_unit_tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py index ffd66c6b9..55cc9f33c 100644 --- a/codeflash/discovery/discover_unit_tests.py +++ b/codeflash/discovery/discover_unit_tests.py @@ -71,10 +71,11 @@ def _extract_dotted_call_name(node: ast.expr) -> str | None: parts = [] current = node while isinstance(current, ast.Attribute): - parts.insert(0, current.attr) + parts.append(current.attr) current = current.value if isinstance(current, ast.Name): - parts.insert(0, current.id) + parts.append(current.id) + parts.reverse() return ".".join(parts) if parts else None return None From 01a189aab18fce2323a4b82d4d5a226b27159c49 Mon Sep 17 00:00:00 2001 From: Kevin 
Turcios Date: Sun, 26 Oct 2025 17:51:24 -0500 Subject: [PATCH 16/16] fix linter --- codeflash/verification/equivalence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py index 1bcf4e47e..89b0d9b6a 100644 --- a/codeflash/verification/equivalence.py +++ b/codeflash/verification/equivalence.py @@ -172,7 +172,7 @@ def get_test_key(test_result: FunctionTestInvocation) -> tuple[str, str, str, st ) # Compare only for test_keys present in original - for test_key, (orig_count, orig_had_failure) in orig_by_func.items(): + for test_key, (_orig_count, orig_had_failure) in orig_by_func.items(): cand_group = cand_by_func.get(test_key) if cand_group is None: continue # Already handled above