diff --git a/code_to_optimize/bubble_sort.py b/code_to_optimize/bubble_sort.py index b18994494..787cc4a90 100644 --- a/code_to_optimize/bubble_sort.py +++ b/code_to_optimize/bubble_sort.py @@ -1,8 +1,10 @@ def sorter(arr): + print("codeflash stdout: Sorting list") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print(f"result: {arr}") return arr \ No newline at end of file diff --git a/code_to_optimize/bubble_sort_method.py b/code_to_optimize/bubble_sort_method.py index 3928e41fb..962fde339 100644 --- a/code_to_optimize/bubble_sort_method.py +++ b/code_to_optimize/bubble_sort_method.py @@ -1,12 +1,17 @@ +import sys + + class BubbleSorter: def __init__(self, x=0): self.x = x def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py index 48b69e710..c3f19df02 100644 --- a/codeflash/verification/equivalence.py +++ b/codeflash/verification/equivalence.py @@ -1,6 +1,7 @@ +import difflib import sys -from codeflash.cli_cmds.console import logger +from codeflash.cli_cmds.console import console, logger from codeflash.verification.comparator import comparator from codeflash.verification.test_results import TestResults, TestType, VerificationType @@ -61,6 +62,12 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR cdd_test_result.return_value, ) break + if (original_test_result.stdout and cdd_test_result.stdout) and not comparator( + original_test_result.stdout, cdd_test_result.stdout + ): + are_equal = False + break + if original_test_result.test_type in [TestType.EXISTING_UNIT_TEST, TestType.CONCOLIC_COVERAGE_TEST] and ( cdd_test_result.did_pass != original_test_result.did_pass ): diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py index e69ee78d1..fa565b5b9 100644 --- a/codeflash/verification/parse_test_output.py +++ b/codeflash/verification/parse_test_output.py @@ -42,6 +42,11 @@ def parse_func(file_path: Path) -> XMLParser: return parse(file_path, xml_parser) +matches_re = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!") +cleaner_re = re.compile(r"!######.*?######!|-+\s*Captured\s+(Log|Out)\s*-+\n?") + + + def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults: test_results = TestResults() if not file_location.exists(): @@ -259,7 +264,13 @@ def parse_test_xml( message = testcase.result[0].message.lower() if "timed out" in message: timed_out = True - matches = re.findall(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!", testcase.system_out or "") + + sys_stdout = testcase.system_out or "" + matches = matches_re.findall(sys_stdout) + + if sys_stdout: + sys_stdout = cleaner_re.sub("", sys_stdout).strip() + if not matches or not len(matches): test_results.add( FunctionTestInvocation( @@ -278,6 +289,7 @@ def parse_test_xml( test_type=test_type, return_value=None, timed_out=timed_out, + stdout=sys_stdout, ) ) @@ -306,6 +318,7 @@ def parse_test_xml( test_type=test_type, return_value=None, timed_out=timed_out, + stdout=sys_stdout, ) ) @@ -393,6 +406,7 @@ def merge_test_results( verification_type=VerificationType(result_bin.verification_type) if result_bin.verification_type else None, + stdout=xml_result.stdout, ) ) elif xml_results.test_results[0].id.iteration_id is not None: @@ -422,6 +436,7 @@ def merge_test_results( verification_type=VerificationType(bin_result.verification_type) if bin_result.verification_type else None, + stdout=xml_result.stdout, ) ) else: @@ -448,6 +463,7 @@ def merge_test_results( verification_type=VerificationType(bin_result.verification_type) if bin_result.verification_type else None, + stdout=xml_result.stdout, ) ) diff --git a/codeflash/verification/test_results.py b/codeflash/verification/test_results.py index a4ecea816..c7a210a6a 100644 --- a/codeflash/verification/test_results.py +++ b/codeflash/verification/test_results.py @@ -93,6 +93,7 @@ class FunctionTestInvocation: return_value: Optional[object] # The return value of the function invocation timed_out: Optional[bool] verification_type: Optional[str] = VerificationType.FUNCTION_CALL + stdout: Optional[str] = None @property def unique_invocation_loop_id(self) -> str: diff --git a/pyproject.toml b/pyproject.toml index 1a5e63f8e..2e71f2a0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,7 +119,6 @@ types-gevent = "^24.11.0.20241230" types-greenlet = "^3.1.0.20241221" types-pexpect = "^4.9.0.20241208" types-unidiff = "^0.7.0.20240505" -sqlalchemy = "^2.0.38" uv = ">=0.6.2" [tool.poetry.build] @@ -178,8 +177,7 @@ ignore = [ "TD003", "TD004", "PLR2004", - "UP007", - "N802", # we use a lot of stdlib which follows this convention + "UP007" # remove once we drop 3.9 support. ] [tool.ruff.lint.flake8-type-checking] diff --git a/tests/scripts/end_to_end_test_bubblesort_pytest.py b/tests/scripts/end_to_end_test_bubblesort_pytest.py index 08fe3117f..d714703aa 100644 --- a/tests/scripts/end_to_end_test_bubblesort_pytest.py +++ b/tests/scripts/end_to_end_test_bubblesort_pytest.py @@ -11,11 +11,15 @@ def run_test(expected_improvement_pct: int) -> bool: test_framework="pytest", min_improvement_x=1.0, coverage_expectations=[ - CoverageExpectation(function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8]) + CoverageExpectation( + function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8, 9, 10] + ) ], ) cwd = (pathlib.Path(__file__).parent.parent.parent / "code_to_optimize").resolve() - return run_codeflash_command(cwd, config, expected_improvement_pct) + return run_codeflash_command( + cwd, config, expected_improvement_pct, ['print("codeflash stdout: Sorting list")', 'print(f"result: {arr}")'] + ) if __name__ == "__main__": diff --git a/tests/scripts/end_to_end_test_utilities.py b/tests/scripts/end_to_end_test_utilities.py index 891ed29f4..23a67a84a 100644 --- a/tests/scripts/end_to_end_test_utilities.py +++ b/tests/scripts/end_to_end_test_utilities.py @@ -63,19 +63,21 @@ def validate_coverage(stdout: str, expectations: list[CoverageExpectation]) -> b assert coverage_match, f"Failed to find coverage data for {expect.function_name}" coverage = float(coverage_match.group(1)) - assert ( - coverage == expect.expected_coverage - ), f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}" + assert coverage == expect.expected_coverage, ( + f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}" + ) executed_lines = list(map(int, coverage_match.group(2).split(", "))) - assert ( - executed_lines == expect.expected_lines - ), f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}" + assert executed_lines == expect.expected_lines, ( + f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}" + ) return True -def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool: +def run_codeflash_command( + cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int, expected_in_stdout: list[str] = None +) -> bool: logging.basicConfig(level=logging.INFO) if config.trace_mode: return run_trace_test(cwd, config, expected_improvement_pct) @@ -97,12 +99,21 @@ def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improv return_code = process.wait() stdout = "".join(output) - if not validate_output(stdout, return_code, expected_improvement_pct, config): + validated = validate_output(stdout, return_code, expected_improvement_pct, config) + if not validated: # Write original file contents back to file path_to_file.write_text(file_contents, "utf-8") logging.info("Codeflash run did not meet expected requirements for testing, reverting file changes.") return False - return True + + if expected_in_stdout: + stdout_validated = validate_stdout_in_candidate(stdout, expected_in_stdout) + if not stdout_validated: + logging.error("Failed to find expected output in candidate output") + validated = False + logging.info(f"Success: Expected output found in candidate output") + + return validated def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path) -> list[str]: @@ -164,6 +175,11 @@ def validate_output(stdout: str, return_code: int, expected_improvement_pct: int return True +def validate_stdout_in_candidate(stdout: str, expected_in_stdout: list[str]) -> bool: + candidate_output = stdout[stdout.find("INFO Best candidate") : stdout.find("Best Candidate Explanation")] + return all(expected in candidate_output for expected in expected_in_stdout) + + def run_trace_test(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool: # First command: Run the tracer test_root = cwd / "tests" / (config.test_framework or "") diff --git a/tests/test_codeflash_capture.py b/tests/test_codeflash_capture.py index 83b1efd2b..8e5e237cf 100644 --- a/tests/test_codeflash_capture.py +++ b/tests/test_codeflash_capture.py @@ -485,7 +485,6 @@ def __init__(self, x=2): assert test_results[1].id.test_module_path == "code_to_optimize.tests.pytest.test_codeflash_capture_temp" assert test_results[1].id.function_getting_tested == "some_function" assert test_results[1].id.iteration_id == "11_0" - assert test_results[2].did_pass assert test_results[2].return_value[0]["x"] == 2 assert test_results[2].id.test_function_name == "test_example_test_3" @@ -494,6 +493,17 @@ def __init__(self, x=2): assert test_results[2].id.function_getting_tested == "some_function" assert test_results[2].id.iteration_id == "16_0" + test_results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + assert compare_test_results(test_results, test_results2) + finally: test_path.unlink(missing_ok=True) sample_code_path.unlink(missing_ok=True) @@ -605,6 +615,18 @@ def __init__(self, *args, **kwargs): assert test_results[2].id.function_getting_tested == "some_function" assert test_results[2].id.iteration_id == "16_0" + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, results2) + finally: test_path.unlink(missing_ok=True) sample_code_path.unlink(missing_ok=True) @@ -720,6 +742,17 @@ def __init__(self, x=2): assert test_results[2].id.function_getting_tested == "some_function" assert test_results[2].id.iteration_id == "12_2" # Third call + test_results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, test_results2) finally: test_path.unlink(missing_ok=True) sample_code_path.unlink(missing_ok=True) @@ -856,6 +889,18 @@ def another_helper(self): assert test_results[3].id.function_getting_tested == "AnotherHelperClass.__init__" assert test_results[3].verification_type == VerificationType.INIT_STATE_HELPER + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, results2) + finally: test_path.unlink(missing_ok=True) fto_file_path.unlink(missing_ok=True) diff --git a/tests/test_instrument_all_and_run.py b/tests/test_instrument_all_and_run.py index 643d4bde7..ce06c855a 100644 --- a/tests/test_instrument_all_and_run.py +++ b/tests/test_instrument_all_and_run.py @@ -168,6 +168,13 @@ def test_sort(): pytest_max_loops=1, testing_time=0.1, ) + + out_str = """codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == test_results[0].stdout assert test_results[0].id.function_getting_tested == "sorter" assert test_results[0].id.iteration_id == "1_0" assert test_results[0].id.test_class_name is None @@ -179,6 +186,7 @@ def test_sort(): assert test_results[0].runtime > 0 assert test_results[0].did_pass assert test_results[0].return_value == ([0, 1, 2, 3, 4, 5],) + assert out_str == test_results[1].stdout.strip() assert test_results[1].id.function_getting_tested == "sorter" assert test_results[1].id.iteration_id == "4_0" @@ -190,6 +198,22 @@ def test_sort(): ) assert test_results[1].runtime > 0 assert test_results[1].did_pass + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + out_str = """codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == results2[0].stdout.strip() + assert compare_test_results(test_results, results2) finally: fto_path.write_text(original_code, "utf-8") test_path.unlink(missing_ok=True) @@ -340,13 +364,11 @@ def test_sort(): pytest_max_loops=1, testing_time=0.1, ) - assert len(test_results) == 4 assert test_results[0].id.function_getting_tested == "BubbleSorter.__init__" assert test_results[0].id.test_function_name == "test_sort" assert test_results[0].did_pass assert test_results[0].return_value[0] == {"x": 0} - assert test_results[1].id.function_getting_tested == "BubbleSorter.sorter" assert test_results[1].id.iteration_id == "2_0" assert test_results[1].id.test_class_name is None @@ -358,7 +380,9 @@ def test_sort(): assert test_results[1].runtime > 0 assert test_results[1].did_pass assert test_results[1].return_value == ([0, 1, 2, 3, 4, 5],) - + out_str = """codeflash stdout : BubbleSorter.sorter() called\n\n\ncodeflash stdout : BubbleSorter.sorter() called""" + assert test_results[1].stdout == out_str + assert compare_test_results(test_results, test_results) assert test_results[2].id.function_getting_tested == "BubbleSorter.__init__" assert test_results[2].id.test_function_name == "test_sort" assert test_results[2].did_pass @@ -375,6 +399,18 @@ def test_sort(): assert test_results[3].runtime > 0 assert test_results[3].did_pass + results2, _ = func_optimizer.run_and_parse_tests( + testing_type=TestingMode.BEHAVIOR, + test_env=test_env, + test_files=func_optimizer.test_files, + optimization_iteration=0, + pytest_min_loops=1, + pytest_max_loops=1, + testing_time=0.1, + ) + + assert compare_test_results(test_results, results2) + # Replace with optimized code that mutated instance attribute optimized_code = """ class BubbleSorter: diff --git a/tests/test_instrument_tests.py b/tests/test_instrument_tests.py index bf7373522..79f4bc5dd 100644 --- a/tests/test_instrument_tests.py +++ b/tests/test_instrument_tests.py @@ -482,6 +482,13 @@ def test_sort(): ) assert test_results_perf[1].runtime > 0 assert test_results_perf[1].did_pass + out_str = """codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == test_results_perf[1].stdout + finally: test_path.unlink(missing_ok=True) test_path_perf.unlink(missing_ok=True) @@ -693,6 +700,10 @@ def test_sort_parametrized(input, expected_output): assert test_results_perf[1].runtime > 0 assert test_results_perf[1].did_pass + out_str = """codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]""" + assert out_str == test_results_perf[1].stdout + assert test_results_perf[2].id.function_getting_tested == "sorter" assert test_results_perf[2].id.iteration_id == "0_2" assert test_results_perf[2].id.test_class_name is None @@ -1230,7 +1241,15 @@ def test_sort(): assert test_results[0].runtime > 0 assert test_results[0].did_pass assert test_results[0].return_value is None + out_str = """codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5] + +codeflash stdout: Sorting list +result: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] +codeflash stdout: Sorting list +result: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]""" + assert test_results[1].stdout == out_str assert test_results[1].id.function_getting_tested == "sorter" assert test_results[1].id.iteration_id == "2_2_1" assert test_results[1].id.test_class_name is None diff --git a/tests/test_instrumentation_run_results_aiservice.py b/tests/test_instrumentation_run_results_aiservice.py index a35556dfd..888c629ef 100644 --- a/tests/test_instrumentation_run_results_aiservice.py +++ b/tests/test_instrumentation_run_results_aiservice.py @@ -179,6 +179,7 @@ def test_single_element_list(): testing_time=0.1, ) assert test_results[0].id.function_getting_tested == "sorter" + assert test_results[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" assert test_results[0].id.test_function_name == "test_single_element_list" assert test_results[0].did_pass assert test_results[0].return_value[1]["arr"] == [42] @@ -187,18 +188,23 @@ def test_single_element_list(): # Replace with optimized code that mutated instance attribute optimized_code_mutated_attr = """ +import sys + + class BubbleSorter: def __init__(self, x=1): self.x = x def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr """ fto_path.write_text(optimized_code_mutated_attr, "utf-8") @@ -217,6 +223,7 @@ def sorter(self, arr): test_results, test_results_mutated_attr ) # Without codeflash capture, the init state was not verified, and the results are verified as correct even with the attribute mutated + assert test_results_mutated_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" finally: fto_path.write_text(original_code, "utf-8") test_path.unlink(missing_ok=True) @@ -317,6 +324,7 @@ def test_single_element_list(): assert test_results[0].id.test_function_name == "test_single_element_list" assert test_results[0].did_pass assert test_results[0].return_value[0] == {"x": 0} + assert test_results[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" # Verify function_to_optimize result assert test_results[1].id.function_getting_tested == "sorter" @@ -331,18 +339,23 @@ def test_single_element_list(): assert test_results[1].return_value[2] == [1, 2, 3] # Replace with optimized code that mutated instance attribute optimized_code_mutated_attr = """ +import sys + + class BubbleSorter: def __init__(self, x=1): self.x = x def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr """ fto_path.write_text(optimized_code_mutated_attr, "utf-8") @@ -381,23 +394,29 @@ def sorter(self, arr): assert test_results_mutated_attr[0].id.function_getting_tested == "BubbleSorter.__init__" assert test_results_mutated_attr[0].return_value[0] == {"x": 1} assert test_results_mutated_attr[0].verification_type == VerificationType.INIT_STATE_FTO + assert test_results_mutated_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" assert not compare_test_results( test_results, test_results_mutated_attr ) # The test should fail because the instance attribute was mutated # Replace with optimized code that did not mutate existing instance attribute, but added a new one optimized_code_new_attr = """ +import sys + + class BubbleSorter: def __init__(self, x=0): self.x = x self.y = 2 def sorter(self, arr): + print("codeflash stdout : BubbleSorter.sorter() called") for i in range(len(arr)): for j in range(len(arr) - 1): if arr[j] > arr[j + 1]: temp = arr[j] arr[j] = arr[j + 1] arr[j + 1] = temp + print("stderr test", file=sys.stderr) return arr """ fto_path.write_text(optimized_code_new_attr, "utf-8") @@ -427,6 +446,7 @@ def sorter(self, arr): assert test_results_new_attr[0].id.function_getting_tested == "BubbleSorter.__init__" assert test_results_new_attr[0].return_value[0] == {"x": 0, "y": 2} assert test_results_new_attr[0].verification_type == VerificationType.INIT_STATE_FTO + assert test_results_new_attr[0].stdout == "codeflash stdout : BubbleSorter.sorter() called" # assert test_results_new_attr[1].return_value[1]["self"].x == 0 TODO: add self as input # assert test_results_new_attr[1].return_value[1]["self"].y == 2 TODO: add self as input assert compare_test_results(