stdout comparison in E2E

KRRT7 · KRRT7 · commit f40c388ef649 · 2025-03-03T16:59:17.000-08:00
diff --git a/code_to_optimize/bubble_sort.py b/code_to_optimize/bubble_sort.py
@@ -1,8 +1,10 @@
 def sorter(arr):
+    print("codeflash stdout: Sorting list")  # the E2E test expects this print call in the candidate output
     for i in range(len(arr)):
         for j in range(len(arr) - 1):
             if arr[j] > arr[j + 1]:
                 temp = arr[j]
                 arr[j] = arr[j + 1]
                 arr[j + 1] = temp
+    print(f"result: {arr}")  # also expected by the E2E test; arr has been sorted in place by this point
     return arr
diff --git a/tests/scripts/end_to_end_test_bubblesort_pytest.py b/tests/scripts/end_to_end_test_bubblesort_pytest.py
@@ -11,11 +11,15 @@ def run_test(expected_improvement_pct: int) -> bool:
         test_framework="pytest",
         min_improvement_x=1.0,
         coverage_expectations=[
-            CoverageExpectation(function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8])
+            CoverageExpectation(
+                function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8, 9, 10]
+            )
         ],
     )
     cwd = (pathlib.Path(__file__).parent.parent.parent / "code_to_optimize").resolve()
-    return run_codeflash_command(cwd, config, expected_improvement_pct)
+    return run_codeflash_command(
+        cwd, config, expected_improvement_pct, ['print("codeflash stdout: Sorting list")', 'print(f"result: {arr}")']
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/scripts/end_to_end_test_utilities.py b/tests/scripts/end_to_end_test_utilities.py
@@ -63,19 +63,21 @@ def validate_coverage(stdout: str, expectations: list[CoverageExpectation]) -> b
         assert coverage_match, f"Failed to find coverage data for {expect.function_name}"
 
         coverage = float(coverage_match.group(1))
-        assert (
-            coverage == expect.expected_coverage
-        ), f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}"
+        assert coverage == expect.expected_coverage, (
+            f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}"
+        )
 
         executed_lines = list(map(int, coverage_match.group(2).split(", ")))
-        assert (
-            executed_lines == expect.expected_lines
-        ), f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}"
+        assert executed_lines == expect.expected_lines, (
+            f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}"
+        )
 
     return True
 
 
-def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool:
+def run_codeflash_command(
+    cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int, expected_in_stdout: list[str] = None
+) -> bool:
     logging.basicConfig(level=logging.INFO)
     if config.trace_mode:
         return run_trace_test(cwd, config, expected_improvement_pct)
@@ -97,12 +99,21 @@ def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improv
     return_code = process.wait()
     stdout = "".join(output)
 
-    if not validate_output(stdout, return_code, expected_improvement_pct, config):
+    validated = validate_output(stdout, return_code, expected_improvement_pct, config)
+    if not validated:
         # Write original file contents back to file
         path_to_file.write_text(file_contents, "utf-8")
         logging.info("Codeflash run did not meet expected requirements for testing, reverting file changes.")
         return False
-    return True
+
+    if expected_in_stdout:
+        validated = validate_stdout_in_candidate(stdout, expected_in_stdout)  # was True on entry to this branch
+        if validated:  # report success only when every expected string was found
+            logging.info("Success: Expected output found in candidate output")
+        else:
+            logging.error("Failed to find expected output in candidate output")
+
+    return validated
 
 
 def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path) -> list[str]:
@@ -164,6 +175,11 @@ def validate_output(stdout: str, return_code: int, expected_improvement_pct: int
     return True
 
 
+def validate_stdout_in_candidate(stdout: str, expected_in_stdout: list[str]) -> bool:
+    candidate_output = stdout.partition("INFO     Best candidate")[2].partition("Best Candidate Explanation")[0]
+    return all(expected in candidate_output for expected in expected_in_stdout)  # "" section if start marker absent
+
+
 def run_trace_test(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool:
     # First command: Run the tracer
     test_root = cwd / "tests" / (config.test_framework or "")