first pass: run an auditwall side-effect check on the source before behavioral tests

KRRT7 · KRRT7 · commit 3f524c2be6bf · 2025-02-19T18:59:14.000-05:00
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -3,8 +3,11 @@
 import ast
 import concurrent.futures
 import os
+import re
+import shlex
 import shutil
 import subprocess
+import tempfile
 import time
 import uuid
 from collections import defaultdict
@@ -13,6 +16,7 @@
 
 import isort
 import libcst as cst
+from crosshair.auditwall import SideEffectDetected
 from rich.console import Group
 from rich.panel import Panel
 from rich.syntax import Syntax
@@ -29,12 +33,14 @@
     get_run_tmp_file,
     module_name_from_file_path,
 )
+from codeflash.code_utils.compat import IS_POSIX, SAFE_SYS_EXECUTABLE
 from codeflash.code_utils.config_consts import (
     INDIVIDUAL_TESTCASE_TIMEOUT,
     N_CANDIDATES,
     N_TESTS_TO_GENERATE,
     TOTAL_LOOPING_TIME,
 )
+from codeflash.code_utils.coverage_utils import prepare_coverage_files
 from codeflash.code_utils.formatter import format_code, sort_imports
 from codeflash.code_utils.instrument_existing_tests import inject_profiling_into_existing_test
 from codeflash.code_utils.remove_generated_tests import remove_functions_from_generated_tests
@@ -63,12 +69,13 @@
 from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
 from codeflash.result.explanation import Explanation
 from codeflash.telemetry.posthog_cf import ph
+from codeflash.verification.codeflash_auditwall import transform_code
 from codeflash.verification.concolic_testing import generate_concolic_tests
 from codeflash.verification.equivalence import compare_test_results
 from codeflash.verification.instrument_codeflash_capture import instrument_codeflash_capture
 from codeflash.verification.parse_test_output import parse_test_results
 from codeflash.verification.test_results import TestResults, TestType
-from codeflash.verification.test_runner import run_behavioral_tests, run_benchmarking_tests
+from codeflash.verification.test_runner import execute_test_subprocess, run_behavioral_tests, run_benchmarking_tests
 from codeflash.verification.verification_utils import get_test_file_path
 from codeflash.verification.verifier import generate_tests
 
@@ -149,12 +156,14 @@ def optimize_function(self) -> Result[BestOptimization, str]:
                 self.args.project_root,
             )
 
+
         generated_test_paths = [
             get_test_file_path(
                 self.test_cfg.tests_root, self.function_to_optimize.function_name, test_index, test_type="unit"
             )
             for test_index in range(N_TESTS_TO_GENERATE)
         ]
+
         generated_perf_test_paths = [
             get_test_file_path(
                 self.test_cfg.tests_root, self.function_to_optimize.function_name, test_index, test_type="perf"
@@ -844,6 +853,8 @@ def establish_original_code_baseline(
                     enable_coverage=test_framework == "pytest",
                     code_context=code_context,
                 )
+            except SideEffectDetected as e:
+                return Failure(f"Side effect detected in original code: {e}")
             finally:
                 # Remove codeflash capture
                 self.write_code_and_helpers(
@@ -855,9 +866,7 @@ def establish_original_code_baseline(
                 )
                 console.rule()
                 return Failure("Failed to establish a baseline for the original code - bevhavioral tests failed.")
-            if not coverage_critic(
-            coverage_results, self.args.test_framework
-                ):
+            if not coverage_critic(coverage_results, self.args.test_framework):
                 return Failure("The threshold for test coverage was not met.")
             if test_framework == "pytest":
                 benchmarking_results, _ = self.run_and_parse_tests(
@@ -898,7 +907,6 @@ def establish_original_code_baseline(
             )
             console.rule()
 
-
             total_timing = benchmarking_results.total_passed_runtime()  # caution: doesn't handle the loop index
             functions_to_remove = [
                 result.id.test_function_name
@@ -1097,13 +1105,13 @@ def run_and_parse_tests(
                 raise ValueError(f"Unexpected testing type: {testing_type}")
         except subprocess.TimeoutExpired:
             logger.exception(
-                f'Error running tests in {", ".join(str(f) for f in test_files.test_files)}.\nTimeout Error'
+                f"Error running tests in {', '.join(str(f) for f in test_files.test_files)}.\nTimeout Error"
             )
             return TestResults(), None
         if run_result.returncode != 0 and testing_type == TestingMode.BEHAVIOR:
             logger.debug(
-                f'Nonzero return code {run_result.returncode} when running tests in '
-                f'{", ".join([str(f.instrumented_behavior_file_path) for f in test_files.test_files])}.\n'
+                f"Nonzero return code {run_result.returncode} when running tests in "
+                f"{', '.join([str(f.instrumented_behavior_file_path) for f in test_files.test_files])}.\n"
                 f"stdout: {run_result.stdout}\n"
                 f"stderr: {run_result.stderr}\n"
             )
@@ -1149,4 +1157,3 @@ def generate_and_instrument_tests(
                 zip(generated_test_paths, generated_perf_test_paths)
             )
         ]
-
diff --git a/codeflash/verification/test_runner.py b/codeflash/verification/test_runner.py
@@ -1,16 +1,21 @@
 from __future__ import annotations
 
+import re
 import shlex
 import subprocess
+import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+from crosshair.auditwall import SideEffectDetected
+
 from codeflash.cli_cmds.console import logger
 from codeflash.code_utils.code_utils import get_run_tmp_file
 from codeflash.code_utils.compat import IS_POSIX, SAFE_SYS_EXECUTABLE
 from codeflash.code_utils.config_consts import TOTAL_LOOPING_TIME
 from codeflash.code_utils.coverage_utils import prepare_coverage_files
 from codeflash.models.models import TestFiles
+from codeflash.verification.codeflash_auditwall import transform_code
 from codeflash.verification.test_results import TestType
 
 if TYPE_CHECKING:
@@ -36,78 +41,97 @@ def run_behavioral_tests(
     pytest_target_runtime_seconds: int = TOTAL_LOOPING_TIME,
     enable_coverage: bool = False,
 ) -> tuple[Path, subprocess.CompletedProcess, Path | None]:
-    if test_framework == "pytest":
-        test_files: list[str] = []
-        for file in test_paths.test_files:
-            if file.test_type == TestType.REPLAY_TEST:
-                # TODO: Does this work for unittest framework?
-                test_files.extend(
-                    [
-                        str(file.instrumented_behavior_file_path) + "::" + test.test_function
-                        for test in file.tests_in_file
-                    ]
-                )
-            else:
-                test_files.append(str(file.instrumented_behavior_file_path))
-        test_files = list(set(test_files))  # remove multiple calls in the same test function
-        pytest_cmd_list = shlex.split(pytest_cmd, posix=IS_POSIX)
-
-        common_pytest_args = [
-            "--capture=tee-sys",
-            f"--timeout={pytest_timeout}",
-            "-q",
-            "--codeflash_loops_scope=session",
-            "--codeflash_min_loops=1",
-            "--codeflash_max_loops=1",
-            f"--codeflash_seconds={pytest_target_runtime_seconds}",  # TODO :This is unnecessary, update the plugin to not ask for this
-        ]
-
-        result_file_path = get_run_tmp_file(Path("pytest_results.xml"))
-        result_args = [f"--junitxml={result_file_path.as_posix()}", "-o", "junit_logging=all"]
-
-        pytest_test_env = test_env.copy()
-        pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"
+    if test_framework not in ["pytest", "unittest"]:
+        raise ValueError(f"Unsupported test framework: {test_framework}")
 
-        if enable_coverage:
-            coverage_database_file, coveragercfile = prepare_coverage_files()
-
-            cov_erase = execute_test_subprocess(
-                shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage erase"), cwd=cwd, env=pytest_test_env
-            )  # this cleanup is necessary to avoid coverage data from previous runs, if there are any,
-            # then the current run will be appended to the previous data, which skews the results
-            logger.debug(cov_erase)
-
-            results = execute_test_subprocess(
-                shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage run --rcfile={coveragercfile.as_posix()} -m")
-                + pytest_cmd_list
-                + common_pytest_args
-                + result_args
-                + test_files,
-                cwd=cwd,
-                env=pytest_test_env,
-                timeout=600,
+    test_files: list[str] = []
+    for file in test_paths.test_files:
+        if file.test_type == TestType.REPLAY_TEST:
+            # TODO: Does this work for unittest framework?
+            test_files.extend(
+                [str(file.instrumented_behavior_file_path) + "::" + test.test_function for test in file.tests_in_file]
             )
-            logger.debug(
-                f"""Result return code: {results.returncode}, {"Result stderr:" + str(results.stderr) if results.stderr else ''}""")
         else:
-            results = execute_test_subprocess(
-                pytest_cmd_list + common_pytest_args + result_args + test_files,
-                cwd=cwd,
-                env=pytest_test_env,
-                timeout=600,  # TODO: Make this dynamic
+            test_files.append(str(file.instrumented_behavior_file_path))
+
+    source_code = next((file.original_source for file in test_paths.test_files if file.original_source), None)
+    if not source_code:
+        raise ValueError("No source code found for auditing")
+
+    audit_code = transform_code(source_code)
+    pytest_cmd_list = shlex.split(pytest_cmd, posix=IS_POSIX)
+    common_pytest_args = [
+        "--capture=tee-sys",
+        f"--timeout={pytest_timeout}",
+        "-q",
+        "--codeflash_loops_scope=session",
+        "--codeflash_min_loops=1",
+        "--codeflash_max_loops=1",
+        f"--codeflash_seconds={pytest_target_runtime_seconds}",
+        "-p",
+        "no:cacheprovider",
+    ]
+
+    result_file_path = get_run_tmp_file(Path("pytest_results.xml"))
+    result_args = [f"--junitxml={result_file_path.as_posix()}", "-o", "junit_logging=all"]
+
+    pytest_test_env = test_env.copy()
+    pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"
+
+    with tempfile.TemporaryDirectory(
+        dir=Path(test_paths.test_files[0].instrumented_behavior_file_path).parent
+    ) as temp_dir:
+        audited_file_path = Path(temp_dir) / "audited_code.py"
+        audited_file_path.write_text(audit_code, encoding="utf8")
+
+        auditing_res = execute_test_subprocess(
+            pytest_cmd_list + common_pytest_args + [audited_file_path.as_posix()],
+            cwd=cwd,
+            env=pytest_test_env,
+            timeout=600,
+        )
+
+        if auditing_res.returncode != 0:
+            line_co = next(
+                (
+                    line
+                    for line in auditing_res.stderr.splitlines() + auditing_res.stdout.splitlines()
+                    if "crosshair.auditwall.SideEffectDetected" in line
+                ),
+                None,
             )
-            logger.debug(
-                f"""Result return code: {results.returncode}, {"Result stderr:" + str(results.stderr) if results.stderr else ''}""")
-    elif test_framework == "unittest":
+
+            if line_co:
+                match = re.search(r"crosshair\.auditwall\.SideEffectDetected: A(.*) operation was detected\.", line_co)
+                if match:
+                    msg = match.group(1)
+                    raise SideEffectDetected(msg)
+                logger.debug(auditing_res.stderr)
+            logger.debug(auditing_res.stdout)
+
+    if test_framework == "pytest":
+        coverage_database_file, coveragercfile = prepare_coverage_files()
+        cov_erase = execute_test_subprocess(
+            shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage erase"), cwd=cwd, env=pytest_test_env
+        )
+        logger.debug(cov_erase)
+
+        results = execute_test_subprocess(
+            shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage run --rcfile={coveragercfile.as_posix()} -m")
+            + pytest_cmd_list
+            + common_pytest_args
+            + result_args
+            + list(set(test_files)),  # remove duplicates
+            cwd=cwd,
+            env=pytest_test_env,
+            timeout=600,
+        )
+    else:  # unittest
         if enable_coverage:
             raise ValueError("Coverage is not supported yet for unittest framework")
         test_env["CODEFLASH_LOOP_INDEX"] = "1"
         test_files = [file.instrumented_behavior_file_path for file in test_paths.test_files]
         result_file_path, results = run_unittest_tests(verbose, test_files, test_env, cwd)
-        logger.debug(
-            f"""Result return code: {results.returncode}, {"Result stderr:" + str(results.stderr) if results.stderr else ''}""")
-    else:
-        raise ValueError(f"Unsupported test framework: {test_framework}")
 
     return result_file_path, results, coverage_database_file if enable_coverage else None