Merge pull request #586 from codeflash-ai/benchmark-fixture-fix

KRRT7 · web-flow · commit b2c2743695bb · 2025-07-28T18:05:31.000-07:00
Benchmark Fixture fixes
diff --git a/codeflash-benchmark/codeflash_benchmark/__init__.py b/codeflash-benchmark/codeflash_benchmark/__init__.py
@@ -0,0 +1,3 @@
+"""CodeFlash Benchmark - Pytest benchmarking plugin for codeflash.ai."""
+
+__version__ = "0.1.0"
diff --git a/codeflash-benchmark/codeflash_benchmark/plugin.py b/codeflash-benchmark/codeflash_benchmark/plugin.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import importlib.util
+
+import pytest
+
+from codeflash.benchmarking.plugin.plugin import codeflash_benchmark_plugin
+
+PYTEST_BENCHMARK_INSTALLED = importlib.util.find_spec("pytest_benchmark") is not None
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register the benchmark marker and disable conflicting plugins."""
+    config.addinivalue_line("markers", "benchmark: mark test as a benchmark that should be run with codeflash tracing")
+
+    if config.getoption("--codeflash-trace") and PYTEST_BENCHMARK_INSTALLED:
+        config.option.benchmark_disable = True
+        config.pluginmanager.set_blocked("pytest_benchmark")
+        config.pluginmanager.set_blocked("pytest-benchmark")
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    parser.addoption(
+        "--codeflash-trace", action="store_true", default=False, help="Enable CodeFlash tracing for benchmarks"
+    )
+
+
+@pytest.fixture
+def benchmark(request: pytest.FixtureRequest) -> object:
+    """Benchmark fixture that works with or without pytest-benchmark installed."""
+    config = request.config
+
+    # If --codeflash-trace is enabled, use our implementation
+    if config.getoption("--codeflash-trace"):
+        return codeflash_benchmark_plugin.Benchmark(request)
+
+    # If pytest-benchmark is installed and --codeflash-trace is not enabled,
+    # return the normal pytest-benchmark fixture
+    if PYTEST_BENCHMARK_INSTALLED:
+        from pytest_benchmark.fixture import BenchmarkFixture as BSF  # noqa: N814
+
+        bs = getattr(config, "_benchmarksession", None)
+        if bs and bs.skip:
+            pytest.skip("Benchmarks are skipped (--benchmark-skip was used).")
+
+        node = request.node
+        marker = node.get_closest_marker("benchmark")
+        options = dict(marker.kwargs) if marker else {}
+
+        if bs:
+            return BSF(
+                node,
+                add_stats=bs.benchmarks.append,
+                logger=bs.logger,
+                warner=request.node.warn,
+                disabled=bs.disabled,
+                **dict(bs.options, **options),
+            )
+        return lambda func, *args, **kwargs: func(*args, **kwargs)
+
+    return lambda func, *args, **kwargs: func(*args, **kwargs)
diff --git a/codeflash-benchmark/pyproject.toml b/codeflash-benchmark/pyproject.toml
@@ -0,0 +1,32 @@
+[project]
+name = "codeflash-benchmark"
+version = "0.1.0"
+description = "Pytest benchmarking plugin for codeflash.ai - automatic code performance optimization"
+authors = [{ name = "CodeFlash Inc.", email = "contact@codeflash.ai" }]
+requires-python = ">=3.9"
+readme = "README.md"
+license = {text = "BSL-1.1"}
+keywords = [
+    "codeflash",
+    "benchmark",
+    "pytest",
+    "performance",
+    "testing",
+]
+dependencies = [
+    "pytest>=7.0.0,!=8.3.4",
+]
+
+[project.urls]
+Homepage = "https://codeflash.ai"
+Repository = "https://github.com/codeflash-ai/codeflash-benchmark"
+
+[project.entry-points.pytest11]
+codeflash-benchmark = "codeflash_benchmark.plugin"
+
+[build-system]
+requires = ["setuptools>=45", "wheel", "setuptools_scm"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+packages = ["codeflash_benchmark"]
diff --git a/codeflash/benchmarking/plugin/plugin.py b/codeflash/benchmarking/plugin/plugin.py
@@ -16,7 +16,7 @@
 if TYPE_CHECKING:
     from codeflash.models.models import BenchmarkKey
 
-IS_PYTEST_BENCHMARK_INSTALLED = importlib.util.find_spec("pytest_benchmark") is not None
+PYTEST_BENCHMARK_INSTALLED = importlib.util.find_spec("pytest_benchmark") is not None
 
 
 class CodeFlashBenchmarkPlugin:
@@ -251,8 +251,12 @@ def wrapped_func(*args, **kwargs):  # noqa: ANN002, ANN003, ANN202
 
         def _run_benchmark(self, func, *args, **kwargs):  # noqa: ANN001, ANN002, ANN003, ANN202
             """Actual benchmark implementation."""
+            node_path = getattr(self.request.node, "path", None) or getattr(self.request.node, "fspath", None)
+            if node_path is None:
+                raise RuntimeError("Unable to determine test file path from pytest node")
+
             benchmark_module_path = module_name_from_file_path(
-                Path(str(self.request.node.fspath)), Path(codeflash_benchmark_plugin.project_root), traverse_up=True
+                Path(str(node_path)), Path(codeflash_benchmark_plugin.project_root), traverse_up=True
             )
 
             benchmark_function_name = self.request.node.name
@@ -282,26 +286,3 @@ def _run_benchmark(self, func, *args, **kwargs):  # noqa: ANN001, ANN002, ANN003
 
 
 codeflash_benchmark_plugin = CodeFlashBenchmarkPlugin()
-
-
-def pytest_configure(config: pytest.Config) -> None:
-    """Register the benchmark marker and disable conflicting plugins."""
-    config.addinivalue_line("markers", "benchmark: mark test as a benchmark that should be run with codeflash tracing")
-
-    if config.getoption("--codeflash-trace") and IS_PYTEST_BENCHMARK_INSTALLED:
-        config.option.benchmark_disable = True
-        config.pluginmanager.set_blocked("pytest_benchmark")
-        config.pluginmanager.set_blocked("pytest-benchmark")
-
-
-def pytest_addoption(parser: pytest.Parser) -> None:
-    parser.addoption(
-        "--codeflash-trace", action="store_true", default=False, help="Enable CodeFlash tracing for benchmarks"
-    )
-
-
-@pytest.fixture
-def benchmark(request: pytest.FixtureRequest) -> object:
-    if not request.config.getoption("--codeflash-trace"):
-        return lambda func, *args, **kwargs: func(*args, **kwargs)
-    return codeflash_benchmark_plugin.Benchmark(request)
diff --git a/codeflash/benchmarking/pytest_new_process_trace_benchmarks.py b/codeflash/benchmarking/pytest_new_process_trace_benchmarks.py
@@ -7,11 +7,14 @@
 benchmarks_root = sys.argv[1]
 tests_root = sys.argv[2]
 trace_file = sys.argv[3]
-# current working directory
 project_root = Path.cwd()
+
 if __name__ == "__main__":
     import pytest
 
+    orig_recursion_limit = sys.getrecursionlimit()
+    sys.setrecursionlimit(orig_recursion_limit * 2)
+
     try:
         codeflash_benchmark_plugin.setup(trace_file, project_root)
         codeflash_trace.setup(trace_file)
@@ -32,9 +35,12 @@
                 "addopts=",
             ],
             plugins=[codeflash_benchmark_plugin],
-        )  # Errors will be printed to stdout, not stderr
-
+        )
     except Exception as e:
         print(f"Failed to collect tests: {e!s}", file=sys.stderr)
         exitcode = -1
+    finally:
+        # Restore the original recursion limit
+        sys.setrecursionlimit(orig_recursion_limit)
+
     sys.exit(exitcode)
diff --git a/codeflash/benchmarking/replay_test.py b/codeflash/benchmarking/replay_test.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 import sqlite3
 import textwrap
 from pathlib import Path
@@ -14,6 +15,8 @@
 if TYPE_CHECKING:
     from collections.abc import Generator
 
+benchmark_context_cleaner = re.compile(r"[^a-zA-Z0-9_]+")
+
 
 def get_next_arg_and_return(
     trace_file: str,
@@ -46,6 +49,16 @@ def get_function_alias(module: str, function_name: str) -> str:
     return "_".join(module.split(".")) + "_" + function_name
 
 
+def get_unique_test_name(module: str, function_name: str, benchmark_name: str, class_name: str | None = None) -> str:
+    clean_benchmark = benchmark_context_cleaner.sub("_", benchmark_name).strip("_")
+
+    base_alias = get_function_alias(module, function_name)
+    if class_name:
+        class_alias = get_function_alias(module, class_name)
+        return f"{class_alias}_{function_name}_{clean_benchmark}"
+    return f"{base_alias}_{clean_benchmark}"
+
+
 def create_trace_replay_test_code(
     trace_file: str,
     functions_data: list[dict[str, Any]],
@@ -209,7 +222,8 @@ def create_trace_replay_test_code(
         formatted_test_body = textwrap.indent(test_body, "        " if test_framework == "unittest" else "    ")
 
         test_template += "    " if test_framework == "unittest" else ""
-        test_template += f"def test_{alias}({self}):\n{formatted_test_body}\n"
+        unique_test_name = get_unique_test_name(module_name, function_name, benchmark_function_name, class_name)
+        test_template += f"def test_{unique_test_name}({self}):\n{formatted_test_body}\n"
 
     return imports + "\n" + metadata + "\n" + test_template
 
diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py
@@ -150,11 +150,13 @@ def __init__(self, function_names_to_find: set[str]) -> None:
         self.imported_modules: set[str] = set()
         self.has_dynamic_imports: bool = False
         self.wildcard_modules: set[str] = set()
+        # Track aliases: alias_name -> original_name
+        self.alias_mapping: dict[str, str] = {}
 
         # Precompute function_names for prefix search
         # For prefix match, store mapping from prefix-root to candidates for O(1) matching
         self._exact_names = function_names_to_find
-        self._prefix_roots = {}
+        self._prefix_roots: dict[str, list[str]] = {}
         for name in function_names_to_find:
             if "." in name:
                 root = name.split(".", 1)[0]
@@ -206,6 +208,9 @@ def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
             imported_name = alias.asname if alias.asname else aname
             self.imported_modules.add(imported_name)
 
+            if alias.asname:
+                self.alias_mapping[imported_name] = aname
+
             # Fast check for dynamic import
             if mod == "importlib" and aname == "import_module":
                 self.has_dynamic_imports = True
@@ -222,7 +227,6 @@ def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
                 self.found_qualified_name = qname
                 return
 
-            # Fast prefix match: only for relevant roots
             prefix = qname + "."
             # Only bother if one of the targets startswith the prefix-root
             candidates = proots.get(qname, ())
@@ -247,6 +251,18 @@ def visit_Attribute(self, node: ast.Attribute) -> None:
             self.found_qualified_name = node.attr
             return
 
+        if isinstance(node.value, ast.Name) and node.value.id in self.imported_modules:
+            for target_func in self.function_names_to_find:
+                if "." in target_func:
+                    class_name, method_name = target_func.rsplit(".", 1)
+                    if node.attr == method_name:
+                        imported_name = node.value.id
+                        original_name = self.alias_mapping.get(imported_name, imported_name)
+                        if original_name == class_name:
+                            self.found_any_target_function = True
+                            self.found_qualified_name = target_func
+                            return
+
         # Check if this is accessing a target function through a dynamically imported module
         # Only if we've detected dynamic imports are being used
         if self.has_dynamic_imports and node.attr in self.function_names_to_find:
diff --git a/codeflash/discovery/functions_to_optimize.py b/codeflash/discovery/functions_to_optimize.py
@@ -204,6 +204,7 @@ def get_functions_to_optimize(
                 functions[file] = [found_function]
         else:
             logger.info("Finding all functions modified in the current git diff ...")
+            console.rule()
             ph("cli-optimizing-git-diff")
             functions = get_functions_within_git_diff()
         filtered_modified_functions, functions_count = filter_functions(
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -515,6 +515,7 @@ def group_by_benchmarks(
                 benchmark_replay_test_dir.resolve()
                 / f"test_{benchmark_key.module_path.replace('.', '_')}__replay_test_",
                 project_root,
+                traverse_up=True,
             )
         for test_result in self.test_results:
             if test_result.test_type == TestType.REPLAY_TEST:
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -840,6 +840,7 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio
                 f"{concolic_coverage_test_files_count} concolic coverage test file"
                 f"{'s' if concolic_coverage_test_files_count != 1 else ''} for {func_qualname}"
             )
+            console.rule()
         return unique_instrumented_test_files
 
     def generate_tests_and_optimizations(
diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py
@@ -106,7 +106,7 @@ def run_benchmarks(
                 for file in file_path_to_source_code:
                     with file.open("w", encoding="utf8") as f:
                         f.write(file_path_to_source_code[file])
-
+        console.rule()
         return function_benchmark_timings, total_benchmark_timings
 
     def get_optimizable_functions(self) -> tuple[dict[Path, list[FunctionToOptimize]], int, Path | None]:
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,7 @@ dependencies = [
     "line_profiler>=4.2.0",
     "platformdirs>=4.3.7",
     "pygls>=1.3.1",
+    "codeflash-benchmark",
 ]
 
 [project.urls]
@@ -262,6 +263,12 @@ skip-magic-trailing-comma = true
 [tool.hatch.version]
 source = "uv-dynamic-versioning"
 
+[tool.uv]
+workspace = { members = ["codeflash-benchmark"] }
+
+[tool.uv.sources]
+codeflash-benchmark = { workspace = true }
+
 [tool.uv-dynamic-versioning]
 enable = true
 style = "pep440"
@@ -300,5 +307,3 @@ markers = [
 requires = ["hatchling", "uv-dynamic-versioning"]
 build-backend = "hatchling.build"
 
-[project.entry-points.pytest11]
-codeflash = "codeflash.benchmarking.plugin.plugin"
diff --git a/tests/test_pickle_patcher.py b/tests/test_pickle_patcher.py
@@ -364,7 +364,7 @@ def test_run_and_parse_picklepatch() -> None:
         test_env["CODEFLASH_TEST_ITERATION"] = "0"
         test_env["CODEFLASH_LOOP_INDEX"] = "1"
         test_type = TestType.REPLAY_TEST
-        replay_test_function = "test_code_to_optimize_bubble_sort_picklepatch_test_unused_socket_bubble_sort_with_unused_socket"
+        replay_test_function = "test_code_to_optimize_bubble_sort_picklepatch_test_unused_socket_bubble_sort_with_unused_socket_test_socket_picklepatch"
         func_optimizer = opt.create_function_optimizer(func)
         func_optimizer.test_files = TestFiles(
             test_files=[
@@ -388,7 +388,7 @@ def test_run_and_parse_picklepatch() -> None:
         )
         assert len(test_results_unused_socket) == 1
         assert test_results_unused_socket.test_results[0].id.test_module_path == "code_to_optimize.tests.pytest.benchmarks_socket_test.codeflash_replay_tests.test_code_to_optimize_tests_pytest_benchmarks_socket_test_test_socket__replay_test_0"
-        assert test_results_unused_socket.test_results[0].id.test_function_name == "test_code_to_optimize_bubble_sort_picklepatch_test_unused_socket_bubble_sort_with_unused_socket"
+        assert test_results_unused_socket.test_results[0].id.test_function_name == "test_code_to_optimize_bubble_sort_picklepatch_test_unused_socket_bubble_sort_with_unused_socket_test_socket_picklepatch"
         assert test_results_unused_socket.test_results[0].did_pass == True
 
         # Replace with optimized candidate
@@ -439,7 +439,7 @@ def bubble_sort_with_unused_socket(data_container):
         test_type = TestType.REPLAY_TEST
         func = FunctionToOptimize(function_name="bubble_sort_with_used_socket", parents=[],
                                   file_path=Path(fto_used_socket_path))
-        replay_test_function = "test_code_to_optimize_bubble_sort_picklepatch_test_used_socket_bubble_sort_with_used_socket"
+        replay_test_function = "test_code_to_optimize_bubble_sort_picklepatch_test_used_socket_bubble_sort_with_used_socket_test_used_socket_picklepatch"
         func_optimizer = opt.create_function_optimizer(func)
         func_optimizer.test_files = TestFiles(
             test_files=[
@@ -467,7 +467,7 @@ def bubble_sort_with_unused_socket(data_container):
         assert test_results_used_socket.test_results[
                    0].id.test_module_path == "code_to_optimize.tests.pytest.benchmarks_socket_test.codeflash_replay_tests.test_code_to_optimize_tests_pytest_benchmarks_socket_test_test_socket__replay_test_0"
         assert test_results_used_socket.test_results[
-                   0].id.test_function_name == "test_code_to_optimize_bubble_sort_picklepatch_test_used_socket_bubble_sort_with_used_socket"
+                   0].id.test_function_name == "test_code_to_optimize_bubble_sort_picklepatch_test_used_socket_bubble_sort_with_used_socket_test_used_socket_picklepatch"
         assert test_results_used_socket.test_results[0].did_pass is False
         print("test results used socket")
         print(test_results_used_socket)
@@ -498,7 +498,7 @@ def bubble_sort_with_used_socket(data_container):
         assert test_results_used_socket.test_results[
                    0].id.test_module_path == "code_to_optimize.tests.pytest.benchmarks_socket_test.codeflash_replay_tests.test_code_to_optimize_tests_pytest_benchmarks_socket_test_test_socket__replay_test_0"
         assert test_results_used_socket.test_results[
-                   0].id.test_function_name == "test_code_to_optimize_bubble_sort_picklepatch_test_used_socket_bubble_sort_with_used_socket"
+                   0].id.test_function_name == "test_code_to_optimize_bubble_sort_picklepatch_test_used_socket_bubble_sort_with_used_socket_test_used_socket_picklepatch"
         assert test_results_used_socket.test_results[0].did_pass is False
 
         # Even though tests threw the same error, we reject this as the behavior of the unpickleable object could not be determined.
diff --git a/tests/test_trace_benchmarks.py b/tests/test_trace_benchmarks.py
diff --git a/tests/test_unit_test_discovery.py b/tests/test_unit_test_discovery.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+"""CodeFlash Benchmark - Pytest benchmarking plugin for codeflash.ai."""`
	`2`	`+`
	`3`	`+__version__ = "0.1.0"`
Original file line number	Diff line number	Diff line change
`@@ -515,6 +515,7 @@ def group_by_benchmarks(`
`515`	`515`	`benchmark_replay_test_dir.resolve()`
`516`	`516`	`/ f"test_{benchmark_key.module_path.replace('.', '_')}__replay_test_",`
`517`	`517`	`project_root,`
	`518`	`+ traverse_up=True,`
`518`	`519`	`)`
`519`	`520`	`for test_result in self.test_results:`
`520`	`521`	`if test_result.test_type == TestType.REPLAY_TEST:`
Original file line number	Diff line number	Diff line change
`@@ -840,6 +840,7 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio`
`840`	`840`	`f"{concolic_coverage_test_files_count} concolic coverage test file"`
`841`	`841`	`f"{'s' if concolic_coverage_test_files_count != 1 else ''} for {func_qualname}"`
`842`	`842`	`)`
	`843`	`+ console.rule()`
`843`	`844`	`return unique_instrumented_test_files`
`844`	`845`
`845`	`846`	`def generate_tests_and_optimizations(`