
Commit 57b80ec

started implementing group by benchmark
1 parent f34f22f commit 57b80ec

7 files changed: +56 −36 lines

codeflash/benchmarking/plugin/plugin.py

Lines changed: 4 additions & 7 deletions

@@ -101,8 +101,7 @@ def get_function_benchmark_timings(trace_path: Path) -> dict[str, dict[Benchmark
         qualified_name = f"{module_name}.{function_name}"

         # Create the benchmark key (file::function::line)
-        benchmark_key = f"{benchmark_file}::{benchmark_func}::{benchmark_line}"
-        benchmark_key = BenchmarkKey(file_name=benchmark_file, function_name=benchmark_func, line_number=benchmark_line)
+        benchmark_key = BenchmarkKey(file_name=benchmark_file, function_name=benchmark_func)
         # Initialize the inner dictionary if needed
         if qualified_name not in result:
             result[qualified_name] = {}
@@ -152,8 +151,7 @@ def get_benchmark_timings(trace_path: Path) -> dict[BenchmarkKey, int]:
     # Process overhead information
     for row in cursor.fetchall():
         benchmark_file, benchmark_func, benchmark_line, total_overhead_ns = row
-        benchmark_key = f"{benchmark_file}::{benchmark_func}::{benchmark_line}"
-        benchmark_key = BenchmarkKey(file_name=benchmark_file, function_name=benchmark_func, line_number=benchmark_line)
+        benchmark_key = BenchmarkKey(file_name=benchmark_file, function_name=benchmark_func)
         overhead_by_benchmark[benchmark_key] = total_overhead_ns or 0  # Handle NULL sum case

     # Query the benchmark_timings table for total times
@@ -167,8 +165,7 @@ def get_benchmark_timings(trace_path: Path) -> dict[BenchmarkKey, int]:
         benchmark_file, benchmark_func, benchmark_line, time_ns = row

         # Create the benchmark key (file::function::line)
-        benchmark_key = f"{benchmark_file}::{benchmark_func}::{benchmark_line}"
-        benchmark_key = BenchmarkKey(file_name=benchmark_file, function_name=benchmark_func, line_number=benchmark_line)
+        benchmark_key = BenchmarkKey(file_name=benchmark_file, function_name=benchmark_func)
         # Subtract overhead from total time
         overhead = overhead_by_benchmark.get(benchmark_key, 0)
         result[benchmark_key] = time_ns - overhead
@@ -239,7 +236,7 @@ def test_something(benchmark):
         The return value of the function

         """
-        benchmark_file_name = self.request.node.fspath.basename
+        benchmark_file_name = self.request.node.fspath
         benchmark_function_name = self.request.node.name
         line_number = int(str(sys._getframe(1).f_lineno))  # 1 frame up in the call stack
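
The hunks above drop the line number from the benchmark key: timings are now keyed by a BenchmarkKey carrying only the file and function name instead of the old "file::function::line" string. A minimal sketch of the new keying scheme, assuming BenchmarkKey is a frozen (hashable) dataclass like the one in codeflash/models/models.py; the rows and names below are made up for illustration.

from dataclasses import dataclass

@dataclass(frozen=True)
class BenchmarkKey:  # simplified stand-in for codeflash.models.models.BenchmarkKey
    file_name: str
    function_name: str

    def __str__(self) -> str:
        return f"{self.file_name}::{self.function_name}"

# Hypothetical rows in the shape the sqlite cursor yields: (file, function, line, time_ns).
rows = [
    ("test_algos.py", "test_sort", 12, 1_500_000),
    ("test_algos.py", "test_sort", 12, 1_200_000),
]

timings: dict[BenchmarkKey, int] = {}
for benchmark_file, benchmark_func, _line, time_ns in rows:
    # The line number is still read from the row but no longer becomes part of the key.
    key = BenchmarkKey(file_name=benchmark_file, function_name=benchmark_func)
    timings[key] = timings.get(key, 0) + time_ns

print({str(k): v for k, v in timings.items()})  # {'test_algos.py::test_sort': 2700000}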

codeflash/benchmarking/replay_test.py

Lines changed: 2 additions & 3 deletions

@@ -196,12 +196,11 @@ def create_trace_replay_test_code(
     return imports + "\n" + metadata + "\n" + test_template

 def generate_replay_test(trace_file_path: Path, output_dir: Path, test_framework: str = "pytest", max_run_count: int = 100) -> int:
-    """Generate multiple replay tests from the traced function calls, grouping by benchmark name.
+    """Generate multiple replay tests from the traced function calls, grouped by benchmark.

     Args:
         trace_file_path: Path to the SQLite database file
         output_dir: Directory to write the generated tests (if None, only returns the code)
-        project_root: Root directory of the project for module imports
         test_framework: 'pytest' or 'unittest'
         max_run_count: Maximum number of runs to include per function

@@ -267,7 +266,7 @@ def generate_replay_test(trace_file_path: Path, output_dir: Path, test_framework
         # Write to file if requested
         if output_dir:
             output_file = get_test_file_path(
-                test_dir=Path(output_dir), function_name=f"{benchmark_file_name[5:]}_{benchmark_function_name}", test_type="replay"
+                test_dir=Path(output_dir), function_name=f"{benchmark_file_name}_{benchmark_function_name}", test_type="replay"
             )
             # Write test code to file, parents = true
             output_dir.mkdir(parents=True, exist_ok=True)
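
With the plugin now reporting the benchmark file name differently, the [5:] slice on benchmark_file_name is gone and the name is used as-is when building the replay test file. A small sketch of that naming step; get_test_file_path here is a simplified stand-in for the real helper, and the names and directory are hypothetical.

from pathlib import Path

def get_test_file_path(test_dir: Path, function_name: str, test_type: str) -> Path:
    # Hypothetical naming scheme, not the real helper's implementation.
    return test_dir / f"test_{function_name}__{test_type}.py"

benchmark_file_name = "test_algos"       # hypothetical benchmark module name
benchmark_function_name = "test_sort"    # hypothetical benchmark function
output_dir = Path("tests/replay")

# After this commit the benchmark file name is no longer sliced with [5:].
output_file = get_test_file_path(
    test_dir=output_dir,
    function_name=f"{benchmark_file_name}_{benchmark_function_name}",
    test_type="replay",
)
print(output_file)  # tests/replay/test_test_algos_test_sort__replay.py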

codeflash/benchmarking/utils.py

Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ def process_benchmark_data(

     for benchmark_key, og_benchmark_timing in fto_benchmark_timings.items():
         try:
-            benchmark_file_name, benchmark_test_function, line_number = benchmark_key.split("::")
+            benchmark_file_name, benchmark_test_function = benchmark_key.split("::")
         except ValueError:
             continue  # Skip malformed benchmark keys
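
process_benchmark_data now unpacks two fields from the key instead of three. A short sketch of that parsing, assuming the keys are strings of the "file::function" form produced by str(BenchmarkKey); the sample keys and timings are made up.

# Hypothetical timings keyed by "file::function" strings.
fto_benchmark_timings = {
    "test_algos.py::test_sort": 2_700_000,
    "malformed-key-without-separator": 100,
}

for benchmark_key, og_benchmark_timing in fto_benchmark_timings.items():
    try:
        benchmark_file_name, benchmark_test_function = benchmark_key.split("::")
    except ValueError:
        continue  # skip malformed benchmark keys, as the real loop does
    print(benchmark_file_name, benchmark_test_function, og_benchmark_timing)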

codeflash/models/models.py

Lines changed: 2 additions & 3 deletions

@@ -82,10 +82,9 @@ class BestOptimization(BaseModel):
 class BenchmarkKey:
     file_name: str
     function_name: str
-    line_number: int

     def __str__(self) -> str:
-        return f"{self.file_name}::{self.function_name}::{self.line_number}"
+        return f"{self.file_name}::{self.function_name}"

 @dataclass
 class BenchmarkDetail:
@@ -270,7 +269,7 @@ class FunctionParent:
 class OriginalCodeBaseline(BaseModel):
     behavioral_test_results: TestResults
     benchmarking_test_results: TestResults
-    replay_benchmarking_test_results: Optional[TestResults] = None
+    replay_benchmarking_test_results: Optional[dict[BenchmarkKey, TestResults]] = None
     runtime: int
     coverage_results: Optional[CoverageData]
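
The baseline model now stores replay results per benchmark: a dict keyed by BenchmarkKey rather than one flat TestResults. A sketch of that shape with simplified stand-ins for both classes (the real ones are the models in this file and in test_results.py).

from dataclasses import dataclass, field

@dataclass(frozen=True)
class BenchmarkKey:  # simplified stand-in
    file_name: str
    function_name: str

@dataclass
class TestResults:  # simplified stand-in exposing only total_passed_runtime
    runtimes_ns: list[int] = field(default_factory=list)

    def total_passed_runtime(self) -> int:
        return sum(self.runtimes_ns)

# Hypothetical per-benchmark replay results, the new shape of
# OriginalCodeBaseline.replay_benchmarking_test_results.
replay_benchmarking_test_results: dict[BenchmarkKey, TestResults] = {
    BenchmarkKey("test_algos.py", "test_sort"): TestResults([1_200_000, 1_500_000]),
    BenchmarkKey("test_io.py", "test_read"): TestResults([3_000_000]),
}

for key, results in replay_benchmarking_test_results.items():
    print(f"{key.file_name}::{key.function_name}", results.total_passed_runtime())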

codeflash/optimization/function_optimizer.py

Lines changed: 22 additions & 18 deletions

@@ -88,8 +88,8 @@ def __init__(
         function_to_tests: dict[str, list[FunctionCalledInTest]] | None = None,
         function_to_optimize_ast: ast.FunctionDef | None = None,
         aiservice_client: AiServiceClient | None = None,
-        function_benchmark_timings: dict[str, int] | None = None,
-        total_benchmark_timings: dict[str, int] | None = None,
+        function_benchmark_timings: dict[BenchmarkKey, int] | None = None,
+        total_benchmark_timings: dict[BenchmarkKey, int] | None = None,
         args: Namespace | None = None,
     ) -> None:
         self.project_root = test_cfg.project_root_path
@@ -428,20 +428,24 @@ def determine_best_candidate(
             tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
             tree.add(f"Speedup ratio: {perf_gain + 1:.1f}X")
             if self.args.benchmark:
-                original_code_replay_runtime = original_code_baseline.replay_benchmarking_test_results.total_passed_runtime()
-                candidate_replay_runtime = candidate_result.replay_benchmarking_test_results.total_passed_runtime()
-                replay_perf_gain = performance_gain(
-                    original_runtime_ns=original_code_replay_runtime,
-                    optimized_runtime_ns=candidate_replay_runtime,
-                )
-                tree.add(f"Original benchmark replay runtime: {humanize_runtime(original_code_replay_runtime)}")
-                tree.add(
-                    f"Best benchmark replay runtime: {humanize_runtime(candidate_replay_runtime)} "
-                    f"(measured over {candidate_result.max_loop_count} "
-                    f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
-                )
-                tree.add(f"Speedup percentage for benchmark replay test: {replay_perf_gain * 100:.1f}%")
-                tree.add(f"Speedup ratio for benchmark replay test: {replay_perf_gain + 1:.1f}X")
+
+                benchmark_keys = {(benchmark.file_name, benchmark.function_name) for benchmark in self.total_benchmark_timings}
+                test_results_by_benchmark = candidate_result.benchmarking_test_results.group_by_benchmark(benchmark_keys)
+                for benchmark_key, test_results in test_results_by_benchmark.items():
+                    original_code_replay_runtime = original_code_baseline.replay_benchmarking_test_results[benchmark_key].total_passed_runtime()
+                    candidate_replay_runtime = candidate_result.replay_benchmarking_test_results.total_passed_runtime()
+                    replay_perf_gain = performance_gain(
+                        original_runtime_ns=original_code_replay_runtime,
+                        optimized_runtime_ns=candidate_replay_runtime,
+                    )
+                    tree.add(f"Original benchmark replay runtime: {humanize_runtime(original_code_replay_runtime)}")
+                    tree.add(
+                        f"Best benchmark replay runtime: {humanize_runtime(candidate_replay_runtime)} "
+                        f"(measured over {candidate_result.max_loop_count} "
+                        f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
+                    )
+                    tree.add(f"Speedup percentage for benchmark replay test: {replay_perf_gain * 100:.1f}%")
+                    tree.add(f"Speedup ratio for benchmark replay test: {replay_perf_gain + 1:.1f}X")
             best_optimization = BestOptimization(
                 candidate=candidate,
                 helper_functions=code_context.helper_functions,
@@ -898,7 +902,7 @@ def establish_original_code_baseline(
         logger.debug(f"Total original code runtime (ns): {total_timing}")

         if self.args.benchmark:
-            replay_benchmarking_test_results = benchmarking_results.filter(TestType.REPLAY_TEST)
+            replay_benchmarking_test_results = benchmarking_results.filter_by_test_type(TestType.REPLAY_TEST)
             logger.info(f"Total replay test runtime: {humanize_runtime(replay_benchmarking_test_results.total_passed_runtime())}")
         return Success(
             (
@@ -1020,7 +1024,7 @@ def run_optimized_candidate(

         logger.debug(f"Total optimized code {optimization_candidate_index} runtime (ns): {total_candidate_timing}")
         if self.args.benchmark:
-            candidate_replay_benchmarking_results = candidate_benchmarking_results.filter(TestType.REPLAY_TEST)
+            candidate_replay_benchmarking_results = candidate_benchmarking_results.filter_by_test_type(TestType.REPLAY_TEST)
             logger.debug(
                 f"Total optimized code {optimization_candidate_index} replay benchmark runtime (ns): {candidate_replay_benchmarking_results.total_passed_runtime()}"
             )
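
The reporting block now loops over benchmarks instead of summing all replay tests together, comparing original and candidate replay runtimes per benchmark. A stripped-down sketch of that per-benchmark comparison; performance_gain here is a plausible stand-in (relative speedup), and the runtimes are invented.

def performance_gain(original_runtime_ns: int, optimized_runtime_ns: int) -> float:
    # Stand-in: fractional speedup of the optimized code over the original.
    return (original_runtime_ns - optimized_runtime_ns) / optimized_runtime_ns

# Hypothetical per-benchmark replay runtimes in nanoseconds, keyed by (file, function).
original_replay = {("test_algos.py", "test_sort"): 2_700_000, ("test_io.py", "test_read"): 3_000_000}
candidate_replay = {("test_algos.py", "test_sort"): 1_800_000, ("test_io.py", "test_read"): 2_500_000}

for benchmark_key, candidate_runtime in candidate_replay.items():
    original_runtime = original_replay[benchmark_key]
    replay_perf_gain = performance_gain(
        original_runtime_ns=original_runtime,
        optimized_runtime_ns=candidate_runtime,
    )
    file_name, function_name = benchmark_key
    print(f"{file_name}::{function_name}: speedup {replay_perf_gain * 100:.1f}% ({replay_perf_gain + 1:.1f}X)")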

codeflash/optimization/optimizer.py

Lines changed: 6 additions & 3 deletions

@@ -60,8 +60,8 @@ def create_function_optimizer(
         function_to_optimize_ast: ast.FunctionDef | None = None,
         function_to_tests: dict[str, list[FunctionCalledInTest]] | None = None,
         function_to_optimize_source_code: str | None = "",
-        function_benchmark_timings: dict[str, dict[str, float]] | None = None,
-        total_benchmark_timings: dict[str, float] | None = None,
+        function_benchmark_timings: dict[str, dict[BenchmarkKey, float]] | None = None,
+        total_benchmark_timings: dict[BenchmarkKey, float] | None = None,
     ) -> FunctionOptimizer:
         return FunctionOptimizer(
             function_to_optimize=function_to_optimize,
@@ -111,7 +111,10 @@ def run(self) -> None:
             try:
                 instrument_codeflash_trace_decorator(file_to_funcs_to_optimize)
                 trace_file = Path(self.args.benchmarks_root) / "benchmarks.trace"
-                replay_tests_dir = Path(self.args.tests_root) / "codeflash_replay_tests"
+                if trace_file.exists():
+                    trace_file.unlink()
+
+                replay_tests_dir = Path(self.args.tests_root)
                 trace_benchmarks_pytest(self.args.benchmarks_root, self.args.tests_root, self.args.project_root, trace_file)  # Run all tests that use pytest-benchmark
                 replay_count = generate_replay_test(trace_file, replay_tests_dir)
                 if replay_count == 0:
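
run() now deletes any leftover benchmarks.trace before tracing and writes replay tests directly into tests_root. A small sketch of that cleanup step, with hypothetical paths standing in for self.args.

from pathlib import Path

# Hypothetical stand-ins for self.args.benchmarks_root / self.args.tests_root.
benchmarks_root = Path("benchmarks")
tests_root = Path("tests")

trace_file = benchmarks_root / "benchmarks.trace"
if trace_file.exists():
    trace_file.unlink()  # drop any trace left by a previous run (presumably so stale timings are not mixed in)

replay_tests_dir = tests_root  # replay tests now go straight into tests_root
print(trace_file, replay_tests_dir)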

codeflash/verification/test_results.py

Lines changed: 19 additions & 1 deletion

@@ -125,7 +125,7 @@ def merge(self, other: TestResults) -> None:
                 raise ValueError(msg)
             self.test_result_idx[k] = v + original_len

-    def filter(self, test_type: TestType) -> TestResults:
+    def filter_by_test_type(self, test_type: TestType) -> TestResults:
         filtered_test_results = []
         filtered_test_results_idx = {}
         for test_result in self.test_results:
@@ -134,6 +134,24 @@ def filter(self, test_type: TestType) -> TestResults:
                 filtered_test_results.append(test_result)
         return TestResults(test_results=filtered_test_results, test_result_idx=filtered_test_results_idx)

+    def group_by_benchmark(self, benchmark_key_set: set[tuple[str, str]]) -> dict[tuple[str, str], TestResults]:
+        """Group replay test results by benchmark key.
+
+        For now, a (file_path, function_name) tuple is used as the benchmark key; BenchmarkKey can't be imported here because of a circular import.
+
+        Args:
+            benchmark_key_set (set[tuple[str, str]]): A set of (file_path, function_name) tuples
+
+        Returns:
+            dict[tuple[str, str], TestResults]: The replay test results grouped by benchmark key.
+
+        """
+        test_result_by_benchmark = defaultdict(TestResults)
+        for test_result in self.test_results:
+            if test_result.test_type == TestType.REPLAY_TEST and (test_result.id.test_module_path, test_result.id.test_function_name) in benchmark_key_set:
+                test_result_by_benchmark[(test_result.id.test_module_path, test_result.id.test_function_name)].add(test_result)
+        return test_result_by_benchmark
+
     def get_by_unique_invocation_loop_id(self, unique_invocation_loop_id: str) -> FunctionTestInvocation | None:
         try:
             return self.test_results[self.test_result_idx[unique_invocation_loop_id]]
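
A usage sketch of the new group_by_benchmark logic, with simplified stand-ins for TestType and the invocation records (the real ones live elsewhere in codeflash), showing how only replay tests whose key appears in the benchmark key set are grouped.

from collections import defaultdict
from dataclasses import dataclass
from enum import Enum

class TestType(Enum):  # simplified stand-in
    REPLAY_TEST = "replay"
    EXISTING_UNIT_TEST = "existing"

@dataclass
class Invocation:  # simplified stand-in for a test invocation record
    test_module_path: str
    test_function_name: str
    test_type: TestType
    runtime_ns: int

invocations = [
    Invocation("tests/test_sort__replay.py", "test_sort", TestType.REPLAY_TEST, 1_200_000),
    Invocation("tests/test_sort__replay.py", "test_sort", TestType.REPLAY_TEST, 1_500_000),
    Invocation("tests/test_other.py", "test_other", TestType.EXISTING_UNIT_TEST, 900_000),
]

benchmark_key_set = {("tests/test_sort__replay.py", "test_sort")}

grouped: dict[tuple[str, str], list[Invocation]] = defaultdict(list)
for inv in invocations:
    key = (inv.test_module_path, inv.test_function_name)
    if inv.test_type == TestType.REPLAY_TEST and key in benchmark_key_set:
        grouped[key].append(inv)

for key, invs in grouped.items():
    print(key, sum(i.runtime_ns for i in invs))  # total replay runtime per benchmark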
