Skip to content

Commit d703b13

Browse files
committed
Replay tests are now grouped by benchmark file; each benchmark test file will create one replay test file.
1 parent 9764c25 commit d703b13

File tree

4 files changed

+86
-58
lines changed

4 files changed

+86
-58
lines changed

code_to_optimize/tests/pytest/benchmarks_test/test_benchmark_bubble_sort.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ def test_sort2():
1515
def test_class_sort(benchmark):
1616
obj = Sorter(list(reversed(range(100))))
1717
result1 = benchmark(obj.sorter, 2)
18+
19+
def test_class_sort2(benchmark):
1820
result2 = benchmark(Sorter.sort_class, list(reversed(range(100))))
21+
22+
def test_class_sort3(benchmark):
1923
result3 = benchmark(Sorter.sort_static, list(reversed(range(100))))
24+
25+
def test_class_sort4(benchmark):
2026
result4 = benchmark(Sorter, [1,2,3])

codeflash/benchmarking/replay_test.py

Lines changed: 47 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,21 @@
1616

1717

1818
def get_next_arg_and_return(
19-
trace_file: str, function_name: str, file_path: str, class_name: str | None = None, num_to_get: int = 256
19+
trace_file: str, benchmark_function_name:str, function_name: str, file_path: str, class_name: str | None = None, num_to_get: int = 256
2020
) -> Generator[Any]:
2121
db = sqlite3.connect(trace_file)
2222
cur = db.cursor()
2323
limit = num_to_get
2424

2525
if class_name is not None:
2626
cursor = cur.execute(
27-
"SELECT * FROM benchmark_function_timings WHERE function_name = ? AND file_path = ? AND class_name = ? LIMIT ?",
28-
(function_name, file_path, class_name, limit),
27+
"SELECT * FROM benchmark_function_timings WHERE benchmark_function_name = ? AND function_name = ? AND file_path = ? AND class_name = ? LIMIT ?",
28+
(benchmark_function_name, function_name, file_path, class_name, limit),
2929
)
3030
else:
3131
cursor = cur.execute(
32-
"SELECT * FROM benchmark_function_timings WHERE function_name = ? AND file_path = ? AND class_name = '' LIMIT ?",
33-
(function_name, file_path, limit),
32+
"SELECT * FROM benchmark_function_timings WHERE benchmark_function_name = ? AND function_name = ? AND file_path = ? AND class_name = '' LIMIT ?",
33+
(benchmark_function_name, function_name, file_path, limit),
3434
)
3535

3636
while (val := cursor.fetchone()) is not None:
@@ -61,6 +61,7 @@ def create_trace_replay_test_code(
6161
"""
6262
assert test_framework in ["pytest", "unittest"]
6363

64+
# Create Imports
6465
imports = f"""import dill as pickle
6566
{"import unittest" if test_framework == "unittest" else ""}
6667
from codeflash.benchmarking.replay_test import get_next_arg_and_return
@@ -82,16 +83,15 @@ def create_trace_replay_test_code(
8283

8384
imports += "\n".join(function_imports)
8485

85-
functions_to_optimize = [func.get("function_name") for func in functions_data
86-
if func.get("function_name") != "__init__"]
86+
functions_to_optimize = sorted({func.get("function_name") for func in functions_data
87+
if func.get("function_name") != "__init__"})
8788
metadata = f"""functions = {functions_to_optimize}
8889
trace_file_path = r"{trace_file}"
8990
"""
90-
9191
# Templates for different types of tests
9292
test_function_body = textwrap.dedent(
9393
"""\
94-
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, function_name="{orig_function_name}", file_path=r"{file_path}", num_to_get={max_run_count}):
94+
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, benchmark_function_name="{benchmark_function_name}", function_name="{orig_function_name}", file_path=r"{file_path}", num_to_get={max_run_count}):
9595
args = pickle.loads(args_pkl)
9696
kwargs = pickle.loads(kwargs_pkl)
9797
ret = {function_name}(*args, **kwargs)
@@ -100,7 +100,7 @@ def create_trace_replay_test_code(
100100

101101
test_method_body = textwrap.dedent(
102102
"""\
103-
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, function_name="{orig_function_name}", file_path=r"{file_path}", class_name="{class_name}", num_to_get={max_run_count}):
103+
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, benchmark_function_name="{benchmark_function_name}", function_name="{orig_function_name}", file_path=r"{file_path}", class_name="{class_name}", num_to_get={max_run_count}):
104104
args = pickle.loads(args_pkl)
105105
kwargs = pickle.loads(kwargs_pkl){filter_variables}
106106
function_name = "{orig_function_name}"
@@ -115,7 +115,7 @@ def create_trace_replay_test_code(
115115

116116
test_class_method_body = textwrap.dedent(
117117
"""\
118-
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, function_name="{orig_function_name}", file_path=r"{file_path}", class_name="{class_name}", num_to_get={max_run_count}):
118+
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, benchmark_function_name="{benchmark_function_name}", function_name="{orig_function_name}", file_path=r"{file_path}", class_name="{class_name}", num_to_get={max_run_count}):
119119
args = pickle.loads(args_pkl)
120120
kwargs = pickle.loads(kwargs_pkl){filter_variables}
121121
if not args:
@@ -125,13 +125,15 @@ def create_trace_replay_test_code(
125125
)
126126
test_static_method_body = textwrap.dedent(
127127
"""\
128-
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, function_name="{orig_function_name}", file_path=r"{file_path}", class_name="{class_name}", num_to_get={max_run_count}):
128+
for args_pkl, kwargs_pkl in get_next_arg_and_return(trace_file=trace_file_path, benchmark_function_name="{benchmark_function_name}", function_name="{orig_function_name}", file_path=r"{file_path}", class_name="{class_name}", num_to_get={max_run_count}):
129129
args = pickle.loads(args_pkl)
130130
kwargs = pickle.loads(kwargs_pkl){filter_variables}
131131
ret = {class_name_alias}{method_name}(*args, **kwargs)
132132
"""
133133
)
134134

135+
# Create main body
136+
135137
if test_framework == "unittest":
136138
self = "self"
137139
test_template = "\nclass TestTracedFunctions(unittest.TestCase):\n"
@@ -140,17 +142,20 @@ def create_trace_replay_test_code(
140142
self = ""
141143

142144
for func in functions_data:
145+
143146
module_name = func.get("module_name")
144147
function_name = func.get("function_name")
145148
class_name = func.get("class_name")
146149
file_path = func.get("file_path")
150+
benchmark_function_name = func.get("benchmark_function_name")
147151
function_properties = func.get("function_properties")
148152
if not class_name:
149153
alias = get_function_alias(module_name, function_name)
150154
test_body = test_function_body.format(
155+
benchmark_function_name=benchmark_function_name,
156+
orig_function_name=function_name,
151157
function_name=alias,
152158
file_path=file_path,
153-
orig_function_name=function_name,
154159
max_run_count=max_run_count,
155160
)
156161
else:
@@ -162,6 +167,7 @@ def create_trace_replay_test_code(
162167
method_name = "." + function_name if function_name != "__init__" else ""
163168
if function_properties.is_classmethod:
164169
test_body = test_class_method_body.format(
170+
benchmark_function_name=benchmark_function_name,
165171
orig_function_name=function_name,
166172
file_path=file_path,
167173
class_name_alias=class_name_alias,
@@ -172,6 +178,7 @@ def create_trace_replay_test_code(
172178
)
173179
elif function_properties.is_staticmethod:
174180
test_body = test_static_method_body.format(
181+
benchmark_function_name=benchmark_function_name,
175182
orig_function_name=function_name,
176183
file_path=file_path,
177184
class_name_alias=class_name_alias,
@@ -182,6 +189,7 @@ def create_trace_replay_test_code(
182189
)
183190
else:
184191
test_body = test_method_body.format(
192+
benchmark_function_name=benchmark_function_name,
185193
orig_function_name=function_name,
186194
file_path=file_path,
187195
class_name_alias=class_name_alias,
@@ -217,25 +225,25 @@ def generate_replay_test(trace_file_path: Path, output_dir: Path, test_framework
217225
conn = sqlite3.connect(trace_file_path.as_posix())
218226
cursor = conn.cursor()
219227

220-
# Get distinct benchmark names
228+
# Get distinct benchmark file paths
221229
cursor.execute(
222-
"SELECT DISTINCT benchmark_function_name, benchmark_file_path FROM benchmark_function_timings"
230+
"SELECT DISTINCT benchmark_file_path FROM benchmark_function_timings"
223231
)
224-
benchmarks = cursor.fetchall()
232+
benchmark_files = cursor.fetchall()
225233

226-
# Generate a test for each benchmark
227-
for benchmark in benchmarks:
228-
benchmark_function_name, benchmark_file_path = benchmark
229-
# Get functions associated with this benchmark
234+
# Generate a test for each benchmark file
235+
for benchmark_file in benchmark_files:
236+
benchmark_file_path = benchmark_file[0]
237+
# Get all benchmarks and functions associated with this file path
230238
cursor.execute(
231-
"SELECT DISTINCT function_name, class_name, module_name, file_path, benchmark_line_number FROM benchmark_function_timings "
232-
"WHERE benchmark_function_name = ? AND benchmark_file_path = ?",
233-
(benchmark_function_name, benchmark_file_path)
239+
"SELECT DISTINCT benchmark_function_name, function_name, class_name, module_name, file_path, benchmark_line_number FROM benchmark_function_timings "
240+
"WHERE benchmark_file_path = ?",
241+
(benchmark_file_path,)
234242
)
235243

236244
functions_data = []
237-
for func_row in cursor.fetchall():
238-
function_name, class_name, module_name, file_path, benchmark_line_number = func_row
245+
for row in cursor.fetchall():
246+
benchmark_function_name, function_name, class_name, module_name, file_path, benchmark_line_number = row
239247
# Add this function to our list
240248
functions_data.append({
241249
"function_name": function_name,
@@ -246,16 +254,15 @@ def generate_replay_test(trace_file_path: Path, output_dir: Path, test_framework
246254
"benchmark_file_path": benchmark_file_path,
247255
"benchmark_line_number": benchmark_line_number,
248256
"function_properties": inspect_top_level_functions_or_methods(
249-
file_name=Path(file_path),
250-
function_or_method_name=function_name,
251-
class_name=class_name,
252-
)
257+
file_name=Path(file_path),
258+
function_or_method_name=function_name,
259+
class_name=class_name,
260+
)
253261
})
254262

255263
if not functions_data:
256-
logger.info(f"No functions found for benchmark {benchmark_function_name} in {benchmark_file_path}")
264+
logger.info(f"No benchmark test functions found in {benchmark_file_path}")
257265
continue
258-
259266
# Generate the test code for this benchmark
260267
test_code = create_trace_replay_test_code(
261268
trace_file=trace_file_path.as_posix(),
@@ -265,17 +272,15 @@ def generate_replay_test(trace_file_path: Path, output_dir: Path, test_framework
265272
)
266273
test_code = isort.code(test_code)
267274

268-
# Write to file if requested
269-
if output_dir:
270-
name = Path(benchmark_file_path).name.split(".")[0][5:] # remove "test_" from the name since we add it in later
271-
output_file = get_test_file_path(
272-
test_dir=Path(output_dir), function_name=f"{name}_{benchmark_function_name}", test_type="replay"
273-
)
274-
# Write test code to file, parents = true
275-
output_dir.mkdir(parents=True, exist_ok=True)
276-
output_file.write_text(test_code, "utf-8")
277-
count += 1
278-
logger.info(f"Replay test for benchmark `{benchmark_function_name}` in {name} written to {output_file}")
275+
name = Path(benchmark_file_path).name.split(".")[0][5:] # remove "test_" from the name since we add it in later
276+
output_file = get_test_file_path(
277+
test_dir=Path(output_dir), function_name=f"{name}", test_type="replay"
278+
)
279+
# Write test code to file, parents = true
280+
output_dir.mkdir(parents=True, exist_ok=True)
281+
output_file.write_text(test_code, "utf-8")
282+
count += 1
283+
logger.info(f"Replay test for benchmark file `{benchmark_file_path}` in {name} written to {output_file}")
279284

280285
conn.close()
281286

codeflash/models/models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ def group_by_benchmarks(self, benchmark_keys:list[BenchmarkKey], benchmark_repla
483483
test_results_by_benchmark = defaultdict(TestResults)
484484
benchmark_module_path = {}
485485
for benchmark_key in benchmark_keys:
486-
benchmark_module_path[benchmark_key] = module_name_from_file_path(benchmark_replay_test_dir.resolve() / f"test_{Path(benchmark_key.file_path).name.split('.')[0][5:]}_{benchmark_key.function_name}__replay_test_", project_root)
486+
benchmark_module_path[benchmark_key] = module_name_from_file_path(benchmark_replay_test_dir.resolve() / f"test_{Path(benchmark_key.file_path).name.split('.')[0][5:]}__replay_test_", project_root)
487487
for test_result in self.test_results:
488488
if (test_result.test_type == TestType.REPLAY_TEST):
489489
for benchmark_key, module_path in benchmark_module_path.items():

0 commit comments

Comments
 (0)