parallelize test discovery

KRRT7 · KRRT7 · commit 6867932351a7 · 2025-06-05T21:08:25.000-07:00
diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import hashlib
+import multiprocessing
 import os
 import pickle
 import re
@@ -15,16 +16,9 @@
 
 import pytest
 from pydantic.dataclasses import dataclass
-from rich.panel import Panel
-from rich.text import Text
 
 from codeflash.cli_cmds.console import console, logger, test_files_progress_bar
-from codeflash.code_utils.code_utils import (
-    ImportErrorPattern,
-    custom_addopts,
-    get_run_tmp_file,
-    module_name_from_file_path,
-)
+from codeflash.code_utils.code_utils import custom_addopts, get_run_tmp_file, module_name_from_file_path
 from codeflash.code_utils.compat import SAFE_SYS_EXECUTABLE, codeflash_cache_db
 from codeflash.models.models import CodePosition, FunctionCalledInTest, TestsInFile, TestType
 
@@ -139,7 +133,7 @@ def close(self) -> None:
 
 def discover_unit_tests(
     cfg: TestConfig, discover_only_these_tests: list[Path] | None = None
-) -> dict[str, list[FunctionCalledInTest]]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], int]:
     framework_strategies: dict[str, Callable] = {"pytest": discover_tests_pytest, "unittest": discover_tests_unittest}
     strategy = framework_strategies.get(cfg.test_framework, None)
     if not strategy:
@@ -151,7 +145,7 @@ def discover_unit_tests(
 
 def discover_tests_pytest(
     cfg: TestConfig, discover_only_these_tests: list[Path] | None = None
-) -> dict[Path, list[FunctionCalledInTest]]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], int]:
     tests_root = cfg.tests_root
     project_root = cfg.project_root_path
 
@@ -187,10 +181,6 @@ def discover_tests_pytest(
             logger.warning(
                 f"Failed to collect tests. Pytest Exit code: {exitcode}={pytest.ExitCode(exitcode).name}\n {error_section}"
             )
-            if "ModuleNotFoundError" in result.stdout:
-                match = ImportErrorPattern.search(result.stdout).group()
-                panel = Panel(Text.from_markup(f"⚠️  {match} ", style="bold red"), expand=False)
-                console.print(panel)
 
         elif 0 <= exitcode <= 5:
             logger.warning(f"Failed to collect tests. Pytest Exit code: {exitcode}={pytest.ExitCode(exitcode).name}")
@@ -225,7 +215,7 @@ def discover_tests_pytest(
 
 def discover_tests_unittest(
     cfg: TestConfig, discover_only_these_tests: list[str] | None = None
-) -> dict[Path, list[FunctionCalledInTest]]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], int]:
     tests_root: Path = cfg.tests_root
     loader: unittest.TestLoader = unittest.TestLoader()
     tests: unittest.TestSuite = loader.discover(str(tests_root))
@@ -290,27 +280,39 @@ def discover_parameters_unittest(function_name: str) -> tuple[bool, str, str | N
 
 def _process_single_test_file(
     test_file: Path, functions: list[TestsInFile], project_root_path: Path, test_framework: str
-) -> tuple[str, list[tuple[str, FunctionCalledInTest]]]:
+) -> tuple[str, list[tuple[str, FunctionCalledInTest]], int, list[dict]]:
     import jedi
 
     jedi_project = jedi.Project(path=project_root_path)
     goto_cache = {}
     results = []
+    cache_entries = []
 
     try:
         script = jedi.Script(path=test_file, project=jedi_project)
         test_functions = set()
 
         all_names = script.get_names(all_scopes=True, references=True)
-        all_defs = script.get_names(all_scopes=True, definitions=True)
-        all_names_top = script.get_names(all_scopes=True)
-
-        top_level_functions = {name.name: name for name in all_names_top if name.type == "function"}
-        top_level_classes = {name.name: name for name in all_names_top if name.type == "class"}
+        top_level_functions = {}
+        top_level_classes = {}
+        all_defs = []
+        reference_names = []
+
+        for name in all_names:
+            if name.type == "function":
+                top_level_functions[name.name] = name
+                if hasattr(name, "full_name") and name.full_name:
+                    all_defs.append(name)
+            elif name.type == "class":
+                top_level_classes[name.name] = name
+
+            if name.full_name is not None:
+                m = FUNCTION_NAME_REGEX.search(name.full_name)
+                if m:
+                    reference_names.append((name, m.group(1)))
     except Exception as e:
         logger.debug(f"Failed to get jedi script for {test_file}: {e}")
-        # tests_cache.close()
-        return str(test_file), results
+        return str(test_file), results, len(results), cache_entries
 
     if test_framework == "pytest":
         for function in functions:
@@ -340,11 +342,8 @@ def _process_single_test_file(
         matching_names = test_suites & top_level_classes.keys()
         for matched_name in matching_names:
             for def_name in all_defs:
-                if (
-                    def_name.type == "function"
-                    and def_name.full_name is not None
-                    and f".{matched_name}." in def_name.full_name
-                ):
+                # all_defs already contains only functions, no need to check type
+                if def_name.full_name is not None and f".{matched_name}." in def_name.full_name:
                     for function in functions_to_search:
                         (is_parameterized, new_function, parameters) = discover_parameters_unittest(function)
 
@@ -374,14 +373,7 @@ def _process_single_test_file(
     for i, func_name in enumerate(test_functions_raw):
         test_functions_by_name[func_name].append(i)
 
-    for name in all_names:
-        if name.full_name is None:
-            continue
-        m = FUNCTION_NAME_REGEX.search(name.full_name)
-        if not m:
-            continue
-
-        scope = m.group(1)
+    for name, scope in reference_names:
         if scope not in test_functions_by_name:
             continue
 
@@ -432,28 +424,73 @@ def _process_single_test_file(
                 )
                 results.append((qualified_name_with_modules_from_root, function_called_in_test))
 
-    return str(test_file), results
+                cache_entries.append(
+                    {
+                        "qualified_name_with_modules_from_root": qualified_name_with_modules_from_root,
+                        "function_name": scope,
+                        "test_class": scope_test_class,
+                        "test_function": scope_test_function,
+                        "test_type": test_type,
+                        "line_number": name.line,
+                        "col_number": name.column,
+                    }
+                )
+
+    return str(test_file), results, len(results), cache_entries
 
 
 def process_test_files(
     file_to_test_map: dict[Path, list[TestsInFile]], cfg: TestConfig
-) -> dict[str, list[FunctionCalledInTest]]:
+) -> tuple[dict[str, list[FunctionCalledInTest]], int]:
     project_root_path = cfg.project_root_path
     test_framework = cfg.test_framework
     function_to_test_map = defaultdict(set)
+    total_count = 0
 
-    import multiprocessing
+    tests_cache = TestsCache()
 
-    max_workers = min(len(file_to_test_map), multiprocessing.cpu_count())
-    max_workers = max(1, max_workers)
+    max_workers = min(len(file_to_test_map) or 1, multiprocessing.cpu_count())
 
     with test_files_progress_bar(total=len(file_to_test_map), description="Processing test files") as (
         progress,
         task_id,
     ):
-        if len(file_to_test_map) == 1 or max_workers == 1:
-            for test_file, functions in file_to_test_map.items():
-                _, results = _process_single_test_file(test_file, functions, project_root_path, test_framework)
+        cached_files = {}
+        uncached_files = {}
+
+        for test_file, functions in file_to_test_map.items():
+            file_hash = TestsCache.compute_file_hash(str(test_file))
+            cached_tests = tests_cache.get_tests_for_file(str(test_file), file_hash)
+
+            if cached_tests:
+                cached_files[test_file] = (functions, cached_tests, file_hash)
+            else:
+                uncached_files[test_file] = functions
+
+        # Process cached files first
+        for test_file, (_functions, cached_tests, file_hash) in cached_files.items():
+            cur = tests_cache.cur
+            cur.execute(
+                "SELECT qualified_name_with_modules_from_root FROM discovered_tests WHERE file_path = ? AND file_hash = ?",
+                (str(test_file), file_hash),
+            )
+            qualified_names = [row[0] for row in cur.fetchall()]
+            for cached_test, qualified_name in zip(cached_tests, qualified_names):
+                function_to_test_map[qualified_name].add(cached_test)
+            total_count += len(cached_tests)
+            progress.advance(task_id)
+
+        if len(uncached_files) == 1 or max_workers == 1:
+            for test_file, functions in uncached_files.items():
+                _, results, count, cache_entries = _process_single_test_file(
+                    test_file, functions, project_root_path, test_framework
+                )
+                total_count += count
+
+                file_hash = TestsCache.compute_file_hash(str(test_file))
+                for cache_entry in cache_entries:
+                    tests_cache.insert_test(file_path=str(test_file), file_hash=file_hash, **cache_entry)
+
                 for qualified_name, function_called in results:
                     function_to_test_map[qualified_name].add(function_called)
                 progress.advance(task_id)
@@ -463,12 +500,19 @@ def process_test_files(
                     executor.submit(
                         _process_single_test_file, test_file, functions, project_root_path, test_framework
                     ): test_file
-                    for test_file, functions in file_to_test_map.items()
+                    for test_file, functions in uncached_files.items()
                 }
 
                 for future in as_completed(future_to_file):
                     try:
-                        _, results = future.result()
+                        _, results, count, cache_entries = future.result()
+                        total_count += count
+
+                        test_file = future_to_file[future]
+                        file_hash = TestsCache.compute_file_hash(str(test_file))
+                        for cache_entry in cache_entries:
+                            tests_cache.insert_test(file_path=str(test_file), file_hash=file_hash, **cache_entry)
+
                         for qualified_name, function_called in results:
                             function_to_test_map[qualified_name].add(function_called)
                         progress.advance(task_id)
@@ -477,4 +521,6 @@ def process_test_files(
                         logger.error(f"Error processing test file {test_file}: {e}")
                         progress.advance(task_id)
 
-    return {function: list(tests) for function, tests in function_to_test_map.items()}
+    tests_cache.close()
+    function_to_tests_dict = {function: list(tests) for function, tests in function_to_test_map.items()}
+    return function_to_tests_dict, total_count