chore: revert to classic regex + ripgrep search (#776)

kopekC · web-flow · commit 0d600cf62528 · 2025-03-07T20:01:07.000-05:00
# Motivation

<!-- Why is this change necessary? -->

# Content

<!-- Please include a summary of the change -->

# Testing

<!-- How was the change tested? -->

# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
@@ -112,111 +112,24 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:
 
 
 class SearchInput(BaseModel):
+    """Input for searching the codebase."""
+
     query: str = Field(
         ...,
-        description="""The text or pattern to search for in the codebase.
-
-        For simple text search (use_regex=False):
-        - Uses ripgrep's fixed-strings mode (--fixed-strings)
-        - Case-insensitive matching (--ignore-case)
-        - All characters are treated literally, including special regex characters
-        - Exact string matching (no regex interpretation)
-
-        For regex search (use_regex=True):
-        - Full regex pattern support
-        - Case-sensitive by default
-        - Special characters have regex meaning and need proper escaping
-        - Uses ripgrep's default regex mode
-
-        If no exact matches are found, automatically falls back to semantic search
-        to find relevant code even without exact text matches.""",
-    )
-
-    target_directories: Optional[list[str]] = Field(
-        default=None,
-        description="""Optional list of directories to limit the search scope.
-
-        - Paths should be relative to the workspace root
-        - Multiple directories are searched in parallel
-        - If None, searches the entire codebase
-
-        Example: ["src/frontend", "tests/unit"]""",
-    )
-
-    file_extensions: Optional[list[str]] = Field(
-        default=None,
-        description="""Optional list of file extensions to filter the search.
-
-        - Include the dot in extensions (e.g. ['.py', '.ts'])
-        - Multiple extensions are combined with OR logic
-        - If None, searches all file types
-        - Binary files are automatically excluded
-
-        Example: [".py", ".tsx", ".md"]""",
-    )
-
-    page: int = Field(
-        default=1,
-        description="""Page number for paginated results (1-based indexing).
-
-        - Use with files_per_page to navigate large result sets
-        - If page exceeds available pages, returns last available page
-        - Note: When falling back to semantic search, pagination is not supported
-
-        Example: page=2 with files_per_page=10 shows files 11-20""",
-    )
-
-    files_per_page: int = Field(
-        default=10,
-        description="""Number of files to show per page.
-
-        - Each file can contain multiple matching lines
-        - Reasonable values are between 5 and 50
-        - Larger values may impact performance
-        - When falling back to semantic search, this becomes the number of semantic results
-
-        Example: files_per_page=20 shows up to 20 files with matches""",
-    )
-
-    use_regex: bool = Field(
-        default=False,
-        description="""Whether to treat the query as a regex pattern.
-
-        - False (default): Simple text search, case-insensitive
-        - True: Full regex syntax, case-sensitive
-        - Invalid regex patterns will return an error
-        - Note: Semantic fallback is used regardless of this setting when no matches found
-
-        Example: Set to True to use patterns like "test_.*_func.*" """,
+        description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True. Ripgrep is the preferred method.",
     )
+    target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
+    file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
+    page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
+    files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
+    use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
 
 
 class SearchTool(BaseTool):
     """Tool for searching the codebase."""
 
     name: ClassVar[str] = "search"
-    description: ClassVar[str] = r"""Search the codebase using text search or regex pattern matching.
-
-    This tool provides powerful text-based search capabilities across your codebase,
-    with support for both simple text matching and regular expressions. It uses ripgrep
-    when available for high-performance searches.
-
-    If no exact matches are found, automatically falls back to semantic search to find
-    relevant code even without exact text matches.
-
-    Features:
-    - Plain text or regex pattern matching
-    - Directory and file type filtering
-    - Paginated results for large codebases
-    - Case-insensitive by default for simple text searches
-    - Semantic fallback for finding related code
-
-    Example queries:
-    1. Simple text: "function calculateTotal" (matches exactly, case-insensitive)
-    2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True)
-    3. File-specific: "TODO" with file_extensions=[".py", ".ts"]
-    4. Directory-specific: "api" with target_directories=["src/backend"]
-    """
+    description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
     args_schema: ClassVar[type[BaseModel]] = SearchInput
     codebase: Codebase = Field(exclude=True)
 
diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py
@@ -3,8 +3,6 @@
 This performs either a regex pattern match or simple text search across all files in the codebase.
 Each matching line will be returned with its line number.
 Results are paginated with a default of 10 files per page.
-
-If no exact matches are found, falls back to semantic search to find relevant code.
 """
 
 import os
@@ -17,7 +15,6 @@
 from codegen.sdk.core.codebase import Codebase
 
 from .observation import Observation
-from .semantic_search import SearchResult, semantic_search
 
 
 class SearchMatch(Observation):
@@ -128,7 +125,7 @@ def _search_with_ripgrep(
     This is faster than the Python implementation, especially for large codebases.
     """
     # Build ripgrep command
-    cmd = ["rg", "--line-number", "--with-filename"]
+    cmd = ["rg", "--line-number"]
 
     # Add case insensitivity if not using regex
     if not use_regex:
@@ -203,6 +200,8 @@ def _search_with_ripgrep(
                 match_text = query
                 if use_regex:
                     # For regex, we need to find what actually matched
+                    # This is a simplification - ideally we'd use ripgrep's --json option
+                    # to get the exact match positions
                     pattern = re.compile(query)
                     match_obj = pattern.search(content)
                     if match_obj:
@@ -227,20 +226,11 @@ def _search_with_ripgrep(
         # Convert to SearchFileResult objects
         file_results = []
         for filepath, matches in all_results.items():
-            # Sort matches by line number and deduplicate
-            unique_matches = []
-            seen = set()
-            for match in sorted(matches, key=lambda x: x.line_number):
-                key = (match.line_number, match.match)
-                if key not in seen:
-                    seen.add(key)
-                    unique_matches.append(match)
-
             file_results.append(
                 SearchFileResult(
                     status="success",
                     filepath=filepath,
-                    matches=unique_matches,
+                    matches=sorted(matches, key=lambda x: x.line_number),
                 )
             )
 
@@ -271,40 +261,120 @@ def _search_with_ripgrep(
         raise
 
 
-def _convert_semantic_to_search_results(semantic_results: list[SearchResult], query: str) -> list[SearchFileResult]:
-    """Convert semantic search results to regular search results format."""
-    file_results = []
-    for result in semantic_results:
-        file_results.append(
-            SearchFileResult(
-                status="success",
-                filepath=result.filepath,
-                matches=[
+def _search_with_python(
+    codebase: Codebase,
+    query: str,
+    target_directories: Optional[list[str]] = None,
+    file_extensions: Optional[list[str]] = None,
+    page: int = 1,
+    files_per_page: int = 10,
+    use_regex: bool = False,
+) -> SearchObservation:
+    """Search the codebase using Python's regex engine.
+
+    This is a fallback for when ripgrep is not available.
+    """
+    # Validate pagination parameters
+    if page < 1:
+        page = 1
+    if files_per_page < 1:
+        files_per_page = 10
+
+    # Prepare the search pattern
+    if use_regex:
+        try:
+            pattern = re.compile(query)
+        except re.error as e:
+            return SearchObservation(
+                status="error",
+                error=f"Invalid regex pattern: {e!s}",
+                query=query,
+                page=page,
+                total_pages=0,
+                total_files=0,
+                files_per_page=files_per_page,
+                results=[],
+            )
+    else:
+        # For non-regex searches, escape special characters and make case-insensitive
+        pattern = re.compile(re.escape(query), re.IGNORECASE)
+
+    # Handle file extensions
+    extensions = file_extensions if file_extensions is not None else "*"
+
+    all_results = []
+    for file in codebase.files(extensions=extensions):
+        # Skip if file doesn't match target directories
+        if target_directories and not any(file.filepath.startswith(d) for d in target_directories):
+            continue
+
+        # Skip binary files
+        try:
+            content = file.content
+        except ValueError:  # File is binary
+            continue
+
+        file_matches = []
+        # Split content into lines and store with line numbers (1-based)
+        lines = enumerate(content.splitlines(), 1)
+
+        # Search each line for the pattern
+        for line_number, line in lines:
+            match = pattern.search(line)
+            if match:
+                file_matches.append(
                     SearchMatch(
                         status="success",
-                        line_number=1,  # We don't have line numbers for semantic matches
-                        line=result.preview,
-                        match=query,
+                        line_number=line_number,
+                        line=line.strip(),
+                        match=match.group(0),
                     )
-                ],
+                )
+
+        if file_matches:
+            all_results.append(
+                SearchFileResult(
+                    status="success",
+                    filepath=file.filepath,
+                    matches=sorted(file_matches, key=lambda x: x.line_number),
+                )
             )
-        )
-    return file_results
+
+    # Sort all results by filepath
+    all_results.sort(key=lambda x: x.filepath)
+
+    # Calculate pagination
+    total_files = len(all_results)
+    total_pages = (total_files + files_per_page - 1) // files_per_page
+    start_idx = (page - 1) * files_per_page
+    end_idx = start_idx + files_per_page
+
+    # Get the current page of results
+    paginated_results = all_results[start_idx:end_idx]
+
+    return SearchObservation(
+        status="success",
+        query=query,
+        page=page,
+        total_pages=total_pages,
+        total_files=total_files,
+        files_per_page=files_per_page,
+        results=paginated_results,
+    )
 
 
 def search(
     codebase: Codebase,
     query: str,
     target_directories: Optional[list[str]] = None,
-    file_extensions: Optional[list[str] | str] = None,
+    file_extensions: Optional[list[str]] = None,
     page: int = 1,
     files_per_page: int = 10,
     use_regex: bool = False,
 ) -> SearchObservation:
     """Search the codebase using text search or regex pattern matching.
 
-    Uses ripgrep for performance when available. If no exact matches are found,
-    falls back to semantic search to find relevant code.
+    Uses ripgrep for performance when available, with fallback to Python's regex engine.
     If use_regex is True, performs a regex pattern match on each line.
     Otherwise, performs a case-insensitive text search.
     Returns matching lines with their line numbers, grouped by file.
@@ -323,52 +393,9 @@ def search(
     Returns:
         SearchObservation containing search results with matches and their sources
     """
+    # Try to use ripgrep first
     try:
-        # Try ripgrep first
-        result = _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
-
-        # If no results found, try semantic search
-        if not result.results:
-            semantic_results = semantic_search(codebase, query, k=files_per_page)
-            if semantic_results.status == "success" and semantic_results.results:
-                # Convert semantic results to regular search results format
-                file_results = _convert_semantic_to_search_results(semantic_results.results, query)
-
-                return SearchObservation(
-                    status="success",
-                    query=query,
-                    page=1,  # Semantic search doesn't support pagination yet
-                    total_pages=1,
-                    total_files=len(file_results),
-                    files_per_page=files_per_page,
-                    results=file_results,
-                )
-
-        return result
-
+        return _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
     except (FileNotFoundError, subprocess.SubprocessError):
-        # If ripgrep fails, try semantic search directly
-        semantic_results = semantic_search(codebase, query, k=files_per_page)
-        if semantic_results.status == "success":
-            file_results = _convert_semantic_to_search_results(semantic_results.results, query)
-
-            return SearchObservation(
-                status="success",
-                query=query,
-                page=1,
-                total_pages=1,
-                total_files=len(file_results),
-                files_per_page=files_per_page,
-                results=file_results,
-            )
-        else:
-            return SearchObservation(
-                status="error",
-                error=f"Both text search and semantic search failed: {semantic_results.error}",
-                query=query,
-                page=page,
-                total_pages=0,
-                total_files=0,
-                files_per_page=files_per_page,
-                results=[],
-            )
+        # Fall back to Python implementation if ripgrep fails or isn't available
+        return _search_with_python(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)