diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
index 8551b68e6..7cda6d7f9 100644
--- a/src/codegen/extensions/langchain/tools.py
+++ b/src/codegen/extensions/langchain/tools.py
@@ -112,111 +112,24 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:
 
 
 class SearchInput(BaseModel):
+    """Input for searching the codebase."""
+
     query: str = Field(
         ...,
-        description="""The text or pattern to search for in the codebase.
-
-        For simple text search (use_regex=False):
-        - Uses ripgrep's fixed-strings mode (--fixed-strings)
-        - Case-insensitive matching (--ignore-case)
-        - All characters are treated literally, including special regex characters
-        - Exact string matching (no regex interpretation)
-
-        For regex search (use_regex=True):
-        - Full regex pattern support
-        - Case-sensitive by default
-        - Special characters have regex meaning and need proper escaping
-        - Uses ripgrep's default regex mode
-
-        If no exact matches are found, automatically falls back to semantic search
-        to find relevant code even without exact text matches.""",
-    )
-
-    target_directories: Optional[list[str]] = Field(
-        default=None,
-        description="""Optional list of directories to limit the search scope.
-
-        - Paths should be relative to the workspace root
-        - Multiple directories are searched in parallel
-        - If None, searches the entire codebase
-
-        Example: ["src/frontend", "tests/unit"]""",
-    )
-
-    file_extensions: Optional[list[str]] = Field(
-        default=None,
-        description="""Optional list of file extensions to filter the search.
-
-        - Include the dot in extensions (e.g. ['.py', '.ts'])
-        - Multiple extensions are combined with OR logic
-        - If None, searches all file types
-        - Binary files are automatically excluded
-
-        Example: [".py", ".tsx", ".md"]""",
-    )
-
-    page: int = Field(
-        default=1,
-        description="""Page number for paginated results (1-based indexing).
-
-        - Use with files_per_page to navigate large result sets
-        - If page exceeds available pages, returns last available page
-        - Note: When falling back to semantic search, pagination is not supported
-
-        Example: page=2 with files_per_page=10 shows files 11-20""",
-    )
-
-    files_per_page: int = Field(
-        default=10,
-        description="""Number of files to show per page.
-
-        - Each file can contain multiple matching lines
-        - Reasonable values are between 5 and 50
-        - Larger values may impact performance
-        - When falling back to semantic search, this becomes the number of semantic results
-
-        Example: files_per_page=20 shows up to 20 files with matches""",
-    )
-
-    use_regex: bool = Field(
-        default=False,
-        description="""Whether to treat the query as a regex pattern.
-
-        - False (default): Simple text search, case-insensitive
-        - True: Full regex syntax, case-sensitive
-        - Invalid regex patterns will return an error
-        - Note: Semantic fallback is used regardless of this setting when no matches found
-
-        Example: Set to True to use patterns like "test_.*_func.*" """,
+        description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True. Ripgrep is the preferred method.",
     )
+    target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
+    file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
+    page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
+    files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
+    use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
 
 
 class SearchTool(BaseTool):
     """Tool for searching the codebase."""
 
     name: ClassVar[str] = "search"
-    description: ClassVar[str] = r"""Search the codebase using text search or regex pattern matching.
-
-    This tool provides powerful text-based search capabilities across your codebase,
-    with support for both simple text matching and regular expressions. It uses ripgrep
-    when available for high-performance searches.
-
-    If no exact matches are found, automatically falls back to semantic search to find
-    relevant code even without exact text matches.
-
-    Features:
-    - Plain text or regex pattern matching
-    - Directory and file type filtering
-    - Paginated results for large codebases
-    - Case-insensitive by default for simple text searches
-    - Semantic fallback for finding related code
-
-    Example queries:
-    1. Simple text: "function calculateTotal" (matches exactly, case-insensitive)
-    2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True)
-    3. File-specific: "TODO" with file_extensions=[".py", ".ts"]
-    4. Directory-specific: "api" with target_directories=["src/backend"]
-    """
+    description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
     args_schema: ClassVar[type[BaseModel]] = SearchInput
     codebase: Codebase = Field(exclude=True)
 
diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py
index 8083e7db8..4bcdfb74e 100644
--- a/src/codegen/extensions/tools/search.py
+++ b/src/codegen/extensions/tools/search.py
@@ -3,8 +3,6 @@
 This performs either a regex pattern match or simple text search across all files in the codebase.
 Each matching line will be returned with its line number.
 Results are paginated with a default of 10 files per page.
-
-If no exact matches are found, falls back to semantic search to find relevant code.
 """
 
 import os
@@ -17,7 +15,6 @@
 from codegen.sdk.core.codebase import Codebase
 
 from .observation import Observation
-from .semantic_search import SearchResult, semantic_search
 
 
 class SearchMatch(Observation):
@@ -128,7 +125,7 @@ def _search_with_ripgrep(
     This is faster than the Python implementation, especially for large codebases.
     """
     # Build ripgrep command
-    cmd = ["rg", "--line-number", "--with-filename"]
+    cmd = ["rg", "--line-number"]
 
     # Add case insensitivity if not using regex
     if not use_regex:
@@ -203,6 +200,8 @@ def _search_with_ripgrep(
                 match_text = query
                 if use_regex:
                     # For regex, we need to find what actually matched
+                    # This is a simplification - ideally we'd use ripgrep's --json option
+                    # to get the exact match positions
                     pattern = re.compile(query)
                     match_obj = pattern.search(content)
                     if match_obj:
@@ -227,20 +226,11 @@ def _search_with_ripgrep(
         # Convert to SearchFileResult objects
         file_results = []
         for filepath, matches in all_results.items():
-            # Sort matches by line number and deduplicate
-            unique_matches = []
-            seen = set()
-            for match in sorted(matches, key=lambda x: x.line_number):
-                key = (match.line_number, match.match)
-                if key not in seen:
-                    seen.add(key)
-                    unique_matches.append(match)
-
             file_results.append(
                 SearchFileResult(
                     status="success",
                     filepath=filepath,
-                    matches=unique_matches,
+                    matches=sorted(matches, key=lambda x: x.line_number),
                 )
             )
 
@@ -271,40 +261,145 @@ def _search_with_ripgrep(
         raise
 
 
-def _convert_semantic_to_search_results(semantic_results: list[SearchResult], query: str) -> list[SearchFileResult]:
-    """Convert semantic search results to regular search results format."""
-    file_results = []
-    for result in semantic_results:
-        file_results.append(
-            SearchFileResult(
-                status="success",
-                filepath=result.filepath,
-                matches=[
+def _is_under_directory(filepath: str, directory: str) -> bool:
+    """Check whether filepath is directory itself or is located beneath it.
+
+    Whole path components are compared, so "src/api" does not match
+    "src/api_v2/x.py". An empty directory string matches every path.
+    """
+    if not directory:
+        return True
+    base = directory.rstrip("/")
+    return filepath == base or filepath.startswith(base + "/")
+
+
+def _search_with_python(
+    codebase: Codebase,
+    query: str,
+    target_directories: Optional[list[str]] = None,
+    file_extensions: Optional[list[str]] = None,
+    page: int = 1,
+    files_per_page: int = 10,
+    use_regex: bool = False,
+) -> SearchObservation:
+    """Search the codebase using Python's regex engine.
+
+    This is a fallback for when ripgrep is not available.
+
+    Args:
+        codebase: The codebase whose files are scanned line by line
+        query: The text or regex pattern to search for
+        target_directories: Optional list of directories (or file paths) to restrict the search to
+        file_extensions: Optional list of file extensions to search; None requests every extension ("*")
+        page: Page number to return (1-based); values below 1 are treated as 1
+        files_per_page: Number of files per page; values below 1 are treated as 10
+        use_regex: Whether to treat query as a regex pattern; otherwise it is matched literally and case-insensitively
+
+    Returns:
+        SearchObservation with the requested page of matches grouped by file, or with
+        status="error" if use_regex is True and query is not a valid regex
+    """
+    # Validate pagination parameters
+    if page < 1:
+        page = 1
+    if files_per_page < 1:
+        files_per_page = 10
+
+    # Prepare the search pattern
+    if use_regex:
+        try:
+            pattern = re.compile(query)
+        except re.error as e:
+            return SearchObservation(
+                status="error",
+                error=f"Invalid regex pattern: {e!s}",
+                query=query,
+                page=page,
+                total_pages=0,
+                total_files=0,
+                files_per_page=files_per_page,
+                results=[],
+            )
+    else:
+        # For non-regex searches, escape special characters and make case-insensitive
+        pattern = re.compile(re.escape(query), re.IGNORECASE)
+
+    # Handle file extensions
+    extensions = file_extensions if file_extensions is not None else "*"
+
+    all_results = []
+    for file in codebase.files(extensions=extensions):
+        # Skip files outside the target directories (matched on whole path components)
+        if target_directories and not any(_is_under_directory(file.filepath, d) for d in target_directories):
+            continue
+
+        # Skip binary files
+        try:
+            content = file.content
+        except ValueError:  # File is binary
+            continue
+
+        file_matches = []
+        # Split content into lines and store with line numbers (1-based)
+        lines = enumerate(content.splitlines(), 1)
+
+        # Search each line for the pattern
+        for line_number, line in lines:
+            match = pattern.search(line)
+            if match:
+                file_matches.append(
                     SearchMatch(
                         status="success",
-                        line_number=1,  # We don't have line numbers for semantic matches
-                        line=result.preview,
-                        match=query,
+                        line_number=line_number,
+                        line=line.strip(),
+                        match=match.group(0),
                     )
-                ],
+                )
+
+        if file_matches:
+            all_results.append(
+                SearchFileResult(
+                    status="success",
+                    filepath=file.filepath,
+                    matches=sorted(file_matches, key=lambda x: x.line_number),
+                )
             )
-        )
-    return file_results
+
+    # Sort all results by filepath
+    all_results.sort(key=lambda x: x.filepath)
+
+    # Calculate pagination
+    total_files = len(all_results)
+    total_pages = (total_files + files_per_page - 1) // files_per_page
+    start_idx = (page - 1) * files_per_page
+    end_idx = start_idx + files_per_page
+
+    # Get the current page of results
+    paginated_results = all_results[start_idx:end_idx]
+
+    return SearchObservation(
+        status="success",
+        query=query,
+        page=page,
+        total_pages=total_pages,
+        total_files=total_files,
+        files_per_page=files_per_page,
+        results=paginated_results,
+    )
 
 
 def search(
     codebase: Codebase,
     query: str,
     target_directories: Optional[list[str]] = None,
-    file_extensions: Optional[list[str] | str] = None,
+    file_extensions: Optional[list[str]] = None,
     page: int = 1,
     files_per_page: int = 10,
     use_regex: bool = False,
 ) -> SearchObservation:
     """Search the codebase using text search or regex pattern matching.
 
-    Uses ripgrep for performance when available. If no exact matches are found,
-    falls back to semantic search to find relevant code.
+    Uses ripgrep for performance when available, with fallback to Python's regex engine.
     If use_regex is True, performs a regex pattern match on each line.
     Otherwise, performs a case-insensitive text search.
     Returns matching lines with their line numbers, grouped by file.
@@ -323,52 +418,9 @@ def search(
     Returns:
         SearchObservation containing search results with matches and their sources
     """
+    # Try to use ripgrep first
     try:
-        # Try ripgrep first
-        result = _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
-
-        # If no results found, try semantic search
-        if not result.results:
-            semantic_results = semantic_search(codebase, query, k=files_per_page)
-            if semantic_results.status == "success" and semantic_results.results:
-                # Convert semantic results to regular search results format
-                file_results = _convert_semantic_to_search_results(semantic_results.results, query)
-
-                return SearchObservation(
-                    status="success",
-                    query=query,
-                    page=1,  # Semantic search doesn't support pagination yet
-                    total_pages=1,
-                    total_files=len(file_results),
-                    files_per_page=files_per_page,
-                    results=file_results,
-                )
-
-        return result
-
+        return _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
     except (FileNotFoundError, subprocess.SubprocessError):
-        # If ripgrep fails, try semantic search directly
-        semantic_results = semantic_search(codebase, query, k=files_per_page)
-        if semantic_results.status == "success":
-            file_results = _convert_semantic_to_search_results(semantic_results.results, query)
-
-            return SearchObservation(
-                status="success",
-                query=query,
-                page=1,
-                total_pages=1,
-                total_files=len(file_results),
-                files_per_page=files_per_page,
-                results=file_results,
-            )
-        else:
-            return SearchObservation(
-                status="error",
-                error=f"Both text search and semantic search failed: {semantic_results.error}",
-                query=query,
-                page=page,
-                total_pages=0,
-                total_files=0,
-                files_per_page=files_per_page,
-                results=[],
-            )
+        # Fall back to Python implementation if ripgrep fails or isn't available
+        return _search_with_python(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)