Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 9 additions & 96 deletions src/codegen/extensions/langchain/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,111 +112,24 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:


class SearchInput(BaseModel):
"""Input for searching the codebase."""

query: str = Field(
...,
description="""The text or pattern to search for in the codebase.

For simple text search (use_regex=False):
- Uses ripgrep's fixed-strings mode (--fixed-strings)
- Case-insensitive matching (--ignore-case)
- All characters are treated literally, including special regex characters
- Exact string matching (no regex interpretation)

For regex search (use_regex=True):
- Full regex pattern support
- Case-sensitive by default
- Special characters have regex meaning and need proper escaping
- Uses ripgrep's default regex mode

If no exact matches are found, automatically falls back to semantic search
to find relevant code even without exact text matches.""",
)

target_directories: Optional[list[str]] = Field(
default=None,
description="""Optional list of directories to limit the search scope.

- Paths should be relative to the workspace root
- Multiple directories are searched in parallel
- If None, searches the entire codebase

Example: ["src/frontend", "tests/unit"]""",
)

file_extensions: Optional[list[str]] = Field(
default=None,
description="""Optional list of file extensions to filter the search.

- Include the dot in extensions (e.g. ['.py', '.ts'])
- Multiple extensions are combined with OR logic
- If None, searches all file types
- Binary files are automatically excluded

Example: [".py", ".tsx", ".md"]""",
)

page: int = Field(
default=1,
description="""Page number for paginated results (1-based indexing).

- Use with files_per_page to navigate large result sets
- If page exceeds available pages, returns last available page
- Note: When falling back to semantic search, pagination is not supported

Example: page=2 with files_per_page=10 shows files 11-20""",
)

files_per_page: int = Field(
default=10,
description="""Number of files to show per page.

- Each file can contain multiple matching lines
- Reasonable values are between 5 and 50
- Larger values may impact performance
- When falling back to semantic search, this becomes the number of semantic results

Example: files_per_page=20 shows up to 20 files with matches""",
)

use_regex: bool = Field(
default=False,
description="""Whether to treat the query as a regex pattern.

- False (default): Simple text search, case-insensitive
- True: Full regex syntax, case-sensitive
- Invalid regex patterns will return an error
- Note: Semantic fallback is used regardless of this setting when no matches found

Example: Set to True to use patterns like "test_.*_func.*" """,
description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True. Ripgrep is the preferred method.",
)
target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")


class SearchTool(BaseTool):
"""Tool for searching the codebase."""

name: ClassVar[str] = "search"
description: ClassVar[str] = r"""Search the codebase using text search or regex pattern matching.

This tool provides powerful text-based search capabilities across your codebase,
with support for both simple text matching and regular expressions. It uses ripgrep
when available for high-performance searches.

If no exact matches are found, automatically falls back to semantic search to find
relevant code even without exact text matches.

Features:
- Plain text or regex pattern matching
- Directory and file type filtering
- Paginated results for large codebases
- Case-insensitive by default for simple text searches
- Semantic fallback for finding related code

Example queries:
1. Simple text: "function calculateTotal" (matches exactly, case-insensitive)
2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True)
3. File-specific: "TODO" with file_extensions=[".py", ".ts"]
4. Directory-specific: "api" with target_directories=["src/backend"]
"""
description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
args_schema: ClassVar[type[BaseModel]] = SearchInput
codebase: Codebase = Field(exclude=True)

Expand Down
185 changes: 106 additions & 79 deletions src/codegen/extensions/tools/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
This performs either a regex pattern match or simple text search across all files in the codebase.
Each matching line will be returned with its line number.
Results are paginated with a default of 10 files per page.

If no exact matches are found, falls back to semantic search to find relevant code.
"""

import os
Expand All @@ -17,7 +15,6 @@
from codegen.sdk.core.codebase import Codebase

from .observation import Observation
from .semantic_search import SearchResult, semantic_search


class SearchMatch(Observation):
Expand Down Expand Up @@ -128,7 +125,7 @@ def _search_with_ripgrep(
This is faster than the Python implementation, especially for large codebases.
"""
# Build ripgrep command
cmd = ["rg", "--line-number", "--with-filename"]
cmd = ["rg", "--line-number"]

# Add case insensitivity if not using regex
if not use_regex:
Expand Down Expand Up @@ -203,6 +200,8 @@ def _search_with_ripgrep(
match_text = query
if use_regex:
# For regex, we need to find what actually matched
# This is a simplification - ideally we'd use ripgrep's --json option
# to get the exact match positions
pattern = re.compile(query)
match_obj = pattern.search(content)
if match_obj:
Expand All @@ -227,20 +226,11 @@ def _search_with_ripgrep(
# Convert to SearchFileResult objects
file_results = []
for filepath, matches in all_results.items():
# Sort matches by line number and deduplicate
unique_matches = []
seen = set()
for match in sorted(matches, key=lambda x: x.line_number):
key = (match.line_number, match.match)
if key not in seen:
seen.add(key)
unique_matches.append(match)

file_results.append(
SearchFileResult(
status="success",
filepath=filepath,
matches=unique_matches,
matches=sorted(matches, key=lambda x: x.line_number),
)
)

Expand Down Expand Up @@ -271,40 +261,120 @@ def _search_with_ripgrep(
raise


def _convert_semantic_to_search_results(semantic_results: list[SearchResult], query: str) -> list[SearchFileResult]:
"""Convert semantic search results to regular search results format."""
file_results = []
for result in semantic_results:
file_results.append(
SearchFileResult(
status="success",
filepath=result.filepath,
matches=[
def _search_with_python(
    codebase: Codebase,
    query: str,
    target_directories: Optional[list[str]] = None,
    file_extensions: Optional[list[str]] = None,
    page: int = 1,
    files_per_page: int = 10,
    use_regex: bool = False,
) -> SearchObservation:
    """Search the codebase line by line using Python's regex engine.

    This is a fallback for when ripgrep is not available.

    Args:
        codebase: The codebase whose files are scanned.
        query: Text to look for, or a regex pattern when ``use_regex`` is True.
        target_directories: Optional directories that limit the search. A file
            is included only when it lives under one of them; matching is done
            on whole path components, so ``"src/front"`` does not match
            ``"src/frontend/app.py"``. ``""``, ``"."`` and ``"./"`` select the
            whole codebase.
        file_extensions: Optional list of extensions passed to
            ``codebase.files``; ``None`` requests all files (``"*"``).
        page: 1-based page number; values below 1 are treated as 1.
        files_per_page: Files per page; values below 1 are treated as 10.
        use_regex: When True, ``query`` is compiled as-is (case-sensitive).
            Otherwise it is escaped and matched case-insensitively.

    Returns:
        A SearchObservation holding the requested page of per-file matches,
        sorted by file path. An invalid regex yields an observation with
        ``status="error"``. A page past the last one yields empty ``results``.
    """
    # Validate pagination parameters
    if page < 1:
        page = 1
    if files_per_page < 1:
        files_per_page = 10

    # Prepare the search pattern
    if use_regex:
        try:
            pattern = re.compile(query)
        except re.error as e:
            return SearchObservation(
                status="error",
                error=f"Invalid regex pattern: {e!s}",
                query=query,
                page=page,
                total_pages=0,
                total_files=0,
                files_per_page=files_per_page,
                results=[],
            )
    else:
        # For non-regex searches, escape special characters and make case-insensitive
        pattern = re.compile(re.escape(query), re.IGNORECASE)

    # Handle file extensions
    extensions = file_extensions if file_extensions is not None else "*"

    # Computed once: the directory filter is invariant across files.
    directory_prefixes = _target_directory_prefixes(target_directories)

    all_results = []
    for file in codebase.files(extensions=extensions):
        # Skip files outside the requested directories. Appending "/" to the
        # file path lets a target that names the file itself match as well,
        # while still requiring the prefix to end on a path separator.
        if directory_prefixes is not None and not f"{file.filepath}/".startswith(directory_prefixes):
            continue

        # Skip binary files
        try:
            content = file.content
        except ValueError:  # File is binary
            continue

        # Search each line for the pattern; line numbers are 1-based.
        file_matches = []
        for line_number, line in enumerate(content.splitlines(), 1):
            match = pattern.search(line)
            if match:
                file_matches.append(
                    SearchMatch(
                        status="success",
                        line_number=line_number,
                        line=line.strip(),
                        match=match.group(0),
                    )
                )

        if file_matches:
            all_results.append(
                SearchFileResult(
                    status="success",
                    filepath=file.filepath,
                    # Already in ascending line order: lines are scanned top to bottom.
                    matches=file_matches,
                )
            )

    # Sort all results by filepath
    all_results.sort(key=lambda x: x.filepath)

    # Calculate pagination (ceiling division; 0 pages when nothing matched)
    total_files = len(all_results)
    total_pages = (total_files + files_per_page - 1) // files_per_page
    start_idx = (page - 1) * files_per_page
    end_idx = start_idx + files_per_page

    # Get the current page of results
    paginated_results = all_results[start_idx:end_idx]

    return SearchObservation(
        status="success",
        query=query,
        page=page,
        total_pages=total_pages,
        total_files=total_files,
        files_per_page=files_per_page,
        results=paginated_results,
    )


def _target_directory_prefixes(target_directories: Optional[list[str]]) -> Optional[tuple[str, ...]]:
    """Convert target directories into path prefixes that end with "/".

    Returns None when no directory filtering applies: no directories were
    given, or one of them names the root ("", "." or "./").
    """
    if not target_directories:
        return None

    prefixes = []
    for directory in target_directories:
        relative = directory.removeprefix("./")
        if relative in ("", "."):
            return None
        # Exactly one trailing separator, so "src" and "src/" both become
        # "src/" and cannot match a sibling such as "src2/".
        prefixes.append(relative.rstrip("/") + "/")
    return tuple(prefixes)


def search(
codebase: Codebase,
query: str,
target_directories: Optional[list[str]] = None,
file_extensions: Optional[list[str] | str] = None,
file_extensions: Optional[list[str]] = None,
page: int = 1,
files_per_page: int = 10,
use_regex: bool = False,
) -> SearchObservation:
"""Search the codebase using text search or regex pattern matching.

Uses ripgrep for performance when available. If no exact matches are found,
falls back to semantic search to find relevant code.
Uses ripgrep for performance when available, with fallback to Python's regex engine.
If use_regex is True, performs a regex pattern match on each line.
Otherwise, performs a case-insensitive text search.
Returns matching lines with their line numbers, grouped by file.
Expand All @@ -323,52 +393,9 @@ def search(
Returns:
SearchObservation containing search results with matches and their sources
"""
# Try to use ripgrep first
try:
# Try ripgrep first
result = _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)

# If no results found, try semantic search
if not result.results:
semantic_results = semantic_search(codebase, query, k=files_per_page)
if semantic_results.status == "success" and semantic_results.results:
# Convert semantic results to regular search results format
file_results = _convert_semantic_to_search_results(semantic_results.results, query)

return SearchObservation(
status="success",
query=query,
page=1, # Semantic search doesn't support pagination yet
total_pages=1,
total_files=len(file_results),
files_per_page=files_per_page,
results=file_results,
)

return result

return _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
except (FileNotFoundError, subprocess.SubprocessError):
# If ripgrep fails, try semantic search directly
semantic_results = semantic_search(codebase, query, k=files_per_page)
if semantic_results.status == "success":
file_results = _convert_semantic_to_search_results(semantic_results.results, query)

return SearchObservation(
status="success",
query=query,
page=1,
total_pages=1,
total_files=len(file_results),
files_per_page=files_per_page,
results=file_results,
)
else:
return SearchObservation(
status="error",
error=f"Both text search and semantic search failed: {semantic_results.error}",
query=query,
page=page,
total_pages=0,
total_files=0,
files_per_page=files_per_page,
results=[],
)
# Fall back to Python implementation if ripgrep fails or isn't available
return _search_with_python(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
Loading