Skip to content

Commit 0d600cf

Browse files
authored
chore: revert to classic regex + ripgrep search (#776)
# Motivation

<!-- Why is this change necessary? -->

# Content

<!-- Please include a summary of the change -->

# Testing

<!-- How was the change tested? -->

# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed
1 parent b763d1f commit 0d600cf

File tree

2 files changed

+115
-175
lines changed

2 files changed

+115
-175
lines changed

src/codegen/extensions/langchain/tools.py

Lines changed: 9 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -112,111 +112,24 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:
112112

113113

114114
class SearchInput(BaseModel):
115+
"""Input for searching the codebase."""
116+
115117
query: str = Field(
116118
...,
117-
description="""The text or pattern to search for in the codebase.
118-
119-
For simple text search (use_regex=False):
120-
- Uses ripgrep's fixed-strings mode (--fixed-strings)
121-
- Case-insensitive matching (--ignore-case)
122-
- All characters are treated literally, including special regex characters
123-
- Exact string matching (no regex interpretation)
124-
125-
For regex search (use_regex=True):
126-
- Full regex pattern support
127-
- Case-sensitive by default
128-
- Special characters have regex meaning and need proper escaping
129-
- Uses ripgrep's default regex mode
130-
131-
If no exact matches are found, automatically falls back to semantic search
132-
to find relevant code even without exact text matches.""",
133-
)
134-
135-
target_directories: Optional[list[str]] = Field(
136-
default=None,
137-
description="""Optional list of directories to limit the search scope.
138-
139-
- Paths should be relative to the workspace root
140-
- Multiple directories are searched in parallel
141-
- If None, searches the entire codebase
142-
143-
Example: ["src/frontend", "tests/unit"]""",
144-
)
145-
146-
file_extensions: Optional[list[str]] = Field(
147-
default=None,
148-
description="""Optional list of file extensions to filter the search.
149-
150-
- Include the dot in extensions (e.g. ['.py', '.ts'])
151-
- Multiple extensions are combined with OR logic
152-
- If None, searches all file types
153-
- Binary files are automatically excluded
154-
155-
Example: [".py", ".tsx", ".md"]""",
156-
)
157-
158-
page: int = Field(
159-
default=1,
160-
description="""Page number for paginated results (1-based indexing).
161-
162-
- Use with files_per_page to navigate large result sets
163-
- If page exceeds available pages, returns last available page
164-
- Note: When falling back to semantic search, pagination is not supported
165-
166-
Example: page=2 with files_per_page=10 shows files 11-20""",
167-
)
168-
169-
files_per_page: int = Field(
170-
default=10,
171-
description="""Number of files to show per page.
172-
173-
- Each file can contain multiple matching lines
174-
- Reasonable values are between 5 and 50
175-
- Larger values may impact performance
176-
- When falling back to semantic search, this becomes the number of semantic results
177-
178-
Example: files_per_page=20 shows up to 20 files with matches""",
179-
)
180-
181-
use_regex: bool = Field(
182-
default=False,
183-
description="""Whether to treat the query as a regex pattern.
184-
185-
- False (default): Simple text search, case-insensitive
186-
- True: Full regex syntax, case-sensitive
187-
- Invalid regex patterns will return an error
188-
- Note: Semantic fallback is used regardless of this setting when no matches found
189-
190-
Example: Set to True to use patterns like "test_.*_func.*" """,
119+
description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True. Ripgrep is the preferred method.",
191120
)
121+
target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
122+
file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
123+
page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
124+
files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
125+
use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
192126

193127

194128
class SearchTool(BaseTool):
195129
"""Tool for searching the codebase."""
196130

197131
name: ClassVar[str] = "search"
198-
description: ClassVar[str] = r"""Search the codebase using text search or regex pattern matching.
199-
200-
This tool provides powerful text-based search capabilities across your codebase,
201-
with support for both simple text matching and regular expressions. It uses ripgrep
202-
when available for high-performance searches.
203-
204-
If no exact matches are found, automatically falls back to semantic search to find
205-
relevant code even without exact text matches.
206-
207-
Features:
208-
- Plain text or regex pattern matching
209-
- Directory and file type filtering
210-
- Paginated results for large codebases
211-
- Case-insensitive by default for simple text searches
212-
- Semantic fallback for finding related code
213-
214-
Example queries:
215-
1. Simple text: "function calculateTotal" (matches exactly, case-insensitive)
216-
2. Regex: "def.*calculate.*\(.*\)" (with use_regex=True)
217-
3. File-specific: "TODO" with file_extensions=[".py", ".ts"]
218-
4. Directory-specific: "api" with target_directories=["src/backend"]
219-
"""
132+
description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
220133
args_schema: ClassVar[type[BaseModel]] = SearchInput
221134
codebase: Codebase = Field(exclude=True)
222135

src/codegen/extensions/tools/search.py

Lines changed: 106 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
This performs either a regex pattern match or simple text search across all files in the codebase.
44
Each matching line will be returned with its line number.
55
Results are paginated with a default of 10 files per page.
6-
7-
If no exact matches are found, falls back to semantic search to find relevant code.
86
"""
97

108
import os
@@ -17,7 +15,6 @@
1715
from codegen.sdk.core.codebase import Codebase
1816

1917
from .observation import Observation
20-
from .semantic_search import SearchResult, semantic_search
2118

2219

2320
class SearchMatch(Observation):
@@ -128,7 +125,7 @@ def _search_with_ripgrep(
128125
This is faster than the Python implementation, especially for large codebases.
129126
"""
130127
# Build ripgrep command
131-
cmd = ["rg", "--line-number", "--with-filename"]
128+
cmd = ["rg", "--line-number"]
132129

133130
# Add case insensitivity if not using regex
134131
if not use_regex:
@@ -203,6 +200,8 @@ def _search_with_ripgrep(
203200
match_text = query
204201
if use_regex:
205202
# For regex, we need to find what actually matched
203+
# This is a simplification - ideally we'd use ripgrep's --json option
204+
# to get the exact match positions
206205
pattern = re.compile(query)
207206
match_obj = pattern.search(content)
208207
if match_obj:
@@ -227,20 +226,11 @@ def _search_with_ripgrep(
227226
# Convert to SearchFileResult objects
228227
file_results = []
229228
for filepath, matches in all_results.items():
230-
# Sort matches by line number and deduplicate
231-
unique_matches = []
232-
seen = set()
233-
for match in sorted(matches, key=lambda x: x.line_number):
234-
key = (match.line_number, match.match)
235-
if key not in seen:
236-
seen.add(key)
237-
unique_matches.append(match)
238-
239229
file_results.append(
240230
SearchFileResult(
241231
status="success",
242232
filepath=filepath,
243-
matches=unique_matches,
233+
matches=sorted(matches, key=lambda x: x.line_number),
244234
)
245235
)
246236

@@ -271,40 +261,120 @@ def _search_with_ripgrep(
271261
raise
272262

273263

274-
def _convert_semantic_to_search_results(semantic_results: list[SearchResult], query: str) -> list[SearchFileResult]:
275-
"""Convert semantic search results to regular search results format."""
276-
file_results = []
277-
for result in semantic_results:
278-
file_results.append(
279-
SearchFileResult(
280-
status="success",
281-
filepath=result.filepath,
282-
matches=[
264+
def _search_with_python(
265+
codebase: Codebase,
266+
query: str,
267+
target_directories: Optional[list[str]] = None,
268+
file_extensions: Optional[list[str]] = None,
269+
page: int = 1,
270+
files_per_page: int = 10,
271+
use_regex: bool = False,
272+
) -> SearchObservation:
273+
"""Search the codebase using Python's regex engine.
274+
275+
This is a fallback for when ripgrep is not available.
276+
"""
277+
# Validate pagination parameters
278+
if page < 1:
279+
page = 1
280+
if files_per_page < 1:
281+
files_per_page = 10
282+
283+
# Prepare the search pattern
284+
if use_regex:
285+
try:
286+
pattern = re.compile(query)
287+
except re.error as e:
288+
return SearchObservation(
289+
status="error",
290+
error=f"Invalid regex pattern: {e!s}",
291+
query=query,
292+
page=page,
293+
total_pages=0,
294+
total_files=0,
295+
files_per_page=files_per_page,
296+
results=[],
297+
)
298+
else:
299+
# For non-regex searches, escape special characters and make case-insensitive
300+
pattern = re.compile(re.escape(query), re.IGNORECASE)
301+
302+
# Handle file extensions
303+
extensions = file_extensions if file_extensions is not None else "*"
304+
305+
all_results = []
306+
for file in codebase.files(extensions=extensions):
307+
# Skip if file doesn't match target directories
308+
if target_directories and not any(file.filepath.startswith(d) for d in target_directories):
309+
continue
310+
311+
# Skip binary files
312+
try:
313+
content = file.content
314+
except ValueError: # File is binary
315+
continue
316+
317+
file_matches = []
318+
# Split content into lines and store with line numbers (1-based)
319+
lines = enumerate(content.splitlines(), 1)
320+
321+
# Search each line for the pattern
322+
for line_number, line in lines:
323+
match = pattern.search(line)
324+
if match:
325+
file_matches.append(
283326
SearchMatch(
284327
status="success",
285-
line_number=1, # We don't have line numbers for semantic matches
286-
line=result.preview,
287-
match=query,
328+
line_number=line_number,
329+
line=line.strip(),
330+
match=match.group(0),
288331
)
289-
],
332+
)
333+
334+
if file_matches:
335+
all_results.append(
336+
SearchFileResult(
337+
status="success",
338+
filepath=file.filepath,
339+
matches=sorted(file_matches, key=lambda x: x.line_number),
340+
)
290341
)
291-
)
292-
return file_results
342+
343+
# Sort all results by filepath
344+
all_results.sort(key=lambda x: x.filepath)
345+
346+
# Calculate pagination
347+
total_files = len(all_results)
348+
total_pages = (total_files + files_per_page - 1) // files_per_page
349+
start_idx = (page - 1) * files_per_page
350+
end_idx = start_idx + files_per_page
351+
352+
# Get the current page of results
353+
paginated_results = all_results[start_idx:end_idx]
354+
355+
return SearchObservation(
356+
status="success",
357+
query=query,
358+
page=page,
359+
total_pages=total_pages,
360+
total_files=total_files,
361+
files_per_page=files_per_page,
362+
results=paginated_results,
363+
)
293364

294365

295366
def search(
296367
codebase: Codebase,
297368
query: str,
298369
target_directories: Optional[list[str]] = None,
299-
file_extensions: Optional[list[str] | str] = None,
370+
file_extensions: Optional[list[str]] = None,
300371
page: int = 1,
301372
files_per_page: int = 10,
302373
use_regex: bool = False,
303374
) -> SearchObservation:
304375
"""Search the codebase using text search or regex pattern matching.
305376
306-
Uses ripgrep for performance when available. If no exact matches are found,
307-
falls back to semantic search to find relevant code.
377+
Uses ripgrep for performance when available, with fallback to Python's regex engine.
308378
If use_regex is True, performs a regex pattern match on each line.
309379
Otherwise, performs a case-insensitive text search.
310380
Returns matching lines with their line numbers, grouped by file.
@@ -323,52 +393,9 @@ def search(
323393
Returns:
324394
SearchObservation containing search results with matches and their sources
325395
"""
396+
# Try to use ripgrep first
326397
try:
327-
# Try ripgrep first
328-
result = _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
329-
330-
# If no results found, try semantic search
331-
if not result.results:
332-
semantic_results = semantic_search(codebase, query, k=files_per_page)
333-
if semantic_results.status == "success" and semantic_results.results:
334-
# Convert semantic results to regular search results format
335-
file_results = _convert_semantic_to_search_results(semantic_results.results, query)
336-
337-
return SearchObservation(
338-
status="success",
339-
query=query,
340-
page=1, # Semantic search doesn't support pagination yet
341-
total_pages=1,
342-
total_files=len(file_results),
343-
files_per_page=files_per_page,
344-
results=file_results,
345-
)
346-
347-
return result
348-
398+
return _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
349399
except (FileNotFoundError, subprocess.SubprocessError):
350-
# If ripgrep fails, try semantic search directly
351-
semantic_results = semantic_search(codebase, query, k=files_per_page)
352-
if semantic_results.status == "success":
353-
file_results = _convert_semantic_to_search_results(semantic_results.results, query)
354-
355-
return SearchObservation(
356-
status="success",
357-
query=query,
358-
page=1,
359-
total_pages=1,
360-
total_files=len(file_results),
361-
files_per_page=files_per_page,
362-
results=file_results,
363-
)
364-
else:
365-
return SearchObservation(
366-
status="error",
367-
error=f"Both text search and semantic search failed: {semantic_results.error}",
368-
query=query,
369-
page=page,
370-
total_pages=0,
371-
total_files=0,
372-
files_per_page=files_per_page,
373-
results=[],
374-
)
400+
# Fall back to Python implementation if ripgrep fails or isn't available
401+
return _search_with_python(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)

0 commit comments

Comments
 (0)