Skip to content

Commit 15bdff2

Browse files
committed
Optimize planner scanning performance
1 parent cd29f7a commit 15bdff2

File tree

3 files changed

+355
-115
lines changed

3 files changed

+355
-115
lines changed

AGENTS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ Exit codes:
150150
6. For file and directory names, detect and schedule renames with depth ordering
151151
7. Emit `plan.json` and fast summary stats
152152

153+
- Implementation note: `scan_repository_multi` pre-filters candidate files with an `AhoCorasick` automaton, processes them in parallel via `rayon`, and only runs the expensive compound identifier scan on lines discovered by direct variant hits or token heuristics. When adjusting matching logic, keep the `token_line_hits` bookkeeping in sync with the `additional_lines` fed into `find_enhanced_matches`.
154+
153155
Boundary rules
154156

155157
- Avoid partial token matches inside larger identifiers unless intended

renamify-core/src/compound_scanner.rs

Lines changed: 144 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,79 @@ use crate::pattern::{build_pattern, is_boundary, Match};
44
use crate::scanner::{CoercionMode, MatchHunk, VariantMap};
55
use bstr::ByteSlice;
66
use regex::bytes::Regex;
7+
use std::collections::BTreeSet;
78
use std::path::{Path, PathBuf};
89

10+
/// Precompiled identifier extractor reused across files to avoid recompiling
11+
/// regex patterns on every scan iteration.
12+
pub struct IdentifierExtractor {
13+
regex: Regex,
14+
split_on_dots: bool,
15+
}
16+
17+
impl IdentifierExtractor {
18+
/// Construct an extractor tuned for the provided style set.
19+
pub fn new(styles: &[Style]) -> Self {
20+
let title_pattern = "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*";
21+
let identifier_pattern = "[a-zA-Z_][a-zA-Z0-9_\\-\\.]*";
22+
let pattern = if styles.contains(&Style::Title) {
23+
format!(r"\b(?:{}|{})\b", title_pattern, identifier_pattern)
24+
} else {
25+
format!(r"\b{}\b", identifier_pattern)
26+
};
27+
28+
let regex = Regex::new(&pattern).expect("identifier regex must compile");
29+
let split_on_dots = !styles.contains(&Style::Dot);
30+
31+
Self {
32+
regex,
33+
split_on_dots,
34+
}
35+
}
36+
37+
/// Find all potential identifiers in the content using the precompiled pattern.
38+
pub fn find_all(&self, content: &[u8]) -> Vec<(usize, usize, String)> {
39+
let mut identifiers = Vec::new();
40+
41+
for m in self.regex.find_iter(content) {
42+
let identifier = String::from_utf8_lossy(m.as_bytes()).to_string();
43+
44+
if std::env::var("RENAMIFY_DEBUG_IDENTIFIERS").is_ok() {
45+
println!(
46+
"Found identifier: '{}' at {}-{}",
47+
identifier,
48+
m.start(),
49+
m.end()
50+
);
51+
}
52+
53+
if identifier.contains('.') && self.split_on_dots {
54+
let parts: Vec<&str> = identifier.split('.').collect();
55+
let mut current_pos = m.start();
56+
57+
for (i, part) in parts.iter().enumerate() {
58+
if !part.is_empty() {
59+
identifiers.push((
60+
current_pos,
61+
current_pos + part.len(),
62+
(*part).to_string(),
63+
));
64+
}
65+
current_pos += part.len() + 1; // Account for the dot separator
66+
67+
if i < parts.len() - 1 && current_pos <= m.end() {
68+
// Dot already handled by position increment above.
69+
}
70+
}
71+
} else {
72+
identifiers.push((m.start(), m.end(), identifier));
73+
}
74+
}
75+
76+
identifiers
77+
}
78+
}
79+
980
/// Normalize a path by removing Windows long path prefix if present
1081
fn normalize_path(path: &Path) -> PathBuf {
1182
#[cfg(windows)]
@@ -39,66 +110,6 @@ fn byte_offset_to_char_offset(text: &str, byte_offset: usize) -> usize {
39110
char_offset
40111
}
41112

42-
/// Find all potential identifiers in the content using a broad regex pattern
43-
fn find_all_identifiers(content: &[u8], styles: &[Style]) -> Vec<(usize, usize, String)> {
44-
let mut identifiers = Vec::new();
45-
46-
// Pattern to match identifier-like strings, including dots in some contexts
47-
// This is tricky: we want to split on dots for things like obj.prop but keep
48-
// dots for mixed-style identifiers like config.max_value
49-
// For Title style, we need to include spaces to capture "Title Case" patterns
50-
let pattern = if styles.len() == 1 && styles[0] == Style::Title {
51-
// Special pattern for Title style that includes spaces
52-
r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b"
53-
} else {
54-
r"\b[a-zA-Z_][a-zA-Z0-9_\-\.]*\b"
55-
};
56-
let regex = Regex::new(pattern).unwrap();
57-
58-
for m in regex.find_iter(content) {
59-
let identifier = String::from_utf8_lossy(m.as_bytes()).to_string();
60-
61-
// Debug: print what identifiers are being found
62-
if std::env::var("RENAMIFY_DEBUG_IDENTIFIERS").is_ok() {
63-
println!(
64-
"Found identifier: '{}' at {}-{}",
65-
identifier,
66-
m.start(),
67-
m.end()
68-
);
69-
}
70-
71-
// Only split on dots if dot style is NOT in the selected styles
72-
// When dot style is selected, keep dot-separated identifiers intact
73-
let should_split_on_dots = !styles.contains(&Style::Dot);
74-
75-
if identifier.contains('.') && should_split_on_dots {
76-
// Split on dots for things like obj.method or this.property
77-
// But NOT when we're specifically looking for dot.case style
78-
let parts: Vec<&str> = identifier.split('.').collect();
79-
let mut current_pos = m.start();
80-
81-
for (i, part) in parts.iter().enumerate() {
82-
if !part.is_empty() {
83-
identifiers.push((current_pos, current_pos + part.len(), (*part).to_string()));
84-
}
85-
current_pos += part.len() + 1; // +1 for the dot
86-
87-
// If there are more parts, we've consumed a dot
88-
if i < parts.len() - 1 && current_pos <= m.end() {
89-
// The dot is at current_pos - 1, move past it
90-
// current_pos is already at the right position for the next part
91-
}
92-
}
93-
} else {
94-
// Keep as single identifier (including dots)
95-
identifiers.push((m.start(), m.end(), identifier));
96-
}
97-
}
98-
99-
identifiers
100-
}
101-
102113
/// Enhanced matching that finds both exact and compound matches
103114
pub fn find_enhanced_matches(
104115
content: &[u8],
@@ -107,6 +118,8 @@ pub fn find_enhanced_matches(
107118
replace: &str,
108119
variant_map: &VariantMap,
109120
styles: &[Style],
121+
identifier_extractor: &IdentifierExtractor,
122+
additional_lines: Option<&BTreeSet<usize>>,
110123
) -> Vec<Match> {
111124
let mut all_matches = Vec::new();
112125
let mut processed_ranges = Vec::new(); // Track (start, end) ranges that were exactly matched
@@ -163,7 +176,60 @@ pub fn find_enhanced_matches(
163176

164177
// Third, find all identifiers and check for compound matches
165178
{
166-
let identifiers = find_all_identifiers(content, styles);
179+
let identifiers = if processed_ranges.is_empty() {
180+
identifier_extractor.find_all(content)
181+
} else {
182+
let mut candidate_lines = BTreeSet::new();
183+
for m in &all_matches {
184+
candidate_lines.insert(m.line);
185+
if m.line > 1 {
186+
candidate_lines.insert(m.line - 1);
187+
}
188+
candidate_lines.insert(m.line + 1);
189+
}
190+
191+
if let Some(extra_lines) = additional_lines {
192+
candidate_lines.extend(extra_lines.iter().copied());
193+
}
194+
195+
if candidate_lines.is_empty() {
196+
identifier_extractor.find_all(content)
197+
} else {
198+
let mut line_offsets = Vec::new();
199+
let mut pos = 0;
200+
for line in content.lines_with_terminator() {
201+
line_offsets.push(pos);
202+
pos += line.len();
203+
}
204+
205+
let mut scoped_identifiers = Vec::new();
206+
for line_idx in candidate_lines {
207+
let idx = line_idx.saturating_sub(1);
208+
if idx >= line_offsets.len() {
209+
continue;
210+
}
211+
212+
let start = line_offsets[idx];
213+
let end = if idx + 1 < line_offsets.len() {
214+
line_offsets[idx + 1]
215+
} else {
216+
content.len()
217+
};
218+
let slice = &content[start..end];
219+
220+
for (local_start, local_end, identifier) in identifier_extractor.find_all(slice)
221+
{
222+
scoped_identifiers.push((
223+
start + local_start,
224+
start + local_end,
225+
identifier,
226+
));
227+
}
228+
}
229+
230+
scoped_identifiers
231+
}
232+
};
167233

168234
for (start, end, identifier) in identifiers {
169235
// Skip if this identifier was already matched exactly or if it's completely contained within a processed range
@@ -381,7 +447,8 @@ mod tests {
381447
let content = b"let preview_format_arg = PreviewFormatArg::new();";
382448
// Use default styles for test
383449
let styles = vec![Style::Snake, Style::Pascal];
384-
let identifiers = find_all_identifiers(content, &styles);
450+
let extractor = IdentifierExtractor::new(&styles);
451+
let identifiers = extractor.find_all(content);
385452

386453
// Should find: let, preview_format_arg, PreviewFormatArg, new
387454
assert!(identifiers.len() >= 4);
@@ -397,7 +464,8 @@ mod tests {
397464

398465
// When looking for dot style only, keep dot-separated identifiers intact
399466
let dot_styles = vec![Style::Dot];
400-
let identifiers = find_all_identifiers(content, &dot_styles);
467+
let extractor = IdentifierExtractor::new(&dot_styles);
468+
let identifiers = extractor.find_all(content);
401469
let names: Vec<String> = identifiers.iter().map(|(_, _, id)| id.clone()).collect();
402470
assert!(names.contains(&"test.case".to_string()));
403471
assert!(names.contains(&"use.case".to_string()));
@@ -406,7 +474,8 @@ mod tests {
406474

407475
// When using other styles, split on dots
408476
let other_styles = vec![Style::Snake, Style::Camel];
409-
let identifiers = find_all_identifiers(content, &other_styles);
477+
let extractor = IdentifierExtractor::new(&other_styles);
478+
let identifiers = extractor.find_all(content);
410479
let names: Vec<String> = identifiers.iter().map(|(_, _, id)| id.clone()).collect();
411480
// Should split into individual parts
412481
assert!(names.contains(&"test".to_string()));
@@ -439,9 +508,18 @@ mod tests {
439508
);
440509

441510
let styles = vec![Style::Snake, Style::Pascal];
511+
let extractor = IdentifierExtractor::new(&styles);
442512

443-
let matches =
444-
find_enhanced_matches(content, "test.rs", search, replace, &variant_map, &styles);
513+
let matches = find_enhanced_matches(
514+
content,
515+
"test.rs",
516+
search,
517+
replace,
518+
&variant_map,
519+
&styles,
520+
&extractor,
521+
None,
522+
);
445523

446524
// Should find both preview_format_arg and PreviewFormatArg
447525
assert_eq!(matches.len(), 2);

0 commit comments

Comments
 (0)