Skip to content

Commit e55ca28

Browse files
cursoragent and script3r committed
feat: Optimize file discovery with git ls-files and parallel walk
Co-authored-by: script3r <[email protected]>
1 parent 4994956 commit e55ca28

File tree

1 file changed

+145
-26
lines changed

1 file changed

+145
-26
lines changed

crates/scanner-core/src/lib.rs

Lines changed: 145 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ use std::io::Read;
1111
use std::path::{Path, PathBuf};
1212
use std::sync::Arc;
1313
use std::sync::Mutex;
14+
use std::process::Command;
1415

1516
// ---------------- Types ----------------
1617

@@ -722,58 +723,144 @@ impl<'a> Scanner<'a> {
722723
}
723724

724725
pub fn discover_files(&self, roots: &[PathBuf]) -> Vec<PathBuf> {
725-
let mut paths = Vec::new();
726+
let mut discovered_paths = Vec::new();
726727

727-
// Build glob matcher for include patterns
728+
// Compile include and exclude glob sets once
728729
let include_matcher: Option<globset::GlobSet> = if !self.config.include_globs.is_empty() {
729730
let mut builder = globset::GlobSetBuilder::new();
730731
for pattern in &self.config.include_globs {
731-
match globset::Glob::new(pattern) {
732-
Ok(glob) => {
733-
builder.add(glob);
734-
}
735-
Err(_) => {
736-
return Vec::new(); // Return empty on pattern error
737-
}
732+
if let Ok(glob) = globset::Glob::new(pattern) {
733+
builder.add(glob);
734+
} else {
735+
// If any pattern is invalid, return empty to avoid expensive scan with bad filter
736+
return Vec::new();
738737
}
739738
}
740739
builder.build().ok()
741740
} else {
742741
None
743742
};
744743

744+
let exclude_matcher: Option<globset::GlobSet> = if !self.config.exclude_globs.is_empty() {
745+
let mut builder = globset::GlobSetBuilder::new();
746+
for pattern in &self.config.exclude_globs {
747+
if let Ok(glob) = globset::Glob::new(pattern) {
748+
builder.add(glob);
749+
} else {
750+
return Vec::new();
751+
}
752+
}
753+
builder.build().ok()
754+
} else {
755+
None
756+
};
757+
758+
// Helper to apply path-based filters early (before metadata calls when possible)
759+
let path_allowed = |p: &Path| -> bool {
760+
if let Some(ref ex) = exclude_matcher {
761+
if ex.is_match(p) {
762+
return false;
763+
}
764+
}
765+
if let Some(ref inc) = include_matcher {
766+
if !inc.is_match(p) {
767+
return false;
768+
}
769+
}
770+
true
771+
};
772+
745773
for root in roots {
774+
// Fast path: leverage git index if available
775+
if root.join(".git").exists() {
776+
if let Some(list) = git_list_files_fast(root) {
777+
for path in list {
778+
if !path_allowed(&path) {
779+
continue;
780+
}
781+
// Only then stat for size
782+
if let Ok(md) = fs::metadata(&path) {
783+
if md.is_file() && (md.len() as usize) <= self.config.max_file_size {
784+
discovered_paths.push(path);
785+
}
786+
}
787+
}
788+
// Move on to next root after using the git fast path
789+
continue;
790+
}
791+
}
792+
793+
// Fallback: parallel directory walk with ignore rules
746794
let mut builder = WalkBuilder::new(root);
747795
builder
748-
.hidden(false)
796+
.hidden(false) // preserve previous behavior: include hidden files/dirs
749797
.git_ignore(true)
750798
.git_exclude(true)
751-
.ignore(true);
799+
.ignore(true)
800+
.parents(true)
801+
.follow_links(false)
802+
.same_file_system(false);
752803

753-
for entry in builder.build().flatten() {
754-
let md = match entry.metadata() {
755-
Ok(m) => m,
756-
Err(_) => continue,
757-
};
758-
if md.is_file() {
759-
if md.len() as usize > self.config.max_file_size {
760-
continue;
804+
if let Ok(n) = std::thread::available_parallelism() {
805+
builder.threads(n.get());
806+
}
807+
808+
let out: Arc<Mutex<Vec<PathBuf>>> = Arc::new(Mutex::new(Vec::with_capacity(4096)));
809+
let out_ref = out.clone();
810+
811+
builder.build_parallel().run(|| {
812+
let out = out_ref.clone();
813+
Box::new(move |res| {
814+
let entry = match res {
815+
Ok(e) => e,
816+
Err(_) => return ignore::WalkState::Continue,
817+
};
818+
819+
// Quickly skip non-files using cheap file_type when available
820+
if let Some(ft) = entry.file_type() {
821+
if !ft.is_file() {
822+
return ignore::WalkState::Continue;
823+
}
824+
} else {
825+
// Fallback to metadata if file_type unavailable
826+
if let Ok(md) = entry.metadata() {
827+
if !md.is_file() {
828+
return ignore::WalkState::Continue;
829+
}
830+
} else {
831+
return ignore::WalkState::Continue;
832+
}
761833
}
762834

763835
let path = entry.into_path();
764836

765-
// Apply include glob filtering
766-
if let Some(ref matcher) = include_matcher {
767-
if !matcher.is_match(&path) {
768-
continue;
837+
// Apply path-based filters first to avoid unnecessary metadata calls
838+
if !path_allowed(&path) {
839+
return ignore::WalkState::Continue;
840+
}
841+
842+
// Size filter
843+
if let Ok(md) = fs::metadata(&path) {
844+
if (md.len() as usize) > self.config.max_file_size {
845+
return ignore::WalkState::Continue;
769846
}
847+
} else {
848+
return ignore::WalkState::Continue;
770849
}
771850

772-
paths.push(path);
773-
}
851+
if let Ok(mut guard) = out.lock() {
852+
guard.push(path);
853+
}
854+
855+
ignore::WalkState::Continue
856+
})
857+
});
858+
859+
if let Ok(mut guard) = out.lock() {
860+
discovered_paths.append(&mut *guard);
774861
}
775862
}
776-
paths
863+
discovered_paths
777864
}
778865

779866
pub fn detect_language(path: &Path) -> Option<Language> {
@@ -913,6 +1000,38 @@ impl<'a> Scanner<'a> {
9131000
}
9141001
}
9151002

1003+
/// Lists the files git reports for the working tree at `root`.
///
/// Runs `git -C <root> ls-files -z --cached --others --exclude-standard`,
/// i.e. index entries plus untracked files that are not excluded by git's
/// standard ignore rules, and returns each entry joined onto `root`.
///
/// Returns `None` when git cannot be spawned or exits with a failure status,
/// so the caller can fall back to another discovery strategy. An empty
/// listing yields `Some(vec![])`.
fn git_list_files_fast(root: &Path) -> Option<Vec<PathBuf>> {
    let output = Command::new("git")
        .arg("-C")
        .arg(root)
        .args(["ls-files", "-z", "--cached", "--others", "--exclude-standard"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    Some(parse_git_path_list(root, &output.stdout))
}

/// Splits NUL-separated `git ls-files -z` output into paths joined onto `root`.
fn parse_git_path_list(root: &Path, listing: &[u8]) -> Vec<PathBuf> {
    let mut paths: Vec<PathBuf> = listing
        .split(|byte| *byte == 0)
        .filter(|raw| !raw.is_empty())
        .filter_map(git_path_from_bytes)
        .map(|relative| root.join(relative))
        .collect();
    // Unmerged index entries are listed once per stage, back to back; keep a
    // single copy so a conflicted file is not reported several times.
    paths.dedup();
    paths
}

/// Interprets one raw entry from git as a relative path.
///
/// On Unix, file names are arbitrary bytes, so the entry is used verbatim
/// instead of being dropped when it is not valid UTF-8.
#[cfg(unix)]
fn git_path_from_bytes(raw: &[u8]) -> Option<&Path> {
    use std::os::unix::ffi::OsStrExt;
    Some(Path::new(std::ffi::OsStr::from_bytes(raw)))
}

/// Interprets one raw entry from git as a relative path.
///
/// Off Unix there is no lossless bytes-to-path conversion in the standard
/// library, so entries that are not valid UTF-8 are skipped.
#[cfg(not(unix))]
fn git_path_from_bytes(raw: &[u8]) -> Option<&Path> {
    std::str::from_utf8(raw).ok().map(Path::new)
}
1034+
9161035
fn prefilter_hit(det: &dyn Detector, stripped: &[u8]) -> bool {
9171036
let pf = det.prefilter();
9181037
if pf.substrings.is_empty() {

0 commit comments

Comments
 (0)