Skip to content

Commit 02c5649

Browse files
cursoragent and script3r committed
feat: Optimize directory traversal and file processing
Co-authored-by: script3r <[email protected]>
1 parent bbea9b6 commit 02c5649

File tree

4 files changed

+136
-65
lines changed

4 files changed

+136
-65
lines changed

Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,5 @@ humantime = "2"
4141
globset = "0.4"
4242
crossbeam-channel = "0.5"
4343
walkdir = "2"
44+
num_cpus = "1"
4445

crates/scanner-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ ignore = { workspace = true }
1818
memmap2 = { workspace = true }
1919
globset = { workspace = true }
2020
crossbeam-channel = { workspace = true }
21+
num_cpus = { workspace = true }
2122

2223
[dev-dependencies]
2324
criterion = "0.5"

crates/scanner-core/src/lib.rs

Lines changed: 123 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ use std::collections::{BTreeSet, HashMap};
3737
use std::fs;
3838
use std::io::Read;
3939
use std::path::{Path, PathBuf};
40+
use std::sync::atomic::{AtomicUsize, Ordering};
4041
use std::sync::{Arc, Mutex};
4142
use std::thread;
4243

@@ -778,7 +779,7 @@ impl<'a> Scanner<'a> {
778779
};
779780

780781
let max_file_size = self.config.max_file_size;
781-
let files_discovered = Arc::new(Mutex::new(0usize));
782+
let files_discovered = Arc::new(AtomicUsize::new(0));
782783

783784
for root in roots {
784785
let mut builder = WalkBuilder::new(root);
@@ -788,7 +789,9 @@ impl<'a> Scanner<'a> {
788789
.git_exclude(true) // Respect .git/info/exclude
789790
.ignore(true) // Respect .ignore files
790791
.follow_links(false) // Don't follow symlinks for safety
791-
.max_depth(None); // No depth limit
792+
.max_depth(None) // No depth limit
793+
.threads(num_cpus::get().max(4)) // Use optimal thread count for directory traversal
794+
.same_file_system(true); // Don't cross filesystem boundaries for better performance
792795

793796
// Configure exclude globs if provided
794797
for exclude_glob in &self.config.exclude_globs {
@@ -820,35 +823,38 @@ impl<'a> Scanner<'a> {
820823

821824
let path = entry.path();
822825

823-
// Check file size before processing
824-
if let Ok(metadata) = entry.metadata() {
825-
if metadata.len() as usize > max_file_size {
826-
return ignore::WalkState::Continue;
827-
}
826+
// Fast language detection BEFORE expensive operations
827+
if Scanner::detect_language(path).is_none() {
828+
return ignore::WalkState::Continue;
828829
}
829830

830-
// Apply include glob filtering if specified
831+
// Apply include glob filtering if specified (after language check)
831832
if let Some(ref matcher) = include_matcher {
832833
if !matcher.is_match(path) {
833834
return ignore::WalkState::Continue;
834835
}
835836
}
836837

837-
// Only send files with supported extensions to reduce consumer work
838-
if Scanner::detect_language(path).is_some() {
839-
if work_sender.send(path.to_path_buf()).is_err() {
840-
return ignore::WalkState::Quit;
838+
// Check file size ONLY for files we're interested in
839+
// Use DirEntry's metadata which might be cached
840+
if let Ok(metadata) = entry.metadata() {
841+
if metadata.len() as usize > max_file_size {
842+
return ignore::WalkState::Continue;
841843
}
844+
}
842845

843-
// Update discovered files counter
844-
{
845-
let mut count = files_discovered.lock().unwrap();
846-
*count += 1;
847-
}
846+
// Send file to work queue
847+
if work_sender.send(path.to_path_buf()).is_err() {
848+
return ignore::WalkState::Quit;
849+
}
848850

849-
// Send progress update if callback exists (1 = file discovered)
850-
if let Some(ref progress_tx) = progress_sender {
851-
let _ = progress_tx.send(1);
851+
// Update discovered files counter atomically (no lock!)
852+
let count = files_discovered.fetch_add(1, Ordering::Relaxed);
853+
854+
// Send progress update every 1000 files to reduce channel overhead
855+
if let Some(ref progress_tx) = progress_sender {
856+
if count % 1000 == 0 {
857+
let _ = progress_tx.send(1000); // Send batch size
852858
}
853859
}
854860

@@ -867,25 +873,57 @@ impl<'a> Scanner<'a> {
867873
findings_sender: Sender<Finding>,
868874
progress_sender: Option<Sender<usize>>,
869875
) -> Result<()> {
870-
// Use rayon to process files in parallel
871-
work_receiver
872-
.into_iter()
873-
.collect::<Vec<_>>()
874-
.par_iter()
875-
.for_each(|path| {
876-
// Placeholder function call - this is where the actual file scanning happens
877-
if let Err(e) = self.scan_file(path, &findings_sender) {
878-
eprintln!("Error scanning file {:?}: {}", path, e);
879-
}
880-
881-
// Send progress update if callback exists (2 = file processed)
876+
const BATCH_SIZE: usize = 1000; // Process files in batches for better cache locality
877+
878+
let mut batch = Vec::with_capacity(BATCH_SIZE);
879+
let mut _processed_count = 0usize;
880+
881+
// Collect files into batches and process them
882+
for path in work_receiver.iter() {
883+
batch.push(path);
884+
885+
if batch.len() >= BATCH_SIZE {
886+
_processed_count += self.process_batch(&batch, &findings_sender)?;
887+
batch.clear();
888+
889+
// Send progress update for the entire batch
882890
if let Some(ref progress_tx) = progress_sender {
883-
let _ = progress_tx.send(2);
891+
let _ = progress_tx.send(BATCH_SIZE);
884892
}
885-
});
893+
}
894+
}
895+
896+
// Process remaining files in the final batch
897+
if !batch.is_empty() {
898+
_processed_count += self.process_batch(&batch, &findings_sender)?;
899+
900+
if let Some(ref progress_tx) = progress_sender {
901+
let _ = progress_tx.send(batch.len());
902+
}
903+
}
886904

887905
Ok(())
888906
}
907+
908+
/// Process a batch of files in parallel for better performance
909+
fn process_batch(
910+
&self,
911+
batch: &[PathBuf],
912+
findings_sender: &Sender<Finding>
913+
) -> Result<usize> {
914+
// Process the batch in parallel using rayon
915+
batch
916+
.par_iter()
917+
.map(|path| {
918+
if let Err(e) = self.scan_file(path, findings_sender) {
919+
eprintln!("Error scanning file {:?}: {}", path, e);
920+
}
921+
1 // Return 1 for each processed file
922+
})
923+
.sum::<usize>();
924+
925+
Ok(batch.len())
926+
}
889927

890928
/// Core file scanning logic - processes a single file
891929
fn scan_file(&self, path: &PathBuf, findings_sender: &Sender<Finding>) -> Result<()> {
@@ -1000,29 +1038,47 @@ impl<'a> Scanner<'a> {
10001038
paths
10011039
}
10021040

1041+
/// Ultra-fast language detection that avoids string allocations
10031042
pub fn detect_language(path: &Path) -> Option<Language> {
1004-
match path
1005-
.extension()
1006-
.and_then(|e| e.to_str())
1007-
.unwrap_or("")
1008-
.to_ascii_lowercase()
1009-
.as_str()
1010-
{
1011-
"go" => Some(Language::Go),
1012-
"java" => Some(Language::Java),
1013-
"c" => Some(Language::C),
1014-
"h" => Some(Language::C),
1015-
"hpp" => Some(Language::Cpp),
1016-
"hh" => Some(Language::Cpp),
1017-
"cc" | "cpp" | "cxx" => Some(Language::Cpp),
1018-
"rs" => Some(Language::Rust),
1019-
"py" | "pyw" | "pyi" => Some(Language::Python),
1020-
"php" | "phtml" | "php3" | "php4" | "php5" | "phps" => Some(Language::Php),
1021-
"swift" => Some(Language::Swift),
1022-
"m" | "mm" | "M" => Some(Language::ObjC),
1023-
"kt" | "kts" => Some(Language::Kotlin),
1024-
"erl" | "hrl" | "beam" => Some(Language::Erlang),
1025-
_ => None,
1043+
let ext = path.extension()?;
1044+
1045+
// Fast path: check common extensions without string conversion
1046+
match ext.as_encoded_bytes() {
1047+
// Single char extensions
1048+
b"c" => Some(Language::C),
1049+
b"h" => Some(Language::C),
1050+
b"m" | b"M" => Some(Language::ObjC),
1051+
1052+
// Two char extensions
1053+
b"go" => Some(Language::Go),
1054+
b"rs" => Some(Language::Rust),
1055+
b"py" => Some(Language::Python),
1056+
b"kt" => Some(Language::Kotlin),
1057+
b"cc" => Some(Language::Cpp),
1058+
b"mm" => Some(Language::ObjC),
1059+
1060+
// Three char extensions
1061+
b"cpp" | b"cxx" | b"hpp" | b"hxx" => Some(Language::Cpp),
1062+
b"php" => Some(Language::Php),
1063+
b"pyw" | b"pyi" => Some(Language::Python),
1064+
b"kts" => Some(Language::Kotlin),
1065+
b"erl" | b"hrl" => Some(Language::Erlang),
1066+
1067+
// Four+ char extensions
1068+
b"java" => Some(Language::Java),
1069+
b"swift" => Some(Language::Swift),
1070+
b"phtml" => Some(Language::Php),
1071+
b"php3" | b"php4" | b"php5" | b"phps" => Some(Language::Php),
1072+
b"beam" => Some(Language::Erlang),
1073+
1074+
// Fallback to string comparison for edge cases
1075+
_ => {
1076+
let ext_str = ext.to_str()?.to_ascii_lowercase();
1077+
match ext_str.as_str() {
1078+
"c++" | "h++" => Some(Language::Cpp),
1079+
_ => None,
1080+
}
1081+
}
10261082
}
10271083
}
10281084

@@ -1061,17 +1117,19 @@ impl<'a> Scanner<'a> {
10611117
// Initial callback
10621118
callback(0, 0, 0);
10631119

1064-
for update in progress_rx.iter() {
1065-
if update == 1 {
1066-
files_discovered += 1;
1067-
// Update callback every 100 files discovered to avoid spam
1068-
if files_discovered % 100 == 0 {
1120+
for batch_size in progress_rx.iter() {
1121+
if batch_size >= 1000 {
1122+
// This is a discovery batch
1123+
files_discovered += batch_size;
1124+
// Update callback every 10k files discovered to reduce overhead
1125+
if files_discovered % 10_000 == 0 {
10691126
callback(files_processed, files_discovered, findings_count);
10701127
}
1071-
} else if update == 2 {
1072-
files_processed += 1;
1073-
// Update callback every 50 files processed
1074-
if files_processed % 50 == 0 {
1128+
} else {
1129+
// This is a processing batch
1130+
files_processed += batch_size;
1131+
// Update callback every 5k files processed
1132+
if files_processed % 5_000 == 0 {
10751133
callback(files_processed, files_discovered, findings_count);
10761134
}
10771135
}

0 commit comments

Comments
 (0)