@@ -37,6 +37,7 @@ use std::collections::{BTreeSet, HashMap};
3737use std:: fs;
3838use std:: io:: Read ;
3939use std:: path:: { Path , PathBuf } ;
40+ use std:: sync:: atomic:: { AtomicUsize , Ordering } ;
4041use std:: sync:: { Arc , Mutex } ;
4142use std:: thread;
4243
@@ -778,7 +779,7 @@ impl<'a> Scanner<'a> {
778779 } ;
779780
780781 let max_file_size = self . config . max_file_size ;
781- let files_discovered = Arc :: new ( Mutex :: new ( 0usize ) ) ;
782+ let files_discovered = Arc :: new ( AtomicUsize :: new ( 0 ) ) ;
782783
783784 for root in roots {
784785 let mut builder = WalkBuilder :: new ( root) ;
@@ -788,7 +789,9 @@ impl<'a> Scanner<'a> {
788789 . git_exclude ( true ) // Respect .git/info/exclude
789790 . ignore ( true ) // Respect .ignore files
790791 . follow_links ( false ) // Don't follow symlinks for safety
791- . max_depth ( None ) ; // No depth limit
792+ . max_depth ( None ) // No depth limit
793+ . threads ( num_cpus:: get ( ) . max ( 4 ) ) // Use optimal thread count for directory traversal
794+ . same_file_system ( true ) ; // Don't cross filesystem boundaries for better performance
792795
793796 // Configure exclude globs if provided
794797 for exclude_glob in & self . config . exclude_globs {
@@ -820,35 +823,38 @@ impl<'a> Scanner<'a> {
820823
821824 let path = entry. path ( ) ;
822825
823- // Check file size before processing
824- if let Ok ( metadata) = entry. metadata ( ) {
825- if metadata. len ( ) as usize > max_file_size {
826- return ignore:: WalkState :: Continue ;
827- }
826+ // Fast language detection BEFORE expensive operations
827+ if Scanner :: detect_language ( path) . is_none ( ) {
828+ return ignore:: WalkState :: Continue ;
828829 }
829830
830- // Apply include glob filtering if specified
831+ // Apply include glob filtering if specified (after language check)
831832 if let Some ( ref matcher) = include_matcher {
832833 if !matcher. is_match ( path) {
833834 return ignore:: WalkState :: Continue ;
834835 }
835836 }
836837
837- // Only send files with supported extensions to reduce consumer work
838- if Scanner :: detect_language ( path) . is_some ( ) {
839- if work_sender. send ( path. to_path_buf ( ) ) . is_err ( ) {
840- return ignore:: WalkState :: Quit ;
838+ // Check file size ONLY for files we're interested in
839+ // Use DirEntry's metadata which might be cached
840+ if let Ok ( metadata) = entry. metadata ( ) {
841+ if metadata. len ( ) as usize > max_file_size {
842+ return ignore:: WalkState :: Continue ;
841843 }
844+ }
842845
843- // Update discovered files counter
844- {
845- let mut count = files_discovered. lock ( ) . unwrap ( ) ;
846- * count += 1 ;
847- }
846+ // Send file to work queue
847+ if work_sender. send ( path. to_path_buf ( ) ) . is_err ( ) {
848+ return ignore:: WalkState :: Quit ;
849+ }
848850
849- // Send progress update if callback exists (1 = file discovered)
850- if let Some ( ref progress_tx) = progress_sender {
851- let _ = progress_tx. send ( 1 ) ;
851+ // Update discovered files counter atomically (no lock!)
852+ let count = files_discovered. fetch_add ( 1 , Ordering :: Relaxed ) ;
853+
854+ // Send progress update every 1000 files to reduce channel overhead
855+ if let Some ( ref progress_tx) = progress_sender {
856+ if count % 1000 == 0 {
857+ let _ = progress_tx. send ( 1000 ) ; // Send batch size
852858 }
853859 }
854860
@@ -867,25 +873,57 @@ impl<'a> Scanner<'a> {
867873 findings_sender : Sender < Finding > ,
868874 progress_sender : Option < Sender < usize > > ,
869875 ) -> Result < ( ) > {
870- // Use rayon to process files in parallel
871- work_receiver
872- . into_iter ( )
873- . collect :: < Vec < _ > > ( )
874- . par_iter ( )
875- . for_each ( |path| {
876- // Placeholder function call - this is where the actual file scanning happens
877- if let Err ( e) = self . scan_file ( path, & findings_sender) {
878- eprintln ! ( "Error scanning file {:?}: {}" , path, e) ;
879- }
880-
881- // Send progress update if callback exists (2 = file processed)
876+ const BATCH_SIZE : usize = 1000 ; // Process files in batches for better cache locality
877+
878+ let mut batch = Vec :: with_capacity ( BATCH_SIZE ) ;
879+ let mut _processed_count = 0usize ;
880+
881+ // Collect files into batches and process them
882+ for path in work_receiver. iter ( ) {
883+ batch. push ( path) ;
884+
885+ if batch. len ( ) >= BATCH_SIZE {
886+ _processed_count += self . process_batch ( & batch, & findings_sender) ?;
887+ batch. clear ( ) ;
888+
889+ // Send progress update for the entire batch
882890 if let Some ( ref progress_tx) = progress_sender {
883- let _ = progress_tx. send ( 2 ) ;
891+ let _ = progress_tx. send ( BATCH_SIZE ) ;
884892 }
885- } ) ;
893+ }
894+ }
895+
896+ // Process remaining files in the final batch
897+ if !batch. is_empty ( ) {
898+ _processed_count += self . process_batch ( & batch, & findings_sender) ?;
899+
900+ if let Some ( ref progress_tx) = progress_sender {
901+ let _ = progress_tx. send ( batch. len ( ) ) ;
902+ }
903+ }
886904
887905 Ok ( ( ) )
888906 }
907+
908+ /// Process a batch of files in parallel for better performance
909+ fn process_batch (
910+ & self ,
911+ batch : & [ PathBuf ] ,
912+ findings_sender : & Sender < Finding >
913+ ) -> Result < usize > {
914+ // Process the batch in parallel using rayon
915+ batch
916+ . par_iter ( )
917+ . map ( |path| {
918+ if let Err ( e) = self . scan_file ( path, findings_sender) {
919+ eprintln ! ( "Error scanning file {:?}: {}" , path, e) ;
920+ }
921+ 1 // Return 1 for each processed file
922+ } )
923+ . sum :: < usize > ( ) ;
924+
925+ Ok ( batch. len ( ) )
926+ }
889927
890928 /// Core file scanning logic - processes a single file
891929 fn scan_file ( & self , path : & PathBuf , findings_sender : & Sender < Finding > ) -> Result < ( ) > {
@@ -1000,29 +1038,47 @@ impl<'a> Scanner<'a> {
10001038 paths
10011039 }
10021040
1041+ /// Ultra-fast language detection that avoids string allocations
10031042 pub fn detect_language ( path : & Path ) -> Option < Language > {
1004- match path
1005- . extension ( )
1006- . and_then ( |e| e. to_str ( ) )
1007- . unwrap_or ( "" )
1008- . to_ascii_lowercase ( )
1009- . as_str ( )
1010- {
1011- "go" => Some ( Language :: Go ) ,
1012- "java" => Some ( Language :: Java ) ,
1013- "c" => Some ( Language :: C ) ,
1014- "h" => Some ( Language :: C ) ,
1015- "hpp" => Some ( Language :: Cpp ) ,
1016- "hh" => Some ( Language :: Cpp ) ,
1017- "cc" | "cpp" | "cxx" => Some ( Language :: Cpp ) ,
1018- "rs" => Some ( Language :: Rust ) ,
1019- "py" | "pyw" | "pyi" => Some ( Language :: Python ) ,
1020- "php" | "phtml" | "php3" | "php4" | "php5" | "phps" => Some ( Language :: Php ) ,
1021- "swift" => Some ( Language :: Swift ) ,
1022- "m" | "mm" | "M" => Some ( Language :: ObjC ) ,
1023- "kt" | "kts" => Some ( Language :: Kotlin ) ,
1024- "erl" | "hrl" | "beam" => Some ( Language :: Erlang ) ,
1025- _ => None ,
1043+ let ext = path. extension ( ) ?;
1044+
1045+ // Fast path: check common extensions without string conversion
1046+ match ext. as_encoded_bytes ( ) {
1047+ // Single char extensions
1048+ b"c" => Some ( Language :: C ) ,
1049+ b"h" => Some ( Language :: C ) ,
1050+ b"m" | b"M" => Some ( Language :: ObjC ) ,
1051+
1052+ // Two char extensions
1053+ b"go" => Some ( Language :: Go ) ,
1054+ b"rs" => Some ( Language :: Rust ) ,
1055+ b"py" => Some ( Language :: Python ) ,
1056+ b"kt" => Some ( Language :: Kotlin ) ,
1057+ b"cc" => Some ( Language :: Cpp ) ,
1058+ b"mm" => Some ( Language :: ObjC ) ,
1059+
1060+ // Three char extensions
1061+ b"cpp" | b"cxx" | b"hpp" | b"hxx" => Some ( Language :: Cpp ) ,
1062+ b"php" => Some ( Language :: Php ) ,
1063+ b"pyw" | b"pyi" => Some ( Language :: Python ) ,
1064+ b"kts" => Some ( Language :: Kotlin ) ,
1065+ b"erl" | b"hrl" => Some ( Language :: Erlang ) ,
1066+
1067+ // Four+ char extensions
1068+ b"java" => Some ( Language :: Java ) ,
1069+ b"swift" => Some ( Language :: Swift ) ,
1070+ b"phtml" => Some ( Language :: Php ) ,
1071+ b"php3" | b"php4" | b"php5" | b"phps" => Some ( Language :: Php ) ,
1072+ b"beam" => Some ( Language :: Erlang ) ,
1073+
1074+ // Fallback to string comparison for edge cases
1075+ _ => {
1076+ let ext_str = ext. to_str ( ) ?. to_ascii_lowercase ( ) ;
1077+ match ext_str. as_str ( ) {
1078+ "c++" | "h++" => Some ( Language :: Cpp ) ,
1079+ _ => None ,
1080+ }
1081+ }
10261082 }
10271083 }
10281084
@@ -1061,17 +1117,19 @@ impl<'a> Scanner<'a> {
10611117 // Initial callback
10621118 callback ( 0 , 0 , 0 ) ;
10631119
1064- for update in progress_rx. iter ( ) {
1065- if update == 1 {
1066- files_discovered += 1 ;
1067- // Update callback every 100 files discovered to avoid spam
1068- if files_discovered % 100 == 0 {
1120+ for batch_size in progress_rx. iter ( ) {
1121+ if batch_size >= 1000 {
1122+ // This is a discovery batch
1123+ files_discovered += batch_size;
1124+ // Update callback every 10k files discovered to reduce overhead
1125+ if files_discovered % 10_000 == 0 {
10691126 callback ( files_processed, files_discovered, findings_count) ;
10701127 }
1071- } else if update == 2 {
1072- files_processed += 1 ;
1073- // Update callback every 50 files processed
1074- if files_processed % 50 == 0 {
1128+ } else {
1129+ // This is a processing batch
1130+ files_processed += batch_size;
1131+ // Update callback every 5k files processed
1132+ if files_processed % 5_000 == 0 {
10751133 callback ( files_processed, files_discovered, findings_count) ;
10761134 }
10771135 }
0 commit comments