66
77import logging
88import os
9+ import subprocess
910import sys
1011import time
1112from collections import defaultdict
@@ -109,6 +110,7 @@ def __init__(
109110 verbose : bool = False ,
110111 include_private : bool = False ,
111112 enable_similarity : bool = True ,
113+ respect_gitignore : bool = True ,
112114 ):
113115 """
114116 Initialize the project analyzer.
@@ -124,6 +126,7 @@ def __init__(
124126 self .verbose = verbose
125127 self .include_private = include_private
126128 self .enable_similarity = enable_similarity
129+ self .respect_gitignore = respect_gitignore
127130 self .modules : List [ModuleInfo ] = []
128131 self .languages : Dict [str , int ] = defaultdict (int )
129132
@@ -225,11 +228,16 @@ def _scan_files(self):
225228 files_matched = 0
226229 scan_progress_every = 500
227230
228- for root , dirnames , filenames in os .walk (self .root_path ):
229- dirnames [:] = [d for d in dirnames if d not in self .IGNORE_DIRS ]
230- for filename in filenames :
231+ git_file_list : Optional [List [Path ]] = None
232+ if self .respect_gitignore :
233+ git_file_list = self ._get_git_nonignored_files ()
234+ if git_file_list is not None and self .verbose :
235+ log .info ("Using git file list (non-ignored): files=%d" , len (git_file_list ))
236+
237+ if git_file_list is not None :
238+ for fp in git_file_list :
231239 files_seen += 1
232- fp = Path ( root ) / filename
240+ filename = fp . name
233241
234242 if filename in self .IGNORE_FILES :
235243 continue
@@ -293,6 +301,75 @@ def _scan_files(self):
293301 module .file_bytes = len (content .encode ('utf-8' , errors = 'ignore' ))
294302 self .modules .append (module )
295303
304+ else :
305+ for root , dirnames , filenames in os .walk (self .root_path ):
306+ dirnames [:] = [d for d in dirnames if d not in self .IGNORE_DIRS ]
307+ for filename in filenames :
308+ files_seen += 1
309+ fp = Path (root ) / filename
310+
311+ if filename in self .IGNORE_FILES :
312+ continue
313+
314+ ext = fp .suffix .lower ()
315+ language = self .LANGUAGE_EXTENSIONS .get (ext )
316+ if language is None and ext == '' :
317+ try :
318+ with fp .open ('r' , encoding = 'utf-8' , errors = 'ignore' ) as f :
319+ language = self ._language_from_shebang (f .readline ())
320+ except Exception :
321+ language = None
322+
323+ if language is None :
324+ continue
325+
326+ files_matched += 1
327+ self .languages [language ] += 1
328+
329+ if self .verbose and files_seen > 0 and (files_seen % scan_progress_every ) == 0 :
330+ log .info (
331+ "Scan progress: seen=%d matched=%d parsed=%d modules=%d time=%.2fs" ,
332+ files_seen ,
333+ files_matched ,
334+ files_parsed ,
335+ len (self .modules ),
336+ time .time () - scan_start ,
337+ )
338+
339+ try :
340+ content = fp .read_text (encoding = 'utf-8' , errors = 'ignore' )
341+ except Exception :
342+ continue
343+
344+ try :
345+ rel_path = str (fp .relative_to (self .root_path ))
346+ except Exception :
347+ rel_path = str (fp )
348+
349+ module = None
350+ try :
351+ if self .ts_parser and self .ts_parser .is_available (language ):
352+ module = self .ts_parser .parse (rel_path , content , language )
353+ except Exception as e :
354+ if self .verbose :
355+ log .debug ("Tree-sitter parser failed for %s: %s" , rel_path , e )
356+
357+ if module is None :
358+ try :
359+ module = self .fallback_parser .parse (rel_path , content , language )
360+ except Exception as e :
361+ if self .verbose :
362+ log .debug ("Fallback parser failed for %s: %s" , rel_path , e )
363+ continue
364+
365+ if module :
366+ files_parsed += 1
367+ try :
368+ module .file_bytes = fp .stat ().st_size
369+ except Exception :
370+ module .file_bytes = len (content .encode ('utf-8' , errors = 'ignore' ))
371+ self .modules .append (module )
372+
296373 if self .verbose :
297374 log .info (
298375 "Scan finished: seen=%d matched=%d parsed=%d modules=%d time=%.2fs" ,
@@ -303,6 +380,34 @@ def _scan_files(self):
303380 time .time () - scan_start ,
304381 )
305382
def _get_git_nonignored_files(self) -> Optional[List[Path]]:
    """Return the files git lists as tracked or untracked-but-not-ignored.

    Runs ``git ls-files -co --exclude-standard -z`` inside ``self.root_path``.
    The ``-z`` flag makes git emit NUL-terminated, unquoted paths, so names
    containing quotes, backslashes, control characters, non-ASCII bytes or
    leading/trailing whitespace are returned exactly as they exist on disk.

    This method applies git's ignore rules only; it does not consult
    ``IGNORE_DIRS`` or ``IGNORE_FILES``.

    Returns:
        A list of ``self.root_path / <relative path>`` entries in git's
        output order with duplicates removed (possibly empty), or ``None``
        when ``self.root_path`` has no ``.git`` entry, the git executable
        cannot be started, or git exits with a non-zero status.
    """
    # Cheap pre-check so non-git projects never spawn a subprocess.
    if not (self.root_path / '.git').exists():
        return None

    try:
        proc = subprocess.run(
            ['git', '-C', str(self.root_path), 'ls-files', '-co', '--exclude-standard', '-z'],
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # git missing / not executable / invalid argument: caller falls back.
        return None

    if proc.returncode != 0:
        return None

    # Output is captured as bytes and decoded per path with the filesystem
    # encoding, so a single oddly-encoded filename cannot abort the listing.
    rel_paths = [
        os.fsdecode(raw)
        for raw in (proc.stdout or b'').split(b'\0')
        if raw
    ]

    # Unmerged index entries are printed once per stage; dict.fromkeys drops
    # the repeats while preserving git's ordering.
    return [self.root_path / rel for rel in dict.fromkeys(rel_paths)]
410+
306411 def _detect_entrypoints (self ) -> List [str ]:
307412 """Detect project entry points."""
308413 eps = []
0 commit comments