11import os
22import fnmatch
3- from tqdm import tqdm
3+ import pathspec
44
def _matches_any(path, patterns):
    """Return True if *path* matches at least one fnmatch-style pattern."""
    return any(fnmatch.fnmatch(path, pattern) for pattern in patterns)


def _load_gitignore_spec(directory):
    """Build a matcher from ``<directory>/.gitignore``; return None if unavailable."""
    gitignore_path = os.path.join(directory, ".gitignore")
    if not os.path.exists(gitignore_path):
        return None
    try:
        with open(gitignore_path, "r", encoding="utf-8") as f:
            gitignore_patterns = f.readlines()
        spec = pathspec.PathSpec.from_lines("gitwildmatch", gitignore_patterns)
    except Exception as e:
        # Best effort: an unreadable or malformed .gitignore must not abort the crawl.
        print(
            f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}"
        )
        return None
    print(f"Loaded .gitignore patterns from {gitignore_path}")
    return spec


def crawl_local_files(
    directory,
    include_patterns=None,
    exclude_patterns=None,
    max_file_size=None,
    use_relative_paths=True,
):
    """
    Crawl files in a local directory with similar interface as crawl_github_files.

    Patterns from a ``.gitignore`` at the top of ``directory`` (if present) and
    ``exclude_patterns`` are applied to directories first, so excluded trees
    are never descended into, and then to individual files. All pattern
    matching is done on paths relative to ``directory``, independent of
    ``use_relative_paths``.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
        max_file_size (int): Maximum file size in bytes
        use_relative_paths (bool): Whether to use paths relative to directory

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}
    gitignore_spec = _load_gitignore_spec(directory)

    for root, dirs, files in os.walk(directory):
        # Prune excluded directories. os.walk only honours the pruning when
        # the very same list object is modified, hence the slice assignment.
        kept_dirs = []
        for d in dirs:
            dirpath_rel = os.path.relpath(os.path.join(root, d), directory)

            # Directory-only gitignore patterns ("build/") match only paths
            # ending in a slash, so test the slash-terminated form as well.
            if gitignore_spec and (
                gitignore_spec.match_file(dirpath_rel)
                or gitignore_spec.match_file(dirpath_rel + "/")
            ):
                continue

            # Match against the full relative path or the bare directory name.
            if exclude_patterns and (
                _matches_any(dirpath_rel, exclude_patterns)
                or _matches_any(d, exclude_patterns)
            ):
                continue

            kept_dirs.append(d)
        dirs[:] = kept_dirs

        for filename in files:
            filepath = os.path.join(root, filename)

            # Patterns are written relative to `directory`, so always match on
            # the relative path; only the result key depends on the flag.
            match_path = os.path.relpath(filepath, directory)
            result_key = match_path if use_relative_paths else filepath

            # .gitignore is consulted first, then the explicit exclude patterns.
            if gitignore_spec and gitignore_spec.match_file(match_path):
                continue
            if exclude_patterns and _matches_any(match_path, exclude_patterns):
                continue

            # With no include patterns, everything not excluded is included.
            if include_patterns and not _matches_any(match_path, include_patterns):
                continue

            try:
                # The size probe sits inside the guarded region: a dangling
                # symlink or a file that vanished mid-walk skips one file
                # instead of aborting the whole crawl.
                if max_file_size and os.path.getsize(filepath) > max_file_size:
                    continue
                with open(filepath, "r", encoding="utf-8") as f:
                    files_dict[result_key] = f.read()
            except (OSError, ValueError) as e:
                # ValueError covers UnicodeDecodeError raised for non-UTF-8 files.
                print(f"Warning: Could not read file {filepath}: {e}")

    return {"files": files_dict}
94124
125+
if __name__ == "__main__":
    # Manual smoke test: crawl the parent directory, skipping build artefacts,
    # virtualenvs, VCS metadata and generated output, then list what was found.
    print("--- Crawling parent directory ('..') ---")
    files_data = crawl_local_files(
        "..",
        exclude_patterns={
            "*.pyc",
            "__pycache__/*",
            ".venv/*",
            ".git/*",
            "docs/*",
            "output/*",
        },
    )
    print(f"Found {len(files_data['files'])} files:")
    for path in files_data["files"]:
        print(f"  {path}")
0 commit comments