|
1 | 1 | import os |
2 | 2 | import fnmatch |
| 3 | +import pathspec |
3 | 4 |
|
def _load_gitignore_spec(directory):
    """Build a PathSpec from ``<directory>/.gitignore``.

    Returns:
        The compiled spec, or None when the file is absent or could not be
        read/parsed. Failures are reported as a printed warning instead of
        being raised, so the crawl proceeds without .gitignore filtering.
    """
    gitignore_path = os.path.join(directory, '.gitignore')
    if not os.path.exists(gitignore_path):
        return None
    try:
        with open(gitignore_path, 'r', encoding='utf-8') as f:
            gitignore_patterns = f.readlines()
        spec = pathspec.PathSpec.from_lines('gitwildmatch', gitignore_patterns)
    except Exception as e:
        # Deliberately broad: a broken .gitignore must not abort the crawl.
        print(f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}")
        return None
    print(f"Loaded .gitignore patterns from {gitignore_path}")
    return spec


def _matches_any(path, patterns):
    """Return True if ``path`` matches at least one fnmatch-style pattern."""
    return any(fnmatch.fnmatch(path, pattern) for pattern in patterns)


def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
    """
    Crawl files in a local directory with similar interface as crawl_github_files.

    Files and directories matched by the ``.gitignore`` located directly in
    ``directory`` (if any) are skipped, in addition to ``exclude_patterns``.
    Excluded directories are pruned from the walk so they are never entered.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"}).
            When empty or None, every non-excluded file is included.
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
        max_file_size (int): Maximum file size in bytes (falsy means no limit)
        use_relative_paths (bool): Whether to use paths relative to directory

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}
    gitignore_spec = _load_gitignore_spec(directory)

    for root, dirs, files in os.walk(directory):
        # Decide which sub-directories to descend into.
        kept_dirs = []
        for d in dirs:
            dirpath_rel = os.path.relpath(os.path.join(root, d), directory)

            # The trailing slash marks the path as a directory, so that
            # directory-only .gitignore patterns such as "build/" match it.
            if gitignore_spec and gitignore_spec.match_file(dirpath_rel + '/'):
                continue

            # Match exclude patterns against the relative path or the bare name.
            if exclude_patterns and (
                _matches_any(dirpath_rel, exclude_patterns)
                or _matches_any(d, exclude_patterns)
            ):
                continue

            kept_dirs.append(d)

        # Slice assignment mutates the list os.walk holds, which is what
        # makes it skip the removed directories.
        dirs[:] = kept_dirs

        for filename in files:
            filepath = os.path.join(root, filename)

            # .gitignore patterns are relative to ``directory``, so they are
            # always matched against this path, whatever key style is used.
            path_in_dir = os.path.relpath(filepath, directory)

            # Key used in the result (and for include/exclude patterns).
            relpath = path_in_dir if use_relative_paths else filepath

            if gitignore_spec and gitignore_spec.match_file(path_in_dir):
                continue
            if exclude_patterns and _matches_any(relpath, exclude_patterns):
                continue
            if include_patterns and not _matches_any(relpath, include_patterns):
                continue

            # Check file size. A file that cannot be stat'ed (dangling symlink,
            # removed mid-walk) is skipped with a warning instead of aborting.
            if max_file_size:
                try:
                    size = os.path.getsize(filepath)
                except OSError as e:
                    print(f"Warning: Could not stat file {filepath}: {e}")
                    continue
                if size > max_file_size:
                    continue

            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()
            except Exception as e:
                # Best effort: unreadable or non-UTF-8 files are skipped.
                print(f"Warning: Could not read file {filepath}: {e}")
                continue
            files_dict[relpath] = content

    return {"files": files_dict}
66 | 114 |
|
67 | 115 | if __name__ == "__main__": |
|
0 commit comments