Skip to content

Commit c27c79a

Browse files
committed
2 parents 31e45f3 + a5991a6 commit c27c79a

File tree

2 files changed

+117
-75
lines changed

2 files changed

+117
-75
lines changed

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ requests>=2.28.0
44
gitpython>=3.1.0
55
google-cloud-aiplatform>=1.25.0
66
google-genai>=1.9.0
7-
python-dotenv>=1.0.0
7+
python-dotenv>=1.0.0
8+
pathspec>=0.11.0

utils/crawl_local_files.py

Lines changed: 115 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,141 @@
11
import os
22
import fnmatch
3-
from tqdm import tqdm
3+
import pathspec
44

5-
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
5+
6+
def crawl_local_files(
7+
directory,
8+
include_patterns=None,
9+
exclude_patterns=None,
10+
max_file_size=None,
11+
use_relative_paths=True,
12+
):
613
"""
714
Crawl files in a local directory with similar interface as crawl_github_files.
8-
Implements efficient folder-level filtering to skip entire directories that match exclude patterns,
9-
significantly improving performance when excluding large directory trees.
10-
1115
Args:
1216
directory (str): Path to local directory
1317
include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
1418
exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
1519
max_file_size (int): Maximum file size in bytes
1620
use_relative_paths (bool): Whether to use paths relative to directory
17-
21+
1822
Returns:
1923
dict: {"files": {filepath: content}}
2024
"""
2125
if not os.path.isdir(directory):
2226
raise ValueError(f"Directory does not exist: {directory}")
23-
27+
2428
files_dict = {}
25-
29+
30+
# --- Load .gitignore ---
31+
gitignore_path = os.path.join(directory, ".gitignore")
32+
gitignore_spec = None
33+
if os.path.exists(gitignore_path):
34+
try:
35+
with open(gitignore_path, "r", encoding="utf-8") as f:
36+
gitignore_patterns = f.readlines()
37+
gitignore_spec = pathspec.PathSpec.from_lines(
38+
"gitwildmatch", gitignore_patterns
39+
)
40+
print(f"Loaded .gitignore patterns from {gitignore_path}")
41+
except Exception as e:
42+
print(
43+
f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}"
44+
)
45+
# --- End Load .gitignore ---
46+
2647
for root, dirs, files in os.walk(directory):
27-
print(f"root: {root}")
28-
# Check if current directory should be excluded
29-
if exclude_patterns:
48+
# Filter directories using .gitignore and exclude_patterns early to avoid descending
49+
# Need to process dirs list *in place* for os.walk to respect it
50+
excluded_dirs = set()
51+
for d in dirs:
52+
dirpath_rel = os.path.relpath(os.path.join(root, d), directory)
53+
54+
# Check against .gitignore (important for directories)
55+
if gitignore_spec and gitignore_spec.match_file(dirpath_rel):
56+
excluded_dirs.add(d)
57+
continue # Skip further checks if gitignored
58+
59+
# Check against standard exclude_patterns
60+
if exclude_patterns:
61+
for pattern in exclude_patterns:
62+
# Match pattern against full relative path or directory name itself
63+
if fnmatch.fnmatch(dirpath_rel, pattern) or fnmatch.fnmatch(
64+
d, pattern
65+
):
66+
excluded_dirs.add(d)
67+
break
68+
69+
# Modify dirs in-place: remove excluded ones
70+
# Iterate over a copy (.copy()) because we are modifying the list during iteration
71+
for d in dirs.copy():
72+
if d in excluded_dirs:
73+
dirs.remove(d)
74+
75+
# Now process files in the non-excluded directories
76+
for filename in files:
77+
filepath = os.path.join(root, filename)
78+
3079
# Get path relative to directory if requested
31-
rel_root = os.path.relpath(root, directory) if use_relative_paths else root
32-
33-
# Handle the case where rel_root is the current directory
34-
if rel_root == '.':
35-
rel_root = ''
36-
37-
# Check if directory matches any exclude pattern
38-
for pattern in exclude_patterns:
39-
# Normalize pattern to handle both forward and backward slashes
40-
norm_pattern = pattern.replace("/", os.path.sep)
41-
42-
# Check if the directory matches the pattern
43-
if fnmatch.fnmatch(rel_root, norm_pattern) or \
44-
fnmatch.fnmatch(os.path.join(rel_root, ''), norm_pattern + os.path.sep):
45-
# Skip this directory and all subdirectories
46-
dirs[:] = [] # Clear dirs list to prevent further traversal
47-
print(f"Skipping directory: {rel_root} (matches pattern {pattern})")
48-
break
49-
else:
50-
# print(root)
51-
for filename in files:
52-
filepath = os.path.join(root, filename)
53-
54-
# Get path relative to directory if requested
55-
if use_relative_paths:
56-
relpath = os.path.relpath(filepath, directory)
57-
else:
58-
relpath = filepath
59-
60-
# Check if file matches any include pattern
61-
included = False
62-
if include_patterns:
63-
for pattern in include_patterns:
64-
if fnmatch.fnmatch(relpath, pattern):
65-
included = True
66-
break
67-
else:
80+
if use_relative_paths:
81+
relpath = os.path.relpath(filepath, directory)
82+
else:
83+
relpath = filepath
84+
85+
# --- Exclusion check ---
86+
excluded = False
87+
# 1. Check .gitignore first
88+
if gitignore_spec and gitignore_spec.match_file(relpath):
89+
excluded = True
90+
91+
# 2. Check standard exclude_patterns if not already excluded by .gitignore
92+
if not excluded and exclude_patterns:
93+
for pattern in exclude_patterns:
94+
if fnmatch.fnmatch(relpath, pattern):
95+
excluded = True
96+
break
97+
98+
included = False
99+
if include_patterns:
100+
for pattern in include_patterns:
101+
if fnmatch.fnmatch(relpath, pattern):
68102
included = True
69-
70-
# Check if file matches any exclude pattern
71-
excluded = False
72-
if exclude_patterns:
73-
for pattern in exclude_patterns:
74-
if fnmatch.fnmatch(relpath, pattern) or fnmatch.fnmatch(relpath, pattern.replace("/", "\\")):
75-
print(relpath, pattern)
76-
excluded = True
77-
break
78-
79-
if not included or excluded:
80-
continue
81-
82-
# Check file size
83-
if max_file_size and os.path.getsize(filepath) > max_file_size:
84-
continue
85-
86-
try:
87-
with open(filepath, 'r', encoding='utf-8') as f:
88-
content = f.read()
89-
files_dict[relpath] = content
90-
except Exception as e:
91-
print(f"Warning: Could not read file {filepath}: {e}")
92-
103+
break
104+
else:
105+
# If no include patterns, include everything *not excluded*
106+
included = True
107+
108+
# Skip if not included or if excluded (by either method)
109+
if not included or excluded:
110+
continue
111+
112+
# Check file size
113+
if max_file_size and os.path.getsize(filepath) > max_file_size:
114+
continue
115+
116+
try:
117+
with open(filepath, "r", encoding="utf-8") as f:
118+
content = f.read()
119+
files_dict[relpath] = content
120+
except Exception as e:
121+
print(f"Warning: Could not read file {filepath}: {e}")
122+
93123
return {"files": files_dict}
94124

125+
95126
if __name__ == "__main__":
96127
print("--- Crawling parent directory ('..') ---")
97-
files_data = crawl_local_files("..", exclude_patterns={"*.pyc", "__pycache__/*",".venv/*", ".git/*","docs/*", "output/*"})
128+
files_data = crawl_local_files(
129+
"..",
130+
exclude_patterns={
131+
"*.pyc",
132+
"__pycache__/*",
133+
".venv/*",
134+
".git/*",
135+
"docs/*",
136+
"output/*",
137+
},
138+
)
98139
print(f"Found {len(files_data['files'])} files:")
99140
for path in files_data["files"]:
100-
print(f" {path}")
141+
print(f" {path}")

0 commit comments

Comments
 (0)