Skip to content

Commit 0d2c29e

Browse files
committed
move crawl local to utils
1 parent 9bd29cd commit 0d2c29e

File tree

2 files changed

+74
-65
lines changed

2 files changed

+74
-65
lines changed

nodes.py

Lines changed: 2 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,9 @@
11
import os
22
import yaml
3-
import fnmatch
43
from pocketflow import Node, BatchNode
54
from utils.crawl_github_files import crawl_github_files
6-
from utils.call_llm import call_llm # Assuming you have this utility
7-
8-
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
    """
    Crawl files in a local directory with similar interface as crawl_github_files.

    Every file found by walking ``directory`` is considered. A file is kept
    when it matches at least one include pattern (or no include patterns were
    given) and matches no exclude pattern. Patterns are matched with
    ``fnmatch`` against the same path that is used as the key in the result.
    Files that cannot be sized or read as UTF-8 text are skipped with a
    printed warning instead of aborting the crawl.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
        max_file_size (int): Maximum file size in bytes; a falsy value
            (None or 0) disables the size check
        use_relative_paths (bool): Whether to use paths relative to directory

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}

    for root, _, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)

            # The key used for pattern matching and for the result dict.
            relpath = os.path.relpath(filepath, directory) if use_relative_paths else filepath

            # With no include patterns every file is a candidate.
            if include_patterns and not any(
                fnmatch.fnmatch(relpath, pattern) for pattern in include_patterns
            ):
                continue

            # Exclusion always wins over inclusion.
            if exclude_patterns and any(
                fnmatch.fnmatch(relpath, pattern) for pattern in exclude_patterns
            ):
                continue

            try:
                # The size check lives inside the try block because
                # os.path.getsize raises OSError for dangling symlinks or
                # files removed mid-crawl; one such entry must not abort
                # the whole crawl.
                if max_file_size and os.path.getsize(filepath) > max_file_size:
                    continue

                with open(filepath, 'r', encoding='utf-8') as f:
                    files_dict[relpath] = f.read()
            except Exception as e:
                # Best-effort crawl: report the unreadable file and move on.
                print(f"Warning: Could not read file {filepath}: {e}")

    return {"files": files_dict}
5+
from utils.call_llm import call_llm
6+
from utils.crawl_local_files import crawl_local_files
707

718
# Helper to create context from files, respecting limits (basic example)
729
def create_llm_context(files_data):

utils/crawl_local_files.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import os
2+
import fnmatch
3+
4+
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
    """
    Crawl files in a local directory with similar interface as crawl_github_files.

    Every file found by walking ``directory`` is considered. A file is kept
    when it matches at least one include pattern (or no include patterns were
    given) and matches no exclude pattern. Patterns are matched with
    ``fnmatch`` against the same path that is used as the key in the result.
    Files that cannot be sized or read as UTF-8 text are skipped with a
    printed warning instead of aborting the crawl.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
        max_file_size (int): Maximum file size in bytes; a falsy value
            (None or 0) disables the size check
        use_relative_paths (bool): Whether to use paths relative to directory

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}

    for root, _, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)

            # The key used for pattern matching and for the result dict.
            relpath = os.path.relpath(filepath, directory) if use_relative_paths else filepath

            # With no include patterns every file is a candidate.
            if include_patterns and not any(
                fnmatch.fnmatch(relpath, pattern) for pattern in include_patterns
            ):
                continue

            # Exclusion always wins over inclusion.
            if exclude_patterns and any(
                fnmatch.fnmatch(relpath, pattern) for pattern in exclude_patterns
            ):
                continue

            try:
                # The size check lives inside the try block because
                # os.path.getsize raises OSError for dangling symlinks or
                # files removed mid-crawl; one such entry must not abort
                # the whole crawl.
                if max_file_size and os.path.getsize(filepath) > max_file_size:
                    continue

                with open(filepath, 'r', encoding='utf-8') as f:
                    files_dict[relpath] = f.read()
            except Exception as e:
                # Best-effort crawl: report the unreadable file and move on.
                print(f"Warning: Could not read file {filepath}: {e}")

    return {"files": files_dict}
66+
67+
if __name__ == "__main__":
    # Manual smoke test: crawl the directory one level above the current
    # working directory, skipping bytecode, VCS metadata and generated output,
    # then list every path that was collected.
    print("--- Crawling parent directory ('..') ---")
    files_data = crawl_local_files(
        "..",
        exclude_patterns={"*.pyc", "__pycache__/*", ".git/*", "output/*"},
    )
    collected = files_data["files"]
    print(f"Found {len(collected)} files:")
    for path in collected:
        print(f" {path}")

0 commit comments

Comments
 (0)