Skip to content

Commit 98fa9fc

Browse files
committed
fix: improve file filtering, add new utility
- Improved the speed of file filtering in `crawl_local_files.py` with folder-level exclusion - Added `fix_yaml.py` utility for YAML indentation fixes - Updated `nodes.py` to support up to 20 core abstractions - Added an option to disable the LLM response cache.
1 parent 4da9374 commit 98fa9fc

File tree

8 files changed

+240
-82
lines changed

8 files changed

+240
-82
lines changed

.env.sample

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
GEMINI_PROJECT_ID=<GEMINI_PROJECT_ID>
2-
GITHUB_TOKEN=<GITHUB_TOKEN>
2+
GEMINI_API_KEY=<GEMINI_API_KEY>
3+
GITHUB_TOKEN=<GITHUB_TOKEN>
4+
OPENROUTER_API_KEY = <OPENROUTER_API_KEY>
5+
OPENROUTER_MODEL = <OPENROUTER_MODEL>

.gitignore

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,4 +99,11 @@ coverage/
9999
llm_cache.json
100100

101101
# Output files
102-
output/
102+
output/
103+
104+
# uv manage
105+
pyproject.toml
106+
uv.lock
107+
108+
docs/*.pdf
109+
docs/design-cn.md

main.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414
}
1515

1616
DEFAULT_EXCLUDE_PATTERNS = {
17+
"assets/*", "data/*", "examples/*", "images/*", "public/*", "static/*", "temp/*",
18+
"docs/*",
1719
"venv/*", ".venv/*", "*test*", "tests/*", "docs/*", "examples/*", "v1/*",
18-
"dist/*", "build/*", "experimental/*", "deprecated/*",
20+
"dist/*", "build/*", "experimental/*", "deprecated/*", "misc/*",
1921
"legacy/*", ".git/*", ".github/*", ".next/*", ".vscode/*", "obj/*", "bin/*", "node_modules/*", "*.log"
2022
}
2123

@@ -36,6 +38,8 @@ def main():
3638
parser.add_argument("-s", "--max-size", type=int, default=100000, help="Maximum file size in bytes (default: 100000, about 100KB).")
3739
# Add language parameter for multi-language support
3840
parser.add_argument("--language", default="english", help="Language for the generated tutorial (default: english)")
41+
# Add use_cache parameter to control LLM caching
42+
parser.add_argument("--no-cache", action="store_true", help="Disable LLM response caching (default: caching enabled)")
3943

4044
args = parser.parse_args()
4145

@@ -61,6 +65,9 @@ def main():
6165

6266
# Add language for multi-language support
6367
"language": args.language,
68+
69+
# Add use_cache flag (inverse of no-cache flag)
70+
"use_cache": not args.no_cache,
6471

6572
# Outputs will be populated by the nodes
6673
"files": [],
@@ -73,6 +80,7 @@ def main():
7380

7481
# Display starting message with repository/directory and language
7582
print(f"Starting tutorial generation for: {args.repo or args.dir} in {args.language.capitalize()} language")
83+
print(f"LLM caching: {'Disabled' if args.no_cache else 'Enabled'}")
7684

7785
# Create the flow instance
7886
tutorial_flow = create_tutorial_flow()

nodes.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import os
2+
import re
23
import yaml
34
from pocketflow import Node, BatchNode
45
from utils.crawl_github_files import crawl_github_files
56
from utils.call_llm import call_llm
67
from utils.crawl_local_files import crawl_local_files
8+
from utils.fix_yaml import add_indentation
9+
710

811
# Helper to get content for specific file indices
912
def get_content_for_indices(files_data, indices):
@@ -79,6 +82,7 @@ def prep(self, shared):
7982
files_data = shared["files"]
8083
project_name = shared["project_name"] # Get project name
8184
language = shared.get("language", "english") # Get language
85+
use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True
8286

8387
# Helper to create context from files, respecting limits (basic example)
8488
def create_llm_context(files_data):
@@ -94,10 +98,10 @@ def create_llm_context(files_data):
9498
context, file_info = create_llm_context(files_data)
9599
# Format file info for the prompt (comment is just a hint for LLM)
96100
file_listing_for_prompt = "\n".join([f"- {idx} # {path}" for idx, path in file_info])
97-
return context, file_listing_for_prompt, len(files_data), project_name, language # Return language
101+
return context, file_listing_for_prompt, len(files_data), project_name, language, use_cache # Return use_cache
98102

99103
def exec(self, prep_res):
100-
context, file_listing_for_prompt, file_count, project_name, language = prep_res # Unpack project name and language
104+
context, file_listing_for_prompt, file_count, project_name, language, use_cache = prep_res # Unpack use_cache
101105
print(f"Identifying abstractions using LLM...")
102106

103107
# Add language instruction and hints only if not English
@@ -117,7 +121,7 @@ def exec(self, prep_res):
117121
{context}
118122
119123
{language_instruction}Analyze the codebase context.
120-
Identify the top 5-10 core most important abstractions to help those new to the codebase.
124+
Identify the top 5-20 core most important abstractions to help those new to the codebase.
121125
122126
For each abstraction, provide:
123127
1. A concise `name`{name_lang_hint}.
@@ -144,12 +148,14 @@ def exec(self, prep_res):
144148
Another core concept, similar to a blueprint for objects.{desc_lang_hint}
145149
file_indices:
146150
- 5 # path/to/another.js
147-
# ... up to 10 abstractions
151+
# ... up to 20 abstractions
148152
```"""
149-
response = call_llm(prompt)
153+
response = call_llm(prompt, use_cache=use_cache) # Pass use_cache parameter
150154

151155
# --- Validation ---
152156
yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
157+
# add whitespace to fix llm generation error(except -)
158+
yaml_str = add_indentation(yaml_str)
153159
abstractions = yaml.safe_load(yaml_str)
154160

155161
if not isinstance(abstractions, list):
@@ -203,6 +209,7 @@ def prep(self, shared):
203209
files_data = shared["files"]
204210
project_name = shared["project_name"] # Get project name
205211
language = shared.get("language", "english") # Get language
212+
use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True
206213

207214
# Create context with abstraction names, indices, descriptions, and relevant file snippets
208215
context = "Identified Abstractions:\n"
@@ -230,10 +237,10 @@ def prep(self, shared):
230237
)
231238
context += file_context_str
232239

233-
return context, "\n".join(abstraction_info_for_prompt), project_name, language # Return language
240+
return context, "\n".join(abstraction_info_for_prompt), project_name, language, use_cache # Return use_cache
234241

235242
def exec(self, prep_res):
236-
context, abstraction_listing, project_name, language = prep_res # Unpack project name and language
243+
context, abstraction_listing, project_name, language, use_cache = prep_res # Unpack use_cache
237244
print(f"Analyzing relationships using LLM...")
238245

239246
# Add language instruction and hints only if not English
@@ -339,6 +346,7 @@ def prep(self, shared):
339346
relationships = shared["relationships"] # Summary/label might be translated
340347
project_name = shared["project_name"] # Get project name
341348
language = shared.get("language", "english") # Get language
349+
use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True
342350

343351
# Prepare context for the LLM
344352
abstraction_info_for_prompt = []
@@ -363,10 +371,10 @@ def prep(self, shared):
363371
if language.lower() != "english":
364372
list_lang_note = f" (Names might be in {language.capitalize()})"
365373

366-
return abstraction_listing, context, len(abstractions), project_name, list_lang_note
374+
return abstraction_listing, context, len(abstractions), project_name, list_lang_note, use_cache # Return use_cache
367375

368376
def exec(self, prep_res):
369-
abstraction_listing, context, num_abstractions, project_name, list_lang_note = prep_res
377+
abstraction_listing, context, num_abstractions, project_name, list_lang_note, use_cache = prep_res # Unpack use_cache
370378
print("Determining chapter order using LLM...")
371379
# No language variation needed here in prompt instructions, just ordering based on structure
372380
# The input names might be translated, hence the note.
@@ -437,10 +445,12 @@ def post(self, shared, prep_res, exec_res):
437445
class WriteChapters(BatchNode):
438446
def prep(self, shared):
439447
chapter_order = shared["chapter_order"] # List of indices
440-
abstractions = shared["abstractions"] # List of dicts, name/desc potentially translated
441-
files_data = shared["files"]
442-
language = shared.get("language", "english") # Get language
443-
448+
abstractions = shared["abstractions"] # List of {"name": str, "description": str, "files": [int]}
449+
files_data = shared["files"] # List of (path, content) tuples
450+
project_name = shared["project_name"]
451+
language = shared.get("language", "english")
452+
use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True
453+
444454
# Get already written chapters to provide context
445455
# We store them temporarily during the batch run, not in shared memory yet
446456
# The 'previous_chapters_summary' will be built progressively in the exec context

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ requests>=2.28.0
44
gitpython>=3.1.0
55
google-cloud-aiplatform>=1.25.0
66
google-genai>=1.9.0
7-
python-dotenv>=1.0.0
7+
python-dotenv>=1.0.0

0 commit comments

Comments
 (0)