Skip to content

Commit 8b20f1b

Browse files
committed
feat: improve context management for large PRs
- Fix GitHub API pagination for PR files endpoint (was missing files 31+) - Include actual diff content in draft_roadmap prompt for better code understanding - Enhance context_expansion prompt to verify imports from pre-existing files - Update read_file tool docstring to clarify it can fetch any repo file This fixes issues where the model would incorrectly flag files as 'missing' when they were either in the PR (but on page 2 of results) or pre-existing in the repository.
1 parent 8ccc0a1 commit 8b20f1b

File tree

4 files changed

+120
-20
lines changed

4 files changed

+120
-20
lines changed

review_roadmap/agent/nodes.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,53 @@ def _build_fetched_content_str(fetched_content: Dict[str, str]) -> str:
312312
return "".join(parts)
313313

314314

315+
# Maximum characters per diff before truncation
316+
MAX_DIFF_CHARS = 1500
317+
# Maximum total characters for all diffs combined
318+
MAX_TOTAL_DIFF_CHARS = 80000
319+
320+
321+
def _build_diffs_context(state: ReviewState) -> str:
322+
"""Build formatted diff content for all changed files.
323+
324+
Includes the actual unified diff for each file so the LLM can see
325+
the code changes, not just file names. Large diffs are truncated
326+
to manage token limits.
327+
328+
Args:
329+
state: Current workflow state containing PR context.
330+
331+
Returns:
332+
Formatted string with all diffs, suitable for LLM prompt.
333+
"""
334+
if not state.pr_context.files:
335+
return "No files changed."
336+
337+
parts = []
338+
total_chars = 0
339+
340+
for f in state.pr_context.files:
341+
if not f.diff_content:
342+
# Binary files or very large files may not have diff content
343+
file_section = f"### {f.path} ({f.status}, +{f.additions}/-{f.deletions})\n[No diff available - binary or large file]\n"
344+
else:
345+
diff = f.diff_content
346+
if len(diff) > MAX_DIFF_CHARS:
347+
diff = diff[:MAX_DIFF_CHARS] + f"\n... (truncated, {len(f.diff_content)} chars total)"
348+
file_section = f"### {f.path} ({f.status}, +{f.additions}/-{f.deletions})\n```diff\n{diff}\n```\n"
349+
350+
# Check if adding this would exceed our total budget
351+
if total_chars + len(file_section) > MAX_TOTAL_DIFF_CHARS:
352+
remaining = len(state.pr_context.files) - len(parts)
353+
parts.append(f"\n... ({remaining} more files not shown due to size limits)\n")
354+
break
355+
356+
parts.append(file_section)
357+
total_chars += len(file_section)
358+
359+
return "\n".join(parts)
360+
361+
315362
def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
316363
"""Generate the final Markdown review roadmap.
317364
@@ -332,6 +379,7 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
332379
iteration=state.reflection_iterations)
333380

334381
files_context = _build_files_context(state)
382+
diffs_context = _build_diffs_context(state)
335383
comments_context = _build_comments_context(state)
336384
fetched_context_str = _build_fetched_content_str(state.fetched_content)
337385

@@ -357,9 +405,12 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
357405
Topology Analysis:
358406
{state.topology.get('analysis', 'No analysis')}
359407
360-
Files (with base links):
408+
Files (with deep links for review):
361409
{chr(10).join(files_context)}
362410
411+
## File Diffs (actual code changes)
412+
{diffs_context}
413+
363414
Existing Comments:
364415
{chr(10).join(comments_context) if comments_context else "No comments found."}
365416
{fetched_context_str}

review_roadmap/agent/prompts.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,27 @@
1515
You have the PR metadata, file list, and topology.
1616
Review the "High Risk" files or ambiguous changes.
1717
18-
If you need to see the FULL content of a file (not just the diff) to understand it, use the `read_file` tool.
19-
For example:
20-
- If a class hierarchy changed, read the parent class file.
21-
- If a complex logic change involves helper functions, read the file definition.
18+
## The `read_file` Tool
2219
23-
Do not fetch files unless necessary. If the diff is sufficient, just return "DONE".
20+
You can use `read_file` to fetch the FULL content of ANY file in the repository - not just files in the PR diff. This is useful for:
21+
22+
1. **Verifying imports exist**: If the PR imports from modules not in the diff (e.g., `from myproject.utils import helper`), fetch the file to confirm it exists and understand its interface.
23+
24+
2. **Understanding parent classes**: If a class inherits from a base class not in the diff, fetch it to understand the contract.
25+
26+
3. **Checking helper functions**: If the PR calls functions defined elsewhere, fetch them to understand the context.
27+
28+
4. **Validating configuration references**: If code references config files or constants, fetch them.
29+
30+
## Examples
31+
- PR imports `from temperature_agent.tools.memory import store_house_knowledge` → fetch `temperature_agent/tools/memory.py`
32+
- PR extends `BaseProcessor` from `core/processors.py` → fetch `core/processors.py`
33+
- PR uses `settings.API_KEY` → fetch `config/settings.py`
34+
35+
## Guidelines
36+
- Do not fetch files unless necessary. If the diff is self-explanatory, just return "DONE".
37+
- Prioritize fetching files that help verify the PR is complete (e.g., are all required interfaces implemented?).
38+
- If a file doesn't exist, that's valuable information - it suggests missing files in the PR.
2439
"""
2540

2641
DRAFT_ROADMAP_SYSTEM_PROMPT = """You are a benevolent Senior Staff Engineer guiding a junior reviewer.

review_roadmap/agent/tools.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,20 @@
66
@tool
77
def read_file(path: str) -> Optional[str]:
88
"""
9-
Reads the full content of a specific file from the PR.
10-
Use this when you need to see the code context surrounding a change,
11-
or to understand a whole class/module definition.
9+
Reads the full content of ANY file from the repository at the PR's head commit.
10+
11+
This fetches files from the entire codebase, not just files in the PR diff.
12+
Use this to:
13+
- Verify that imported modules exist (e.g., fetch 'myproject/utils/helpers.py')
14+
- Understand parent classes or interfaces referenced by the PR
15+
- Check helper functions or utilities called by changed code
16+
- Confirm configuration or constant values referenced in the PR
17+
18+
Args:
19+
path: Full path to the file from the repository root (e.g., 'src/utils/auth.py')
20+
21+
Returns:
22+
The file content, or an error message if the file doesn't exist.
1223
"""
1324
# The actual implementation happens in the node, this is just for schema binding
1425
return None

review_roadmap/github/client.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ def _fetch_pr_metadata(self, owner: str, repo: str, pr_number: int) -> PRMetadat
8484
def _fetch_file_diffs(self, owner: str, repo: str, pr_number: int) -> List[FileDiff]:
8585
"""Fetch the list of changed files with their diffs.
8686
87+
Handles pagination to fetch all files, not just the first page.
88+
GitHub defaults to 30 files per page; we request 100 (max allowed).
89+
8790
Args:
8891
owner: Repository owner.
8992
repo: Repository name.
@@ -95,19 +98,39 @@ def _fetch_file_diffs(self, owner: str, repo: str, pr_number: int) -> List[FileD
9598
Raises:
9699
httpx.HTTPStatusError: If the API request fails.
97100
"""
98-
files_resp = self.client.get(f"/repos/{owner}/{repo}/pulls/{pr_number}/files")
99-
files_resp.raise_for_status()
101+
all_files: List[FileDiff] = []
102+
page = 1
103+
per_page = 100 # Max allowed by GitHub API
100104

101-
return [
102-
FileDiff(
103-
path=f["filename"],
104-
status=f["status"],
105-
additions=f["additions"],
106-
deletions=f["deletions"],
107-
diff_content=f.get("patch", "") # Patch might be missing for binary/large files
105+
while True:
106+
files_resp = self.client.get(
107+
f"/repos/{owner}/{repo}/pulls/{pr_number}/files",
108+
params={"page": page, "per_page": per_page}
108109
)
109-
for f in files_resp.json()
110-
]
110+
files_resp.raise_for_status()
111+
files_data = files_resp.json()
112+
113+
if not files_data:
114+
break # No more files
115+
116+
all_files.extend([
117+
FileDiff(
118+
path=f["filename"],
119+
status=f["status"],
120+
additions=f["additions"],
121+
deletions=f["deletions"],
122+
diff_content=f.get("patch", "") # Patch might be missing for binary/large files
123+
)
124+
for f in files_data
125+
])
126+
127+
# If we got fewer than per_page, we've reached the last page
128+
if len(files_data) < per_page:
129+
break
130+
131+
page += 1
132+
133+
return all_files
111134

112135
def _fetch_issue_comments(self, owner: str, repo: str, pr_number: int) -> List[PRComment]:
113136
"""Fetch general conversation comments from the issues endpoint.

0 commit comments

Comments
 (0)