feat: improve context management for large PRs

jwm4 · jwm4 · commit 8b20f1b88c21 · 2026-01-11T11:07:57.000-05:00
- Fix GitHub API pagination for PR files endpoint (was missing files 31+)
- Include actual diff content in draft_roadmap prompt for better code understanding
- Enhance context_expansion prompt to verify imports from pre-existing files
- Update read_file tool docstring to clarify it can fetch any repo file

This fixes issues where the model would incorrectly flag files as 'missing'
when they were either in the PR (but on page 2 of results) or pre-existing
in the repository.
diff --git a/review_roadmap/agent/nodes.py b/review_roadmap/agent/nodes.py
@@ -312,6 +312,53 @@ def _build_fetched_content_str(fetched_content: Dict[str, str]) -> str:
     return "".join(parts)
 
 
+# Maximum characters per diff before truncation
+MAX_DIFF_CHARS = 1500
+# Maximum total characters for all diffs combined
+MAX_TOTAL_DIFF_CHARS = 80000
+
+
+def _build_diffs_context(state: ReviewState) -> str:
+    """Build formatted diff content for all changed files.
+
+    Includes the actual unified diff for each file so the LLM can see
+    the code changes, not just file names. Large diffs are truncated
+    to manage token limits.
+
+    Args:
+        state: Current workflow state containing PR context.
+
+    Returns:
+        Formatted string with all diffs, suitable for LLM prompt.
+    """
+    if not state.pr_context.files:
+        return "No files changed."
+
+    parts = []
+    total_chars = 0
+
+    for f in state.pr_context.files:
+        if not f.diff_content:
+            # Binary files or very large files may not have diff content
+            file_section = f"### {f.path} ({f.status}, +{f.additions}/-{f.deletions})\n[No diff available - binary or large file]\n"
+        else:
+            diff = f.diff_content
+            if len(diff) > MAX_DIFF_CHARS:
+                diff = diff[:MAX_DIFF_CHARS] + f"\n... (truncated, {len(f.diff_content)} chars total)"
+            file_section = f"### {f.path} ({f.status}, +{f.additions}/-{f.deletions})\n```diff\n{diff}\n```\n"
+
+        # Check if adding this would exceed our total budget
+        if total_chars + len(file_section) > MAX_TOTAL_DIFF_CHARS:
+            remaining = len(state.pr_context.files) - len(parts)
+            parts.append(f"\n... ({remaining} more files not shown due to size limits)\n")
+            break
+
+        parts.append(file_section)
+        total_chars += len(file_section)
+
+    return "\n".join(parts)
+
+
 def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     """Generate the final Markdown review roadmap.
 
@@ -332,6 +379,7 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
                 iteration=state.reflection_iterations)
     
     files_context = _build_files_context(state)
+    diffs_context = _build_diffs_context(state)
     comments_context = _build_comments_context(state)
     fetched_context_str = _build_fetched_content_str(state.fetched_content)
     
@@ -357,9 +405,12 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     Topology Analysis:
     {state.topology.get('analysis', 'No analysis')}
     
-    Files (with base links):
+    Files (with deep links for review):
     {chr(10).join(files_context)}
     
+    ## File Diffs (actual code changes)
+    {diffs_context}
+    
     Existing Comments:
     {chr(10).join(comments_context) if comments_context else "No comments found."}
     {fetched_context_str}
diff --git a/review_roadmap/agent/prompts.py b/review_roadmap/agent/prompts.py
@@ -15,12 +15,27 @@
 You have the PR metadata, file list, and topology.
 Review the "High Risk" files or ambiguous changes.
 
-If you need to see the FULL content of a file (not just the diff) to understand it, use the `read_file` tool.
-For example:
-- If a class hierarchy changed, read the parent class file.
-- If a complex logic change involves helper functions, read the file definition.
+## The `read_file` Tool
 
-Do not fetch files unless necessary. If the diff is sufficient, just return "DONE".
+You can use `read_file` to fetch the FULL content of ANY file in the repository - not just files in the PR diff. This is useful for:
+
+1. **Verifying imports exist**: If the PR imports from modules not in the diff (e.g., `from myproject.utils import helper`), fetch the file to confirm it exists and understand its interface.
+
+2. **Understanding parent classes**: If a class inherits from a base class not in the diff, fetch it to understand the contract.
+
+3. **Checking helper functions**: If the PR calls functions defined elsewhere, fetch them to understand the context.
+
+4. **Validating configuration references**: If code references config files or constants, fetch them.
+
+## Examples
+- PR imports `from temperature_agent.tools.memory import store_house_knowledge` → fetch `temperature_agent/tools/memory.py`
+- PR extends `BaseProcessor` from `core/processors.py` → fetch `core/processors.py`
+- PR uses `settings.API_KEY` → fetch `config/settings.py`
+
+## Guidelines
+- Do not fetch files unless necessary. If the diff is self-explanatory, just return "DONE".
+- Prioritize fetching files that help verify the PR is complete (e.g., are all required interfaces implemented?).
+- If a file doesn't exist, that's valuable information - it suggests missing files in the PR.
 """
 
 DRAFT_ROADMAP_SYSTEM_PROMPT = """You are a benevolent Senior Staff Engineer guiding a junior reviewer.
diff --git a/review_roadmap/agent/tools.py b/review_roadmap/agent/tools.py
@@ -6,9 +6,20 @@
 @tool
 def read_file(path: str) -> Optional[str]:
     """
-    Reads the full content of a specific file from the PR.
-    Use this when you need to see the code context surrounding a change,
-    or to understand a whole class/module definition.
+    Reads the full content of ANY file from the repository at the PR's head commit.
+    
+    This fetches files from the entire codebase, not just files in the PR diff.
+    Use this to:
+    - Verify that imported modules exist (e.g., fetch 'myproject/utils/helpers.py')
+    - Understand parent classes or interfaces referenced by the PR
+    - Check helper functions or utilities called by changed code
+    - Confirm configuration or constant values referenced in the PR
+    
+    Args:
+        path: Full path to the file from the repository root (e.g., 'src/utils/auth.py')
+    
+    Returns:
+        The file content, or an error message if the file doesn't exist.
     """
     # The actual implementation happens in the node, this is just for schema binding
     return None
diff --git a/review_roadmap/github/client.py b/review_roadmap/github/client.py
@@ -84,6 +84,9 @@ def _fetch_pr_metadata(self, owner: str, repo: str, pr_number: int) -> PRMetadat
     def _fetch_file_diffs(self, owner: str, repo: str, pr_number: int) -> List[FileDiff]:
         """Fetch the list of changed files with their diffs.
 
+        Handles pagination to fetch all files, not just the first page.
+        GitHub defaults to 30 files per page; we request 100 (max allowed).
+
         Args:
             owner: Repository owner.
             repo: Repository name.
@@ -95,19 +98,39 @@ def _fetch_file_diffs(self, owner: str, repo: str, pr_number: int) -> List[FileD
         Raises:
             httpx.HTTPStatusError: If the API request fails.
         """
-        files_resp = self.client.get(f"/repos/{owner}/{repo}/pulls/{pr_number}/files")
-        files_resp.raise_for_status()
+        all_files: List[FileDiff] = []
+        page = 1
+        per_page = 100  # Max allowed by GitHub API
         
-        return [
-            FileDiff(
-                path=f["filename"],
-                status=f["status"],
-                additions=f["additions"],
-                deletions=f["deletions"],
-                diff_content=f.get("patch", "")  # Patch might be missing for binary/large files
+        while True:
+            files_resp = self.client.get(
+                f"/repos/{owner}/{repo}/pulls/{pr_number}/files",
+                params={"page": page, "per_page": per_page}
             )
-            for f in files_resp.json()
-        ]
+            files_resp.raise_for_status()
+            files_data = files_resp.json()
+            
+            if not files_data:
+                break  # No more files
+                
+            all_files.extend([
+                FileDiff(
+                    path=f["filename"],
+                    status=f["status"],
+                    additions=f["additions"],
+                    deletions=f["deletions"],
+                    diff_content=f.get("patch", "")  # Patch might be missing for binary/large files
+                )
+                for f in files_data
+            ])
+            
+            # If we got fewer than per_page, we've reached the last page
+            if len(files_data) < per_page:
+                break
+                
+            page += 1
+        
+        return all_files
 
     def _fetch_issue_comments(self, owner: str, repo: str, pr_number: int) -> List[PRComment]:
         """Fetch general conversation comments from the issues endpoint.