feat: add self-reflection step before presenting roadmap

jwm4 · jwm4 · commit 25acda50e61b · 2026-01-06T16:27:20.000-05:00
Implements the self-reflection pattern from issue #6, where the agent reviews its own output before presenting it to users. This catches issues like missing files, generic advice, or unclear reasoning.

Changes:
- Add reflect_on_roadmap node that evaluates the generated roadmap
- Add conditional routing to retry roadmap generation if issues are found
- Cap reflection at 2 iterations (i.e. at most one retry) to prevent infinite loops
- Add --no-reflection flag to skip reflection for faster results
- Update draft_roadmap to incorporate feedback on retries

The reflection step checks for:
- Completeness (all files mentioned)
- Logical review order
- Specificity (not generic boilerplate)
- Accuracy of summaries

Closes #6
diff --git a/README.md b/README.md
@@ -7,6 +7,7 @@ A CLI tool that uses LLMs (Claude) to generate a structured, human-friendly road
 - **Topology Analysis**: Groups changed files into logical components (e.g., API, DB, Frontend).
 - **Deep Linking**: Generates links to specific lines of code in the PR.
 - **Review Guidance**: Suggests a logical order for reviewing files.
+- **Self-Reflection**: Reviews its own output before presenting, catching issues and improving quality.
 - **Integration**: Fetches PR metadata, diffs, and existing comments from GitHub.
 
 ## Installation
@@ -66,6 +67,7 @@ review_roadmap {PR link in the form owner/repo/pr_number or just a URL to the PR
 |--------|-------------|
 | `--output`, `-o` | Save the roadmap to a file instead of printing to stdout |
 | `--post`, `-p` | Post the roadmap as a comment directly on the PR |
+| `--no-reflection` | Skip the self-reflection step for faster results |
 
 You can use both `-o` and `-p` together—the roadmap will be generated once and saved to both the file and the PR comment.
 
@@ -91,6 +93,12 @@ Generate and both save to file and post to PR:
 review_roadmap https://github.com/llamastack/llama-stack/pull/3674 -o roadmap.md -p
 ```
 
+Generate a roadmap without self-reflection (faster, but the result may be lower quality):
+
+```bash
+review_roadmap https://github.com/llamastack/llama-stack/pull/3674 --no-reflection
+```
+
 ## Development
 
 ```bash
@@ -109,5 +117,8 @@ pytest -v
 The tool uses **LangGraph** to orchestrate the workflow:
 
 1. **Analyze Structure**: LLM analyzes file paths to understand component groups.
-2. **Context Expansion**: (Planned) Fetches additional file content if diffs are ambiguous.
+2. **Context Expansion**: Fetches additional file content if diffs are ambiguous.
 3. **Draft Roadmap**: Synthesizes metadata, diffs, and comments into a coherent guide.
+4. **Self-Reflection**: Reviews the generated roadmap for completeness and accuracy, retrying if needed.
+
+The self-reflection step implements the [self-review pattern](https://github.com/jeremyeder/reference/blob/main/docs/patterns/self-review-reflection.md), where the agent evaluates its own output before presenting it to users. This catches issues like missing files, generic advice, or unclear reasoning—improving quality without manual review.
diff --git a/review_roadmap/agent/graph.py b/review_roadmap/agent/graph.py
@@ -8,16 +8,63 @@
 from langgraph.graph.state import CompiledStateGraph
 
 from review_roadmap.agent.state import ReviewState
-from review_roadmap.agent.nodes import analyze_structure, context_expansion, draft_roadmap
+from review_roadmap.agent.nodes import (
+    analyze_structure,
+    context_expansion,
+    draft_roadmap,
+    reflect_on_roadmap,
+)
+from review_roadmap.agent.prompts import MAX_REFLECTION_ITERATIONS
+from review_roadmap.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def _should_reflect(state: ReviewState) -> str:
+    """Route the workflow after a roadmap draft has been produced.
+
+    Args:
+        state: Current workflow state; only ``skip_reflection`` is read.
+
+    Returns:
+        'end' when the user disabled reflection, otherwise 'reflect'.
+    """
+    if not state.skip_reflection:
+        return "reflect"
+    logger.info("reflection_skipped", reason="disabled by user")
+    return "end"
+
+
+def _after_reflection(state: ReviewState) -> str:
+    """Route the workflow once a reflection pass has finished.
+
+    Args:
+        state: Current workflow state carrying the reflection verdict and
+            the number of reflection passes completed so far.
+
+    Returns:
+        'end' when the roadmap passed or the iteration cap was reached,
+        'retry' to send the roadmap back for another draft.
+    """
+    out_of_budget = state.reflection_iterations >= MAX_REFLECTION_ITERATIONS
+    if not state.reflection_passed and out_of_budget:
+        logger.warning("max_reflection_iterations_reached",
+                      iterations=state.reflection_iterations)
+    return "end" if state.reflection_passed or out_of_budget else "retry"
 
 
 def build_graph() -> CompiledStateGraph:
     """Build and compile the LangGraph workflow for review roadmap generation.
 
-    The workflow consists of three sequential nodes:
+    The workflow consists of four nodes with conditional routing:
     1. analyze_structure: Groups changed files into logical components
     2. context_expansion: Optionally fetches additional file content for context
     3. draft_roadmap: Generates the final Markdown roadmap
+    4. reflect_on_roadmap: Self-reviews the roadmap and may trigger a retry
+
+    The reflection step can be skipped by setting skip_reflection=True in the
+    initial state. If reflection fails, the workflow loops back to draft_roadmap
+    with feedback, up to MAX_REFLECTION_ITERATIONS times.
 
     Returns:
         A compiled LangGraph that can be invoked with a ReviewState containing
@@ -27,18 +74,41 @@ def build_graph() -> CompiledStateGraph:
         >>> graph = build_graph()
         >>> result = graph.invoke({"pr_context": pr_context})
         >>> roadmap = result["roadmap"]
+        
+        # Skip reflection:
+        >>> result = graph.invoke({"pr_context": pr_context, "skip_reflection": True})
     """
     workflow = StateGraph(ReviewState)
 
     # Add Nodes
     workflow.add_node("analyze_structure", analyze_structure)
     workflow.add_node("context_expansion", context_expansion)
     workflow.add_node("draft_roadmap", draft_roadmap)
+    workflow.add_node("reflect_on_roadmap", reflect_on_roadmap)
 
     # Define Edges
     workflow.set_entry_point("analyze_structure")
     workflow.add_edge("analyze_structure", "context_expansion")
     workflow.add_edge("context_expansion", "draft_roadmap")
-    workflow.add_edge("draft_roadmap", END)
+    
+    # After draft_roadmap, decide whether to reflect or skip
+    workflow.add_conditional_edges(
+        "draft_roadmap",
+        _should_reflect,
+        {
+            "reflect": "reflect_on_roadmap",
+            "end": END,
+        }
+    )
+    
+    # After reflection, decide whether to retry or finish
+    workflow.add_conditional_edges(
+        "reflect_on_roadmap",
+        _after_reflection,
+        {
+            "retry": "draft_roadmap",
+            "end": END,
+        }
+    )
 
     return workflow.compile()
diff --git a/review_roadmap/agent/nodes.py b/review_roadmap/agent/nodes.py
@@ -112,6 +112,7 @@ def _get_llm_instance() -> BaseChatModel:
     ANALYZE_STRUCTURE_SYSTEM_PROMPT,
     CONTEXT_EXPANSION_SYSTEM_PROMPT,
     DRAFT_ROADMAP_SYSTEM_PROMPT,
+    REFLECT_ON_ROADMAP_SYSTEM_PROMPT,
 )
 from review_roadmap.agent.tools import read_file
 
@@ -318,13 +319,17 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     file analysis, topology, comments, fetched content) into a structured
     roadmap with deep links to guide the reviewer.
 
+    If reflection feedback is present (from a previous iteration), it is
+    included in the prompt to guide improvements.
+
     Args:
         state: Current workflow state with all accumulated context.
 
     Returns:
         Dict with 'roadmap' key containing the Markdown roadmap string.
     """
-    logger.info("node_started", node="draft_roadmap")
+    logger.info("node_started", node="draft_roadmap", 
+                iteration=state.reflection_iterations)
     
     files_context = _build_files_context(state)
     comments_context = _build_comments_context(state)
@@ -333,6 +338,15 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     repo_url = state.pr_context.metadata.repo_url
     pr_number = state.pr_context.metadata.number
     
+    # Include reflection feedback if this is a retry
+    feedback_section = ""
+    if state.reflection_feedback:
+        feedback_section = f"""
+    
+    ## Self-Review Feedback (address these issues in your revision)
+    {state.reflection_feedback}
+    """
+    
     context_str = f"""
     Title: {state.pr_context.metadata.title}
     Description: {state.pr_context.metadata.description}
@@ -349,6 +363,7 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     Existing Comments:
     {chr(10).join(comments_context) if comments_context else "No comments found."}
     {fetched_context_str}
+    {feedback_section}
     """
     
     prompt = ChatPromptTemplate.from_messages([
@@ -360,3 +375,71 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     response = chain.invoke({"context": context_str})
     
     return {"roadmap": response.content}
+
+
+def _parse_reflection_verdict(content: Any) -> Dict[str, Any]:
+    """Decode the JSON verdict object from the reflection model's reply.
+
+    The system prompt shows its example answers inside ```json fences, so the
+    reply may arrive fenced or with surrounding prose; only the outermost
+    ``{...}`` span is decoded.
+
+    Args:
+        content: The ``content`` of the LLM response.
+
+    Returns:
+        The decoded JSON object.
+
+    Raises:
+        ValueError: If no JSON object can be decoded from the reply
+            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
+    """
+    import json
+
+    if isinstance(content, list):
+        # NOTE(review): some chat models return a list of content parts rather
+        # than a plain string; text-bearing parts are assumed to be str or
+        # {"text": ...} dicts - confirm against the configured provider.
+        content = "".join(
+            part if isinstance(part, str) else str(part.get("text", ""))
+            for part in content
+            if isinstance(part, (str, dict))
+        )
+    text = str(content)
+    start, end = text.find("{"), text.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError("no JSON object found in reflection response")
+    # The slice starts at "{", so a successful decode is always a dict.
+    return json.loads(text[start:end + 1])
+
+
+def reflect_on_roadmap(state: ReviewState) -> Dict[str, Any]:
+    """Self-review the generated roadmap before presenting it to the user.
+
+    Sends the PR title, the changed-file list, the current roadmap and any
+    earlier feedback to the LLM, then turns its JSON verdict into state
+    updates. A reply that cannot be decoded is treated as a pass (fail-open)
+    so that a malformed self-review never blocks delivery of the roadmap.
+
+    Args:
+        state: Current workflow state with the generated roadmap.
+
+    Returns:
+        Dict with reflection results:
+        - reflection_passed: Whether the roadmap passed review
+        - reflection_feedback: Specific feedback if failed
+        - reflection_iterations: Incremented iteration count
+    """
+    logger.info("node_started", node="reflect_on_roadmap",
+                iteration=state.reflection_iterations)
+
+    # Build context for reflection
+    files_list = "\n".join(f"- {f.path}" for f in state.pr_context.files)
+
+    context_str = f"""## PR Context
+Title: {state.pr_context.metadata.title}
+Changed Files:
+{files_list}
+
+## Generated Roadmap
+{state.roadmap}
+
+## Previous Feedback (if any)
+{state.reflection_feedback or "None - first review"}
+"""
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", REFLECT_ON_ROADMAP_SYSTEM_PROMPT),
+        ("human", "{context}")
+    ])
+
+    chain = prompt | _get_llm_instance()
+    response = chain.invoke({"context": context_str})
+
+    try:
+        verdict = _parse_reflection_verdict(response.content)
+    except ValueError:
+        # If the LLM didn't return a decodable JSON object, assume it passed
+        logger.warning("reflection_response_not_json",
+                       content=str(response.content)[:200])
+        passed = True
+        feedback = ""
+        notes = "Self-review: completed (non-JSON response)"
+    else:
+        raw_passed = verdict.get("passed", False)
+        # Tolerate a quoted boolean ("true"/"false"): a bare truthiness test
+        # would read the non-empty string "false" as a pass.
+        if isinstance(raw_passed, str):
+            passed = raw_passed.strip().lower() == "true"
+        else:
+            passed = bool(raw_passed)
+        # Coerce to str so a null or non-string value cannot break the
+        # str-typed state fields.
+        feedback = str(verdict.get("feedback") or "")
+        notes = str(verdict.get("notes") or "")
+
+    if passed:
+        logger.info("reflection_passed", notes=notes)
+    else:
+        logger.info("reflection_failed", feedback=feedback)
+
+    return {
+        "reflection_passed": passed,
+        "reflection_feedback": feedback,
+        "reflection_iterations": state.reflection_iterations + 1,
+    }
diff --git a/review_roadmap/agent/prompts.py b/review_roadmap/agent/prompts.py
@@ -1,5 +1,8 @@
 # System Prompts for the Agent
 
+# Maximum number of reflection iterations before accepting the roadmap.
+# reflect_on_roadmap increments the counter on every pass, so a value of 2
+# allows the initial review plus one retry of draft_roadmap.
+MAX_REFLECTION_ITERATIONS = 2
+
 ANALYZE_STRUCTURE_SYSTEM_PROMPT = """You are a Senior Software Architect.
 
 Analyze the list of changed files and group them into logical components (e.g., 'Backend API', 'Frontend Components', 'Database Schema', 'Configuration').
@@ -42,3 +45,30 @@
 
 Do not be generic. Be specific to the file paths and names provided.
 """
+
+# NOTE: reflect_on_roadmap passes this text to ChatPromptTemplate as a message
+# template, so the literal JSON braces below are doubled; single braces would
+# be parsed as template variables and the prompt would fail to format.
+REFLECT_ON_ROADMAP_SYSTEM_PROMPT = """You are a Senior Staff Engineer reviewing a PR review roadmap before it's shown to a human reviewer.
+
+## Your Task
+Critically evaluate this roadmap from the perspective of someone who will use it to review the PR.
+
+## Checklist
+1. **Completeness**: Are all changed files mentioned? Is anything important missing?
+2. **Logical Order**: Does the suggested review order make sense? Would a reviewer get confused?
+3. **Specificity**: Are the "watch outs" specific to THIS PR, or generic boilerplate?
+4. **Deep Links**: Are file references actionable (include links where provided)?
+5. **Accuracy**: Do the summaries match the actual file changes described?
+6. **Assumptions**: Are there unstated assumptions that should be made explicit?
+
+## Response Format
+If the roadmap passes review, respond with EXACTLY this JSON:
+```json
+{{"passed": true, "notes": "Self-review: [brief note on quality]"}}
+```
+
+If issues need fixing, respond with EXACTLY this JSON:
+```json
+{{"passed": false, "feedback": "[specific issues to fix, be concise]"}}
+```
+
+Be rigorous but not pedantic. Only fail roadmaps with genuine issues that would confuse or mislead a reviewer.
+"""
diff --git a/review_roadmap/agent/state.py b/review_roadmap/agent/state.py
@@ -20,13 +20,18 @@ class ReviewState(BaseModel):
     2. analyze_structure: populates topology
     3. context_expansion: populates fetched_content (if needed)
     4. draft_roadmap: populates roadmap (final output)
+    5. reflect_on_roadmap: self-reviews and optionally triggers retry
 
     Attributes:
         pr_context: Input PR data including metadata, files, and comments.
         topology: Analysis of file groupings from the structure analysis node.
         required_context: File paths identified for fetching (intermediate).
         fetched_content: Additional file contents fetched for context.
         roadmap: The final generated Markdown roadmap (output).
+        reflection_feedback: Feedback from self-reflection step for improvements.
+        reflection_passed: Whether the roadmap passed self-review.
+        reflection_iterations: Number of reflection iterations completed.
+        skip_reflection: Whether to skip the self-reflection step entirely.
     """
 
     # Input
@@ -45,3 +50,17 @@ class ReviewState(BaseModel):
 
     # Output
     roadmap: str = ""
+
+    # Reflection
+    reflection_feedback: str = Field(
+        default="", description="Feedback from self-reflection step"
+    )
+    reflection_passed: bool = Field(
+        default=False, description="Whether the roadmap passed self-review"
+    )
+    reflection_iterations: int = Field(
+        default=0, description="Number of reflection iterations completed"
+    )
+    skip_reflection: bool = Field(
+        default=False, description="Whether to skip the self-reflection step"
+    )
diff --git a/review_roadmap/main.py b/review_roadmap/main.py
@@ -36,7 +36,8 @@ def format_pr_comment(roadmap_content: str) -> str:
 def generate(
     pr_url: str = typer.Argument(..., help="GitHub PR URL (e.g., owner/repo/123) or 'owner/repo/123' string"),
     output: str = typer.Option(None, "--output", "-o", help="Output file for the roadmap"),
-    post: bool = typer.Option(False, "--post", "-p", help="Post the roadmap as a comment on the PR")
+    post: bool = typer.Option(False, "--post", "-p", help="Post the roadmap as a comment on the PR"),
+    no_reflection: bool = typer.Option(False, "--no-reflection", help="Skip the self-reflection step")
 ):
     """Generates a review roadmap for a given Pull Request."""
     
@@ -87,9 +88,13 @@ def generate(
     console.print(f"[green]Found PR: {pr_context.metadata.title} (Changed files: {len(pr_context.files)})[/green]")
 
     # 2. Run LangGraph
-    console.print("[bold purple]Generating Roadmap (this may take a minute)...[/bold purple]")
+    if no_reflection:
+        console.print("[bold purple]Generating Roadmap (reflection disabled)...[/bold purple]")
+    else:
+        console.print("[bold purple]Generating Roadmap with self-reflection (this may take a minute)...[/bold purple]")
+    
     graph = build_graph()
-    initial_state = {"pr_context": pr_context}
+    initial_state = {"pr_context": pr_context, "skip_reflection": no_reflection}
     
     result = graph.invoke(initial_state)
     roadmap_content = result.get("roadmap", "")
diff --git a/tests/conftest.py b/tests/conftest.py
diff --git a/tests/test_agent_graph.py b/tests/test_agent_graph.py
diff --git a/tests/test_agent_nodes.py b/tests/test_agent_nodes.py