Merge pull request #75 from claytonlin1110/feat/save-prompts-run-artifacts

dippatel1994 · web-flow · commit 22deffccc2e6 · 2026-03-08T20:39:39.000-04:00
feat: persist formatted agent prompts in run outputs
diff --git a/paperbanana/agents/base.py b/paperbanana/agents/base.py
@@ -20,9 +20,15 @@ class BaseAgent(ABC):
     a specific role in the generation process.
     """
 
-    def __init__(self, vlm_provider: VLMProvider, prompt_dir: str = "prompts"):
+    def __init__(
+        self,
+        vlm_provider: VLMProvider,
+        prompt_dir: str = "prompts",
+        prompt_recorder: Any | None = None,
+    ):
         self.vlm = vlm_provider
         self.prompt_dir = Path(prompt_dir)
+        self._prompt_recorder = prompt_recorder
 
     @property
     @abstractmethod
@@ -50,5 +56,23 @@ def load_prompt(self, diagram_type: str = "diagram") -> str:
         return path.read_text(encoding="utf-8")
 
     def format_prompt(self, template: str, **kwargs: Any) -> str:
-        """Format a prompt template with the given values."""
-        return template.format(**kwargs)
+        """Format a prompt template with the given values.
+
+        If a prompt recorder is configured, the formatted prompt is also passed to it
+        for saving in the active run directory (best-effort; failures are only logged).
+        """
+        # Reserved internal argument (not forwarded into template.format()).
+        prompt_label = kwargs.pop("prompt_label", None)
+
+        formatted = template.format(**kwargs)
+        if self._prompt_recorder is not None:
+            try:
+                self._prompt_recorder.record(
+                    agent_name=self.agent_name,
+                    label=str(prompt_label) if prompt_label else None,
+                    prompt=formatted,
+                )
+            except Exception:
+                # Recording is best-effort; do not break generation on I/O issues.
+                logger.warning("Prompt recording failed", agent=self.agent_name, label=prompt_label)
+        return formatted
diff --git a/paperbanana/agents/critic.py b/paperbanana/agents/critic.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+import re
 from typing import Optional
 
 import structlog
@@ -22,8 +23,10 @@ class CriticAgent(BaseAgent):
     faithfulness, conciseness, readability, and aesthetic issues.
     """
 
-    def __init__(self, vlm_provider: VLMProvider, prompt_dir: str = "prompts"):
-        super().__init__(vlm_provider, prompt_dir)
+    def __init__(
+        self, vlm_provider: VLMProvider, prompt_dir: str = "prompts", prompt_recorder=None
+    ):
+        super().__init__(vlm_provider, prompt_dir, prompt_recorder=prompt_recorder)
 
     @property
     def agent_name(self) -> str:
@@ -56,8 +59,9 @@ async def run(
 
         prompt_type = "diagram" if diagram_type == DiagramType.METHODOLOGY else "plot"
         template = self.load_prompt(prompt_type)
-        prompt = self.format_prompt(
-            template,
+        prompt_label = self._prompt_label_from_image_path(image_path) or "critic"
+        # Build prompt manually so we record once after appending user_feedback.
+        prompt = template.format(
             source_context=source_context,
             caption=caption,
             description=description,
@@ -68,6 +72,17 @@ async def run(
                 f"\n\nAdditional user feedback to consider in your evaluation:\n{user_feedback}"
             )
 
+        # Record the exact prompt sent to the model (including user_feedback in continue-run flows)
+        if self._prompt_recorder is not None:
+            try:
+                self._prompt_recorder.record(
+                    agent_name=self.agent_name,
+                    label=prompt_label,
+                    prompt=prompt,
+                )
+            except Exception:
+                logger.warning("Prompt recording failed", agent=self.agent_name, label=prompt_label)
+
         logger.info("Running critic agent", image_path=image_path)
 
         response = await self.vlm.generate(
@@ -86,6 +101,14 @@ async def run(
         )
         return critique
 
+    @staticmethod
+    def _prompt_label_from_image_path(image_path: str) -> str | None:
+        """Best-effort label (e.g. critic_iter_3) derived from output image filename."""
+        m = re.search(r"(?:diagram|plot)_iter_(\d+)\.", image_path)
+        if not m:
+            return None
+        return f"critic_iter_{m.group(1)}"
+
     def _parse_response(self, response: str) -> CritiqueResult:
         """Parse the VLM response into a CritiqueResult."""
         try:
diff --git a/paperbanana/agents/optimizer.py b/paperbanana/agents/optimizer.py
@@ -26,8 +26,10 @@ class InputOptimizerAgent(BaseAgent):
     - Caption sharpener: converts vague intent into precise visual specification
     """
 
-    def __init__(self, vlm_provider: VLMProvider, prompt_dir: str = "prompts"):
-        super().__init__(vlm_provider, prompt_dir)
+    def __init__(
+        self, vlm_provider: VLMProvider, prompt_dir: str = "prompts", prompt_recorder=None
+    ):
+        super().__init__(vlm_provider, prompt_dir, prompt_recorder=prompt_recorder)
 
     @property
     def agent_name(self) -> str:
@@ -57,11 +59,13 @@ async def run(
 
         context_prompt = self.format_prompt(
             context_template,
+            prompt_label="context_enricher",
             source_context=source_context,
             caption=caption,
         )
         caption_prompt = self.format_prompt(
             caption_template,
+            prompt_label="caption_sharpener",
             source_context=source_context,
             caption=caption,
         )
diff --git a/paperbanana/agents/planner.py b/paperbanana/agents/planner.py
@@ -30,8 +30,10 @@ class PlannerAgent(BaseAgent):
     can render. Matches paper equation 4: P = VLM_plan(S, C, {(S_i, C_i, I_i)}).
     """
 
-    def __init__(self, vlm_provider: VLMProvider, prompt_dir: str = "prompts"):
-        super().__init__(vlm_provider, prompt_dir)
+    def __init__(
+        self, vlm_provider: VLMProvider, prompt_dir: str = "prompts", prompt_recorder=None
+    ):
+        super().__init__(vlm_provider, prompt_dir, prompt_recorder=prompt_recorder)
 
     @property
     def agent_name(self) -> str:
@@ -70,6 +72,7 @@ async def run(
         ratios_str = ", ".join(supported_ratios) if supported_ratios else "1:1, 16:9"
         prompt = self.format_prompt(
             template,
+            prompt_label="planner",
             source_context=source_context,
             caption=caption,
             examples=examples_text,
diff --git a/paperbanana/agents/retriever.py b/paperbanana/agents/retriever.py
@@ -20,8 +20,10 @@ class RetrieverAgent(BaseAgent):
     reference examples are most useful for generating the target diagram.
     """
 
-    def __init__(self, vlm_provider: VLMProvider, prompt_dir: str = "prompts"):
-        super().__init__(vlm_provider, prompt_dir)
+    def __init__(
+        self, vlm_provider: VLMProvider, prompt_dir: str = "prompts", prompt_recorder=None
+    ):
+        super().__init__(vlm_provider, prompt_dir, prompt_recorder=prompt_recorder)
 
     @property
     def agent_name(self) -> str:
@@ -68,6 +70,7 @@ async def run(
         template = self.load_prompt(prompt_type)
         prompt = self.format_prompt(
             template,
+            prompt_label="retriever",
             source_context=source_context,
             caption=caption,
             candidates=candidates_text,
diff --git a/paperbanana/agents/stylist.py b/paperbanana/agents/stylist.py
@@ -23,8 +23,9 @@ def __init__(
         vlm_provider: VLMProvider,
         guidelines: str = "",
         prompt_dir: str = "prompts",
+        prompt_recorder=None,
     ):
-        super().__init__(vlm_provider, prompt_dir)
+        super().__init__(vlm_provider, prompt_dir, prompt_recorder=prompt_recorder)
         self.guidelines = guidelines
 
     @property
@@ -59,6 +60,7 @@ async def run(
         template = self.load_prompt(prompt_type)
         prompt = self.format_prompt(
             template,
+            prompt_label="stylist",
             description=description,
             guidelines=style_guidelines,
             source_context=source_context,
diff --git a/paperbanana/agents/visualizer.py b/paperbanana/agents/visualizer.py
@@ -33,8 +33,9 @@ def __init__(
         vlm_provider: VLMProvider,
         prompt_dir: str = "prompts",
         output_dir: str = "outputs",
+        prompt_recorder=None,
     ):
-        super().__init__(vlm_provider, prompt_dir)
+        super().__init__(vlm_provider, prompt_dir, prompt_recorder=prompt_recorder)
         self.image_gen = image_gen
         self.output_dir = Path(output_dir)
 
@@ -89,7 +90,11 @@ async def _generate_diagram(
     ) -> str:
         """Generate a methodology diagram using the image generation model."""
         template = self.load_prompt("diagram")
-        prompt = self.format_prompt(template, description=description)
+        prompt = self.format_prompt(
+            template,
+            prompt_label=f"visualizer_diagram_iter_{iteration}",
+            description=description,
+        )
 
         logger.info("Generating diagram image", iteration=iteration)
 
@@ -144,7 +149,11 @@ async def _generate_plot(
 
         # Load and format the plot visualizer prompt template
         template = self.load_prompt("plot")
-        code_prompt = self.format_prompt(template, description=full_description)
+        code_prompt = self.format_prompt(
+            template,
+            prompt_label=f"visualizer_plot_iter_{iteration}",
+            description=full_description,
+        )
 
         logger.info("Generating plot code", iteration=iteration)
 
diff --git a/paperbanana/cli.py b/paperbanana/cli.py
@@ -82,6 +82,11 @@ def generate(
         help="Output image format (png, jpeg, or webp)",
     ),
     config: Optional[str] = typer.Option(None, "--config", help="Path to config YAML file"),
+    save_prompts: Optional[bool] = typer.Option(
+        None,
+        "--save-prompts/--no-save-prompts",
+        help="Save formatted prompts into the run directory (for debugging)",
+    ),
     dry_run: bool = typer.Option(
         False,
         "--dry-run",
@@ -165,6 +170,8 @@ def generate(
         overrides["max_iterations"] = max_iterations
     if optimize:
         overrides["optimize_inputs"] = True
+    if save_prompts is not None:
+        overrides["save_prompts"] = save_prompts
     if output:
         overrides["output_dir"] = str(Path(output).parent)
     overrides["output_format"] = format
@@ -488,6 +495,11 @@ def plot(
     auto: bool = typer.Option(
         False, "--auto", help="Let critic loop until satisfied (max 30 iterations)"
     ),
+    save_prompts: Optional[bool] = typer.Option(
+        None,
+        "--save-prompts/--no-save-prompts",
+        help="Save formatted prompts into the run directory (for debugging)",
+    ),
 ):
     """Generate a statistical plot from data."""
     if format not in ("png", "jpeg", "webp"):
@@ -527,6 +539,7 @@ def plot(
         output_format=format,
         optimize_inputs=optimize,
         auto_refine=auto,
+        save_prompts=True if save_prompts is None else save_prompts,
     )
 
     gen_input = GenerationInput(
diff --git a/paperbanana/core/config.py b/paperbanana/core/config.py
@@ -88,6 +88,7 @@ class Settings(BaseSettings):
     output_dir: str = "outputs"
     output_format: OutputFormat = "png"
     save_iterations: bool = True
+    save_prompts: bool = True
 
     # API Keys (loaded from environment)
     google_api_key: Optional[str] = Field(default=None, alias="GOOGLE_API_KEY")
@@ -208,6 +209,7 @@ def _flatten_yaml(config: dict, prefix: str = "") -> dict:
         "output.dir": "output_dir",
         "output.format": "output_format",
         "output.save_iterations": "save_iterations",
+        "output.save_prompts": "save_prompts",
     }
 
     def _recurse(d: dict, prefix: str = "") -> None:
diff --git a/paperbanana/core/pipeline.py b/paperbanana/core/pipeline.py
@@ -16,6 +16,7 @@
 from paperbanana.agents.stylist import StylistAgent
 from paperbanana.agents.visualizer import VisualizerAgent
 from paperbanana.core.config import Settings
+from paperbanana.core.prompt_recorder import PromptRecorder
 from paperbanana.core.types import (
     DiagramType,
     GenerationInput,
@@ -117,6 +118,11 @@ def __init__(
         if self.settings.skip_ssl_verification:
             _apply_ssl_skip()
 
+        # Prompt recorder (writes formatted prompts to outputs/<run_id>/prompts/)
+        self._prompt_recorder = None
+        if self.settings.save_prompts:
+            self._prompt_recorder = PromptRecorder(run_dir_provider=lambda: self._run_dir)
+
         # Initialize providers
         if vlm_client is not None:
             # Demo mode: use provided clients
@@ -145,19 +151,31 @@ def __init__(
 
         # Initialize agents
         prompt_dir = self._find_prompt_dir()
-        self.optimizer = InputOptimizerAgent(self._vlm, prompt_dir=prompt_dir)
-        self.retriever = RetrieverAgent(self._vlm, prompt_dir=prompt_dir)
-        self.planner = PlannerAgent(self._vlm, prompt_dir=prompt_dir)
+        self.optimizer = InputOptimizerAgent(
+            self._vlm, prompt_dir=prompt_dir, prompt_recorder=self._prompt_recorder
+        )
+        self.retriever = RetrieverAgent(
+            self._vlm, prompt_dir=prompt_dir, prompt_recorder=self._prompt_recorder
+        )
+        self.planner = PlannerAgent(
+            self._vlm, prompt_dir=prompt_dir, prompt_recorder=self._prompt_recorder
+        )
         self.stylist = StylistAgent(
-            self._vlm, guidelines=self._methodology_guidelines, prompt_dir=prompt_dir
+            self._vlm,
+            guidelines=self._methodology_guidelines,
+            prompt_dir=prompt_dir,
+            prompt_recorder=self._prompt_recorder,
         )
         self.visualizer = VisualizerAgent(
             self._image_gen,
             self._vlm,
             prompt_dir=prompt_dir,
             output_dir=str(self._run_dir),
+            prompt_recorder=self._prompt_recorder,
+        )
+        self.critic = CriticAgent(
+            self._vlm, prompt_dir=prompt_dir, prompt_recorder=self._prompt_recorder
         )
-        self.critic = CriticAgent(self._vlm, prompt_dir=prompt_dir)
 
         logger.info(
             "Pipeline initialized",
diff --git a/paperbanana/core/prompt_recorder.py b/paperbanana/core/prompt_recorder.py
diff --git a/tests/test_pipeline/test_save_prompts.py b/tests/test_pipeline/test_save_prompts.py