Merge pull request #7 from gaussian/develop

kut · web-flow · commit 8f2e4e155701 · 2026-02-23T12:09:45.000-05:00
Add structured logging, action timeout, and LLM retry dedup
diff --git a/.env.example b/.env.example
@@ -0,0 +1 @@
+OPENAI_API_KEY=sk-...
diff --git a/.gitignore b/.gitignore
@@ -206,7 +206,8 @@ marimo/_static/
 marimo/_lsp/
 __marimo__/
 
-# shots outputs
+# shots config & outputs
+shots.yaml
 shots_out/
 report.json
 storage_state.json
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,6 @@
+# Agents
+
+## Git
+
+- Always stage and commit in a single command: `git add file1 file2 && git commit -m "message"`
+- Run git commands from the working directory directly — no `cd` or `-C` flags
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1 @@
+AGENTS.md
diff --git a/README.md b/README.md
@@ -14,6 +14,15 @@ pip install -e ".[llm,yaml]"
 playwright install chromium
 ```
 
+## Setup
+
+```bash
+cp shots.yaml.example shots.yaml   # edit with your app's URL and shots
+cp .env.example .env                # add your OPENAI_API_KEY
+```
+
+Both `shots.yaml` and `.env` are gitignored.
+
 ## 1) One-time manual login
 
 ```bash
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,10 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.uv]
+package = true
+
 [project]
 name = "shots"
 version = "0.1.0"
diff --git a/shots.yaml.example b/shots.yaml.example
@@ -12,8 +12,8 @@ shots:
     description: >
       Capture the main dashboard with KPI cards and a chart visible.
       Close any modal, cookie banner, or tour overlay. If needed, use the left nav to reach Dashboard.
-  - id: wizard-step-4
+
+  - id: integrations
     description: >
-      Open the onboarding wizard and click the red button 3 times to reach step 4 summary,
-      then stop when the summary screen is visible and presentable.
+      Show the Settings -> Integrations page, listing the available integrations.
     viewport_preset: laptop
diff --git a/shots/cli.py b/shots/cli.py
@@ -10,13 +10,15 @@
 
 
 def _add_common_run_flags(p: argparse.ArgumentParser) -> None:
-    p.add_argument("--out-dir", default="shots_out")
+    p.add_argument("--out-dir", default=None, help="Output directory (overrides config out_dir, default: shots_out).")
     p.add_argument("--headed", action="store_true", help="Show the browser (debug).")
-    p.add_argument("--timeout-ms", type=int, default=45_000)
+    p.add_argument("--timeout-ms", type=int, default=10_000, help="Page-load/navigation timeout.")
+    p.add_argument("--action-timeout-ms", type=int, default=5_000, help="Timeout for clicks/typing (fail fast).")
 
     p.add_argument("--use-llm", action="store_true", help="Enable LLM multi-step navigation to acquire each shot.")
-    p.add_argument("--model", default="gpt-4.1")
+    p.add_argument("--model", default="gpt-5.2")
     p.add_argument("--use-llm-crop", action="store_true", help="Use LLM to choose a crop box.")
+    p.add_argument("--max-crop-retries", type=int, default=2, help="Max crop validation retries (default: 2).")
     p.add_argument("--save-source", action="store_true", help="Save uncropped source images too.")
 
 
@@ -77,7 +79,9 @@ def cmd_login(args) -> None:
 
 def cmd_run_config(args) -> None:
     cfg = load_config(args.config)
-    out_dir = pathlib.Path(args.out_dir).resolve()
+    # CLI --out-dir overrides config out_dir
+    out_dir_str = args.out_dir if args.out_dir is not None else cfg.out_dir
+    out_dir = pathlib.Path(out_dir_str).resolve()
 
     w, h, scale, full_page = _resolve_cli_viewport(args)
     fallback = viewport_from_values(w, h, scale, full_page=full_page)
@@ -86,10 +90,12 @@ def cmd_run_config(args) -> None:
         cfg=cfg,
         out_dir=out_dir,
         timeout_ms=args.timeout_ms,
+        action_timeout_ms=args.action_timeout_ms,
         headed=args.headed,
         use_llm=args.use_llm,
         model=args.model,
         use_llm_crop=args.use_llm_crop,
+        max_crop_retries=args.max_crop_retries,
         save_source=args.save_source,
         cli_fallback_viewport=fallback,
     )
diff --git a/shots/config.py b/shots/config.py
@@ -19,14 +19,26 @@ class ShotSpec:
     viewport_preset: str | None = None
     viewport: dict[str, int] | None = None  # width/height/scale
     full_page: bool | None = None
+    label: str | None = None  # per-shot label override
+
+
+@dataclass
+class ShotGroup:
+    id: str
+    shots: list[ShotSpec]
+    output: str = "png"  # "png" or "pdf"
+    label: str | None = None  # template string applied to all shots
+    label_date: bool = False  # add date/time line below the label
+    folder: str | None = None  # override subfolder name (defaults to id)
 
 
 @dataclass
 class RunConfig:
     base_url: str
     start: str
     defaults: dict[str, Any]
-    shots: list[ShotSpec]
+    groups: list[ShotGroup]
+    out_dir: str = "shots_out"
 
 
 def _require_str(obj: dict[str, Any], key: str) -> str:
@@ -35,6 +47,28 @@ def _require_str(obj: dict[str, Any], key: str) -> str:
     return obj[key].strip()
 
 
+def _parse_shot(s: dict[str, Any], ctx: str) -> ShotSpec:
+    """Parse a single shot dict into a ShotSpec."""
+    if not isinstance(s, dict):
+        raise ValueError(f"{ctx} must be an object.")
+    sid = _require_str(s, "id")
+    desc = _require_str(s, "description")
+
+    viewport = s.get("viewport")
+    if viewport is not None and not isinstance(viewport, dict):
+        raise ValueError(f"{ctx}.viewport must be an object if provided.")
+
+    return ShotSpec(
+        id=sid,
+        description=desc,
+        url=str(s["url"]).strip() if s.get("url") else None,
+        viewport_preset=str(s["viewport_preset"]).strip() if s.get("viewport_preset") else None,
+        viewport={k: int(v) for k, v in viewport.items()} if viewport else None,
+        full_page=bool(s["full_page"]) if "full_page" in s else None,
+        label=str(s["label"]).strip().replace("\\n", "\n") if s.get("label") else None,
+    )
+
+
 def load_config(path: str) -> RunConfig:
     p = pathlib.Path(path).resolve()
     raw_text = p.read_text(encoding="utf-8")
@@ -51,34 +85,63 @@ def load_config(path: str) -> RunConfig:
 
     base_url = _require_str(data, "base_url").rstrip("/")
     start = str(data.get("start", "/")).strip() or "/"
+    out_dir = str(data.get("out_dir", "shots_out")).strip() or "shots_out"
     defaults = data.get("defaults", {}) or {}
     if not isinstance(defaults, dict):
         raise ValueError("defaults must be an object.")
 
-    shots_raw = data.get("shots", [])
-    if not isinstance(shots_raw, list) or not shots_raw:
-        raise ValueError("shots must be a non-empty list.")
-
-    shots: list[ShotSpec] = []
-    for idx, s in enumerate(shots_raw):
-        if not isinstance(s, dict):
-            raise ValueError(f"shots[{idx}] must be an object.")
-        sid = _require_str(s, "id")
-        desc = _require_str(s, "description")
-
-        viewport = s.get("viewport")
-        if viewport is not None and not isinstance(viewport, dict):
-            raise ValueError(f"shots[{idx}].viewport must be an object if provided.")
-
-        shots.append(
-            ShotSpec(
-                id=sid,
-                description=desc,
-                url=str(s["url"]).strip() if s.get("url") else None,
-                viewport_preset=str(s["viewport_preset"]).strip() if s.get("viewport_preset") else None,
-                viewport={k: int(v) for k, v in viewport.items()} if viewport else None,
-                full_page=bool(s["full_page"]) if "full_page" in s else None,
-            )
-        )
-
-    return RunConfig(base_url=base_url, start=start, defaults=defaults, shots=shots)
+    has_groups = "groups" in data
+    has_shots = "shots" in data
+
+    if has_groups and has_shots:
+        raise ValueError("Config cannot have both 'groups' and 'shots'. Use one or the other.")
+    if not has_groups and not has_shots:
+        raise ValueError("Config must have either 'groups' or 'shots'.")
+
+    groups: list[ShotGroup] = []
+
+    if has_groups:
+        groups_raw = data["groups"]
+        if not isinstance(groups_raw, list) or not groups_raw:
+            raise ValueError("groups must be a non-empty list.")
+
+        for gi, g in enumerate(groups_raw):
+            if not isinstance(g, dict):
+                raise ValueError(f"groups[{gi}] must be an object.")
+            gid = _require_str(g, "id")
+            output = str(g.get("output", "png")).strip().lower()
+            if output not in ("png", "pdf"):
+                raise ValueError(f"groups[{gi}].output must be 'png' or 'pdf', got '{output}'.")
+
+            shots_raw = g.get("shots", [])
+            if not isinstance(shots_raw, list) or not shots_raw:
+                raise ValueError(f"groups[{gi}].shots must be a non-empty list.")
+
+            shots = [_parse_shot(s, f"groups[{gi}].shots[{si}]") for si, s in enumerate(shots_raw)]
+
+            if output == "png" and len(shots) > 1:
+                raise ValueError(
+                    f"groups[{gi}] ('{gid}'): output='png' requires exactly 1 shot, got {len(shots)}. "
+                    "Use output='pdf' for multi-shot groups."
+                )
+
+            groups.append(ShotGroup(
+                id=gid,
+                shots=shots,
+                output=output,
+                label=str(g["label"]).strip().replace("\\n", "\n") if g.get("label") else None,
+                label_date=bool(g.get("label_date", False)),
+                folder=str(g["folder"]).strip() if g.get("folder") else None,
+            ))
+
+    else:
+        # Flat shots list — auto-wrap each into its own group
+        shots_raw = data["shots"]
+        if not isinstance(shots_raw, list) or not shots_raw:
+            raise ValueError("shots must be a non-empty list.")
+
+        for si, s in enumerate(shots_raw):
+            shot = _parse_shot(s, f"shots[{si}]")
+            groups.append(ShotGroup(id=shot.id, shots=[shot]))
+
+    return RunConfig(base_url=base_url, start=start, defaults=defaults, groups=groups, out_dir=out_dir)
diff --git a/shots/image_ops.py b/shots/image_ops.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 from io import BytesIO
 
-from PIL import Image
+from PIL import Image, ImageDraw, ImageFont
 
 
 def b64_png(png_bytes: bytes) -> str:
@@ -64,3 +64,81 @@ def crop_png(png_bytes: bytes, crop: Crop) -> bytes:
 def get_png_size(png_bytes: bytes) -> tuple[int, int]:
     im = Image.open(BytesIO(png_bytes))
     return im.size
+
+
+def _get_font(size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
+    """Try common system sans-serif fonts, fall back to Pillow default."""
+    candidates = [
+        "/System/Library/Fonts/Helvetica.ttc",
+        "/System/Library/Fonts/SFNSText.ttf",
+        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+        "/usr/share/fonts/TTF/DejaVuSans.ttf",
+        "arial.ttf",
+        "DejaVuSans.ttf",
+    ]
+    for path in candidates:
+        try:
+            return ImageFont.truetype(path, size)
+        except (OSError, IOError):
+            continue
+    return ImageFont.load_default(size=size)
+
+
+def add_label_banner(png_bytes: bytes, label_text: str, font_size: int = 32) -> bytes:
+    """
+    Add a white banner with black text below the image, separated by a black
+    line. Additive — does not crop into the screenshot, just extends the canvas
+    downward. Supports multiline text (e.g. label + date on separate lines).
+    """
+    im = Image.open(BytesIO(png_bytes)).convert("RGBA")
+    w, h = im.size
+
+    font = _get_font(font_size)
+    line_thickness = 2
+    padding = font_size  # gap above and below text block
+    line_spacing = font_size // 2  # extra gap between lines
+
+    # Measure multiline text height
+    tmp_draw = ImageDraw.Draw(im)
+    text_bbox = tmp_draw.multiline_textbbox((0, 0), label_text, font=font, spacing=line_spacing)
+    text_block_h = text_bbox[3] - text_bbox[1]
+
+    banner_h = line_thickness + 2 * padding + text_block_h
+
+    # New canvas: original + separator + banner
+    out = Image.new("RGBA", (w, h + banner_h), (255, 255, 255, 255))
+    out.paste(im, (0, 0))
+
+    draw = ImageDraw.Draw(out)
+
+    # Black separator line
+    draw.rectangle([(0, h), (w, h + line_thickness)], fill=(0, 0, 0, 255))
+
+    # Centered multiline text
+    text_bbox = draw.multiline_textbbox((0, 0), label_text, font=font, spacing=line_spacing)
+    text_w = text_bbox[2] - text_bbox[0]
+    text_x = (w - text_w) // 2
+    text_y = h + line_thickness + padding
+    draw.multiline_text((text_x, text_y), label_text, fill=(0, 0, 0, 255), font=font, align="center", spacing=line_spacing)
+
+    buf = BytesIO()
+    out.save(buf, format="PNG")
+    return buf.getvalue()
+
+
+def pngs_to_pdf(png_bytes_list: list[bytes]) -> bytes:
+    """
+    Combine multiple PNG images into a single PDF (one image per page).
+    Uses Pillow's built-in PDF support — no extra dependencies.
+    """
+    images: list[Image.Image] = []
+    for png_bytes in png_bytes_list:
+        im = Image.open(BytesIO(png_bytes)).convert("RGB")
+        images.append(im)
+
+    buf = BytesIO()
+    if len(images) == 1:
+        images[0].save(buf, format="PDF")
+    else:
+        images[0].save(buf, format="PDF", save_all=True, append_images=images[1:])
+    return buf.getvalue()
diff --git a/shots/labels.py b/shots/labels.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import re
+from urllib.parse import urlparse
+
+# UUID path segment: a leading "/" followed by 8-4-4-4-12 hex chars (case-insensitive)
+_UUID_RE = re.compile(r"/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE)
+# Numeric ID segments: /123/ or trailing /123
+_NUMERIC_RE = re.compile(r"/\d+(?=/|$)")
+
+
+def desensitize_url(url: str, base_url: str = "") -> str:
+    """
+    Strip origin, replace UUIDs and numeric IDs with placeholders.
+    e.g. http://localhost:4210/brain/accounts/user/edfb2590-.../change/
+      -> /brain/accounts/user/{id}/change/
+    """
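+    # Strip origin: the base_url branch keeps everything after the prefix (incl. any query/fragment); the urlparse branch keeps only the path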
+    if base_url and url.startswith(base_url):
+        path = url[len(base_url):]
+    else:
+        parsed = urlparse(url)
+        path = parsed.path
+
+    if not path:
+        path = "/"
+
+    # Replace UUIDs first (more specific), then numeric IDs
+    path = _UUID_RE.sub("/{id}", path)
+    path = _NUMERIC_RE.sub("/{id}", path)
+
+    return path
+
+
+class _SafeDict(dict):
+    """Dict that returns {key} for missing keys instead of raising KeyError."""
+
+    def __missing__(self, key: str) -> str:
+        return "{" + key + "}"
+
+
+def render_label(template: str, variables: dict[str, str]) -> str:
+    """
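+    Render a label template with variables. Unknown plain {tags} are left as-is (a format spec/conversion is applied to the placeholder text; attribute or index lookups on it fail).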
+
+    Available variables: url, id, title
+    """
+    return template.format_map(_SafeDict(variables))
diff --git a/shots/llm.py b/shots/llm.py
diff --git a/shots/runner.py b/shots/runner.py
diff --git a/shots/stability.py b/shots/stability.py
diff --git a/uv.lock b/uv.lock