fix(rfspec): persist results and support fire-and-forget polling

TheFactoriousDROID · TheFactoriousDROID · commit 9261d6789a0f · 2026-03-09T20:37:41.000-07:00
The run.sh script spawns three droid exec calls that take several minutes,
but the Execute tool times out at 60s. When that happens the temp dir
self-destructs and results are lost.

Changes:
- Write model outputs to persistent ~/.factory/rfspec/runs/<id>/ instead of
  a temp dir
- Print RFSPEC_RUN_DIR path immediately so the agent captures it before timeout
- Write a done sentinel (STATUS=complete|failed) for polling
- Update SKILL.md (v1.3.0) with fire-and-forget + poll workflow instructions
diff --git a/plugins/rfspec/commands/rfspec b/plugins/rfspec/commands/rfspec
@@ -1,2 +1,45 @@
 #!/usr/bin/env bash
-exec "$(dirname "$0")/../skills/rfspec/scripts/run.sh" "$@"
+# Launch rfspec in background and return polling instructions immediately.
+# This avoids the Execute tool timeout killing the long-running model calls.
+
+SCRIPT_DIR="$(dirname "$0")"
+RUN_SH="${SCRIPT_DIR}/../skills/rfspec/scripts/run.sh"
+
+# With no arguments, run.sh is executed in the foreground in place of this script.
+if [ $# -eq 0 ]; then
+  exec "$RUN_SH"
+fi
+
+# Run the script in background, capturing output to its own log.
+# Bail out if the log file cannot be created; otherwise the redirect below
+# would target an empty path and the run would be launched with no log.
+BGLOG=$(mktemp /tmp/rfspec-bg-XXXXXXXX) || {
+  echo "rfspec: could not create background log file" >&2
+  exit 1
+}
+nohup "$RUN_SH" "$@" >"$BGLOG" 2>&1 &
+BG_PID=$!
+
+# run.sh prints RFSPEC_RUN_DIR=<path> as its first line. Poll the log for it
+# instead of sleeping a fixed interval: a slow start no longer loses the path,
+# and a fast start returns sooner. The wait is bounded (about 10 seconds) so
+# control still returns to the agent well inside the Execute timeout.
+RUN_DIR=""
+for attempt in 1 2 3 4 5 6 7 8 9 10; do
+  RUN_DIR=$(grep -m1 'RFSPEC_RUN_DIR=' "$BGLOG" 2>/dev/null | cut -d= -f2-)
+  if [ -n "$RUN_DIR" ]; then
+    break
+  fi
+  if ! kill -0 "$BG_PID" 2>/dev/null; then
+    # The background process already exited; look once more in case it
+    # printed the path between the check above and its exit, then stop.
+    RUN_DIR=$(grep -m1 'RFSPEC_RUN_DIR=' "$BGLOG" 2>/dev/null | cut -d= -f2-)
+    break
+  fi
+  sleep 1
+done
+
+echo "User prompt: $*"
+echo ""
+
+if [ -z "$RUN_DIR" ]; then
+  echo "rfspec launched (PID ${BG_PID}), but run dir not yet available."
+  echo "Check log: ${BGLOG}"
+else
+  echo "RFSPEC_RUN_DIR=${RUN_DIR}"
+fi
+
+echo ""
+echo "rfspec is running in background (PID ${BG_PID})."
+echo "Background log: ${BGLOG}"
+echo ""
+echo "Three models (Opus, GPT-5.4, Gemini) are generating competing specs."
+echo "Tell the user results will be ready in a few minutes, then poll:"
+echo ""
+echo "  cat ${RUN_DIR:-<run_dir>}/done 2>/dev/null || echo PENDING"
+echo ""
+echo "When done, read results:"
+echo "  Read: ${RUN_DIR:-<run_dir>}/results.md"
diff --git a/plugins/rfspec/hooks/hooks.json b/plugins/rfspec/hooks/hooks.json
@@ -0,0 +1,18 @@
+{
+  "description": "Blocks rfspec result access until the rfspec skill is loaded",
+  "hooks": {
+    "PreToolUse": [
+      {
+        "matcher": "Execute",
+        "commandRegex": "rfspec/runs/",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "python3 ${DROID_PLUGIN_ROOT}/hooks/rfspec-ready.py",
+            "timeout": 5
+          }
+        ]
+      }
+    ]
+  }
+}
diff --git a/plugins/rfspec/hooks/rfspec-ready.py b/plugins/rfspec/hooks/rfspec-ready.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""PreToolUse hook -- blocks rfspec polling commands until skill is loaded.
+
+Matches Execute commands containing rfspec/runs/. Allows if results are not
+ready yet or the rfspec skill marker exists. Blocks if results are ready but
+the skill hasn't been loaded, forcing the agent to invoke Skill: rfspec first.
+"""
+
+import json
+import logging
+import re
+import sys
+from pathlib import Path
+
+USER_FACTORY = Path.home() / ".factory"  # per-user Factory state root
+MARKER_DIR = USER_FACTORY / ".skill-markers"  # holds "<session_id>-rfspec" marker files
+RFSPEC_RUNS = USER_FACTORY / "rfspec" / "runs"  # parent of the per-run directories
+LOG_FILE = USER_FACTORY / "logs" / "hooks.log"  # file the hook logs to
+
+RUN_DIR_RE = re.compile(r"rfspec/runs/([\w-]+)")  # group 1: run id (word chars and hyphens)
+
+
+def setup_logging():
+    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
+    logging.basicConfig(
+        filename=LOG_FILE,
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    return logging.getLogger("rfspec-ready")
+
+
+def find_run_dir(command: str) -> Path | None:
+    m = RUN_DIR_RE.search(command)
+    if m:
+        return RFSPEC_RUNS / m.group(1)
+    return None
+
+
+def skill_loaded(session_id: str) -> bool:
+    if not session_id:
+        return False
+    return (MARKER_DIR / f"{session_id}-rfspec").exists()
+
+
+def main():
+    log = setup_logging()
+
+    try:
+        data = json.load(sys.stdin)
+    except json.JSONDecodeError:
+        sys.exit(0)
+
+    command = data.get("tool_input", {}).get("command", "")
+    session_id = data.get("session_id", "")
+
+    run_dir = find_run_dir(command)
+    if not run_dir:
+        sys.exit(0)
+
+    done_file = run_dir / "done"
+    if not done_file.exists():
+        log.info(f"[PreToolUse] rfspec run {run_dir.name} not done yet, allowing poll")
+        sys.exit(0)
+
+    if skill_loaded(session_id):
+        log.info(f"[PreToolUse] rfspec skill loaded, allowing access to {run_dir.name}")
+        sys.exit(0)
+
+    results_path = run_dir / "results.md"
+    log.info(
+        f"[PreToolUse] BLOCK rfspec results ready but skill not loaded, session={session_id[:8]}"
+    )
+
+    message = (
+        f"rfspec results are ready at {results_path}. "
+        f"You MUST invoke the rfspec skill to load the evaluation workflow before reading results. "
+        f'Call: Skill(skill="rfspec")'
+    )
+
+    print(
+        json.dumps(
+            {
+                "hookSpecificOutput": {
+                    "hookEventName": "PreToolUse",
+                    "permissionDecision": "deny",
+                    "permissionDecisionReason": message,
+                },
+            }
+        )
+    )
+    sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/plugins/rfspec/skills/rfspec/SKILL.md b/plugins/rfspec/skills/rfspec/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: rfspec
-version: 1.2.0
+version: 1.3.0
 description: |
   Multi-model spec generation and synthesis. Use when the user wants to:
   - Get competing proposals from different AI models
@@ -17,20 +17,63 @@ Fan out a prompt to multiple models, compare their responses, and help the user
 
 ## Quick Reference
 
-| Task | Action |
-|------|--------|
-| Generate competing specs | `/rfspec <prompt>` |
-| Pick one result | Select via AskUser after comparison |
-| Synthesize results | Combine strongest elements when user chooses synthesis |
-| Save final spec | Write to `specs/active/YYYY-MM-DD-<slug>.md` |
+| Task                     | Action                                                 |
+| ------------------------ | ------------------------------------------------------ |
+| Generate competing specs | `/rfspec <prompt>` (background)                        |
+| Poll for results         | Check `<run_dir>/done` sentinel                        |
+| Pick one result          | Select via AskUser after comparison                    |
+| Synthesize results       | Combine strongest elements when user chooses synthesis |
+| Save final spec          | Write to `specs/active/YYYY-MM-DD-<slug>.md`           |
 
 ## Workflow
 
-1. Run `/rfspec <user's prompt>` -- fires parallel model calls, returns labeled options (A, B, C).
-2. Evaluate the results -- see [references/evaluation-guide.md](references/evaluation-guide.md).
-3. Present the choice to the user via AskUser.
-4. Present the selected or synthesized spec via ExitSpecMode for user review.
-5. Save to `specs/active/` only after the user approves in spec mode.
+The `/rfspec` command spawns three `droid exec` calls in parallel. These take
+several minutes, far exceeding the Execute tool timeout. You MUST use the
+fire-and-forget + poll pattern.
+
+### Step 1 -- Launch (background)
+
+Run the command with `fireAndForget=true`:
+
+```
+Execute: /rfspec <user's prompt>
+  fireAndForget: true
+```
+
+The script immediately prints `RFSPEC_RUN_DIR=<path>` to its log file.
+Read the log file (path printed by Execute) to capture the run directory.
+
+### Step 2 -- Poll for completion
+
+Tell the user the models are running and you will check back. Then poll:
+
+```
+Execute: cat <run_dir>/done 2>/dev/null || echo "PENDING"
+```
+
+Poll every 30-60 seconds. The sentinel contains `STATUS=complete` or
+`STATUS=failed`. While waiting, you can do other work or keep the user
+informed of progress.
+
+### Step 3 -- Read results
+
+Once `done` exists, read the results:
+
+```
+Read: <run_dir>/results.md
+```
+
+This file contains all three model outputs as markdown sections (Option A, B, C).
+
+### Step 4 -- Evaluate and present
+
+Evaluate the results -- see [references/evaluation-guide.md](references/evaluation-guide.md).
+Present the choice to the user via AskUser.
+
+### Step 5 -- Finalize
+
+Present the selected or synthesized spec via ExitSpecMode for user review.
+Save to `specs/active/` only after the user approves in spec mode.
 
 ## Saving
 
@@ -43,6 +86,19 @@ specs/active/YYYY-MM-DD-<slug>.md
 
 Where `<slug>` is a short kebab-case name derived from the topic.
 
+## Resuming from slash command
+
+If you are loading this skill after `/rfspec` already ran (the slash command told
+you to invoke `Skill: rfspec`), you already have the run directory. Pick up from
+Step 3:
+
+1. Read `<run_dir>/results.md` to get the model outputs.
+2. Follow Step 4 (evaluate and present) and Step 5 (finalize) above.
+
+The `results.md` file includes embedded agent instructions as a fallback, but
+prefer the full workflow in this document -- it covers the evaluation guide,
+saving rules, and rejection handling that the embedded version omits.
+
 ## Pitfalls
 
 - Don't summarize each option individually -- compare them against each other.
@@ -63,29 +119,31 @@ Example 1: User wants competing specs
 User says: "Get me specs from multiple models for adding a dark mode toggle"
 Actions:
 
-1. Run `/rfspec add a dark mode toggle to the settings page with persistent user preference`
-2. Read Options A, B, C
-3. Compare: "Option A uses CSS variables with a React context, Option B uses Tailwind's dark class with localStorage, Option C uses a theme provider with system preference detection."
-4. Present choice via AskUser
-Result: User picks Option B, saved to `specs/active/2026-03-06-dark-mode-toggle.md`
+1. Execute `/rfspec add a dark mode toggle ...` with `fireAndForget=true`
+2. Read the background log to get `RFSPEC_RUN_DIR`
+3. Tell user: "Models are running, I'll check back shortly."
+4. Poll `<run_dir>/done` until `STATUS=complete`
+5. Read `<run_dir>/results.md`, compare Options A, B, C
+6. Present choice via AskUser
+   Result: User picks Option B, saved to `specs/active/2026-03-06-dark-mode-toggle.md`
 
 Example 2: User wants synthesis
 User says: "rfspec this: refactor the auth module to use JWT"
 Actions:
 
-1. Run `/rfspec refactor the auth module to use JWT`
-2. Compare results, noting Option A has better token rotation but Option C has cleaner middleware
+1. Launch background, poll for completion
+2. Read results, compare -- Option A has better token rotation, Option C has cleaner middleware
 3. User selects "Synthesize"
 4. Combine Option A's rotation logic with Option C's middleware structure
-Result: Synthesized spec saved to `specs/active/2026-03-06-auth-jwt-refactor.md`
+   Result: Synthesized spec saved to `specs/active/2026-03-06-auth-jwt-refactor.md`
 
 Example 3: All options rejected
 User says: "None of these work, they all miss the caching layer"
 Actions:
 
 1. Ask what's missing -- user explains the Redis caching requirement
 2. Offer to re-run: `/rfspec refactor auth module to use JWT with Redis session caching`
-Result: New round of specs generated with caching addressed
+   Result: New round of specs generated with caching addressed
 
 ## References
 
diff --git a/plugins/rfspec/skills/rfspec/scripts/run.sh b/plugins/rfspec/skills/rfspec/scripts/run.sh