Commit 33c16b4

feat(evals): personas + rubric support, grading stub; CLI eval run/report; model listing; advisor prompt anti-templating
1 parent 17c2394 commit 33c16b4

4 files changed: +157, -3 lines


evals/scenarios_personas.yaml

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+scenarios:
+  - id: seed_ceo_focus
+    persona: "Founder-CEO, 2-person team, zero revenue"
+    stage: "Pre-seed"
+    prompt: "We have 10 interviews suggesting pain but no commitments. How do we get 3 paid design partners in 14 days?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Actionability: concrete steps with owners and deadlines"
+      - "Specific outreach: named segments or job titles, with scripts"
+      - "Measurement: leading indicator within 1-2 weeks"
+      - "Courageous tradeoffs: deprioritizes comfort features for decisive tests"
+
+  - id: series_a_ceo_retention
+    persona: "Founder-CEO, 15 FTE, $100k MRR"
+    stage: "Early growth"
+    prompt: "Logo churn ticked to 3% last month. Which cohorts should we diagnose first and what actions in the next 48h?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Cohort thinking: segment by ICP/use-case, not averages"
+      - "Hypothesis-driven: proposes concrete causes and tests"
+      - "Cross-functional: product + CS + pricing interplay"
+      - "Immediate actions: 48h calendar-blocked tasks"
+
+  - id: consumer_app_activation
+    persona: "PM, Consumer app, 20k WAU"
+    stage: "Product-market search"
+    prompt: "Activation from signup to first value is 17%. Propose a 1-week plan to hit 25% and what to cut to get there."
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Activation metric clarity: precise step definition and baseline"
+      - "Small-bets: 3-5 experiments with measurable deltas"
+      - "Qual + quant: combines funnel data with user calls"
+      - "Cut list: explicit deprioritizations for bandwidth"
+
+  - id: enterprise_pipeline
+    persona: "Founder, Enterprise B2B, 6-month cycles"
+    stage: "Pilot to contract"
+    prompt: "We have 4 late-stage deals stuck on security review. What should I do this week to force a decision?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Deal mechanics: identifies blockers and stakeholders"
+      - "Mutual close plan: steps, owners, dates"
+      - "Value re-anchoring: ROI or risk framing"
+      - "Escalation: exec alignment or withdraw strategy"
+
+  - id: pricing_reframe
+    persona: "Go-to-market lead"
+    stage: "Post-MVP"
+    prompt: "Our per-seat plan stalls at 30 seats. How to reframe pricing to expand accounts in 2 weeks?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Wedge expansion: module/usage or value metrics"
+      - "Pilot offer: specific terms and success criteria"
+      - "Objection-handling: equips AE with scripts"
+      - "Analysis: before/after unit economics"

orbit_agent/advisor.py

Lines changed: 5 additions & 2 deletions
@@ -51,10 +51,13 @@ class HighOrbitAdvice(dspy.Signature):
     """You are a brutally honest startup advisor in the YC tradition.

     Give specific, actionable advice that optimizes for $1B vs $0 outcomes.
-    Be concrete and direct. Avoid generic platitudes.
+    Be concrete, direct, and tailored to the user's context (persona, stage,
+    constraints). Treat the playbook as heuristics, not as text to repeat.
+    Do not regurgitate playbook lines; instead, synthesize novel guidance and
+    tie it to the user's scenario with clear reasoning.

     Output format requirements:
-    - Advice: 2-3 paragraphs of specific, actionable guidance
+    - Advice: 2-3 paragraphs of specific, actionable guidance (no bullets)
     - Actions: List exactly 3-5 specific tasks, one per line, no formatting
     - Metric: One clear metric to track progress
     - Risks: List exactly 3 risks, one per line, no formatting
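
Side note: a dspy.Signature docstring like this becomes the instruction prompt when the signature is run through a predictor. A minimal sketch of the wiring; the field names used here (context, question, advice) are assumptions, since the signature's fields sit outside this hunk:

    import dspy

    from orbit_agent.advisor import HighOrbitAdvice

    # Hypothetical wiring: HighOrbitAdvice's real input/output field names are
    # not shown in this diff, so context/question/advice are placeholders.
    dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # any configured LM works
    advise = dspy.Predict(HighOrbitAdvice)
    result = advise(
        context=open("playbooks/high_orbit.yaml").read(),
        question="Activation from signup to first value is 17%. How do we hit 25%?",
    )
    print(result.advice)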

orbit_agent/cli.py

Lines changed: 44 additions & 0 deletions
@@ -32,6 +32,8 @@
 eval_app = typer.Typer(help="Run evals and generate reports")
 app.add_typer(ctx_app, name="context")
 app.add_typer(eval_app, name="eval")
+models_app = typer.Typer(help="List available models for the active provider")
+app.add_typer(models_app, name="models")


 @app.callback()
@@ -548,5 +550,47 @@ def eval_report(
         raise typer.Exit(1)


+@models_app.command("list")
+def models_list():
+    """List models from the active provider (OpenAI only for now)."""
+    try:
+        import os
+        from openai import OpenAI
+
+        key = os.getenv("OPENAI_API_KEY")
+        if not key:
+            console.print("[red]OPENAI_API_KEY not set[/red]")
+            raise typer.Exit(1)
+
+        client = OpenAI(api_key=key)
+        resp = client.models.list()
+        ids = [m.id for m in resp.data]
+        # Prefer relevant chat-capable frontier models
+        preferred = [
+            i
+            for i in ids
+            if any(
+                i.startswith(p)
+                for p in (
+                    "gpt-5",
+                    "gpt-4.1",
+                    "gpt-4o",
+                    "o3",
+                )
+            )
+        ]
+        console.print("[bold]Candidate Models[/]:")
+        for mid in sorted(preferred):
+            console.print(f"- {mid}")
+        others = [i for i in ids if i not in preferred]
+        console.print("\n[dim]Other models (truncated)[/dim]")
+        for mid in sorted(others)[:20]:
+            console.print(f"- {mid}")
+    except Exception as e:
+        logger.error(f"Model listing failed: {e}")
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise typer.Exit(1)


 if __name__ == "__main__":
     app()
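
With OPENAI_API_KEY set, the new subcommand runs as `python -m orbit_agent.cli models list` (module invocation; this diff does not show a console-script name). The prefix filter itself can be exercised standalone; a small sketch with made-up model ids:

    # Standalone sketch of the prefix filter used by models_list; the ids are
    # illustrative, not a real API response.
    ids = ["gpt-4o-mini", "gpt-4.1", "o3-mini", "text-embedding-3-small", "whisper-1"]
    prefixes = ("gpt-5", "gpt-4.1", "gpt-4o", "o3")
    preferred = [i for i in ids if any(i.startswith(p) for p in prefixes)]
    print(sorted(preferred))  # ['gpt-4.1', 'gpt-4o-mini', 'o3-mini']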

orbit_agent/evals.py

Lines changed: 53 additions & 1 deletion
@@ -17,6 +17,9 @@ class Scenario:
     id: str
     prompt: str
     playbook_path: str | None = None
+    persona: str | None = None
+    stage: str | None = None
+    rubric: List[str] | None = None


 @dataclass
@@ -46,6 +49,9 @@ def load_scenarios(path: str | Path) -> List[Scenario]:
                 id=item.get("id") or f"s{i+1}",
                 prompt=item["prompt"],
                 playbook_path=item.get("playbook"),
+                persona=item.get("persona"),
+                stage=item.get("stage"),
+                rubric=item.get("rubric"),
             )
         )
     return scenarios
@@ -126,10 +132,56 @@ def summarize_results(records: List[EvalRecord]) -> Dict[str, Any]:
     }


+# Optional: experimental grading with an explicit rubric using LLM
+try:
+    import dspy
+
+    class RubricGrade(dspy.Signature):
+        """You are grading a startup advice response against a rubric.
+        Return a JSON object strictly with fields: overall (0-10), feedback, criteria.
+        'criteria' maps each rubric item to pass/fail and a brief note.
+        """
+
+        persona: str = dspy.InputField()
+        stage: str = dspy.InputField()
+        prompt: str = dspy.InputField()
+        rubric: str = dspy.InputField(desc="Bulleted list of rubric items")
+        advice: str = dspy.InputField()
+
+        grade_json: str = dspy.OutputField()
+
+    _rubric_grader = dspy.Predict(RubricGrade)
+
+    def grade_with_rubric(
+        records: List[EvalRecord], scenarios: List[Scenario]
+    ) -> List[Dict[str, Any]]:
+        id_to_scn = {s.id: s for s in scenarios}
+        graded: List[Dict[str, Any]] = []
+        for r in records:
+            sc = id_to_scn.get(r.scenario_id)
+            if not sc or not sc.rubric:
+                continue
+            rubric_text = "\n".join(f"- {item}" for item in sc.rubric)
+            out = _rubric_grader(
+                persona=sc.persona or "",
+                stage=sc.stage or "",
+                prompt=sc.prompt,
+                rubric=rubric_text,
+                advice=r.advice,
+            )
+            graded.append({
+                "scenario_id": r.scenario_id,
+                "grade_json": out.grade_json,
+            })
+        return graded
+except Exception:  # pragma: no cover
+    def grade_with_rubric(records: List[EvalRecord], scenarios: List[Scenario]):
+        return []


 def save_eval_results(records: List[EvalRecord], out_path: str | Path) -> None:
     p = Path(out_path)
     p.parent.mkdir(parents=True, exist_ok=True)
     with p.open("w") as f:
         for r in records:
             f.write(json.dumps(asdict(r)) + "\n")
-
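
End-to-end, grading a finished run looks roughly like the sketch below. It assumes a dspy LM is already configured, and EvalRecord's full field list is not shown in this diff, so the hand-made record may need extra required fields. grade_json is raw model output, so the json.loads call can fail and likely needs guarding in practice:

    import json

    from orbit_agent.evals import EvalRecord, grade_with_rubric, load_scenarios

    scenarios = load_scenarios("evals/scenarios_personas.yaml")
    # One hand-made record for illustration; real runs produce these via the
    # eval CLI.
    records = [EvalRecord(scenario_id="seed_ceo_focus", advice="...advice text...")]
    for g in grade_with_rubric(records, scenarios):
        grade = json.loads(g["grade_json"])  # raw LLM output; may not parse cleanly
        print(g["scenario_id"], grade.get("overall"), grade.get("feedback"))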
