
Commit 17c2394

feat(evals): add eval runner, CLI commands, and scenarios; add format tests
1 parent a2184e8 commit 17c2394

File tree

evals/scenarios.yaml
orbit_agent/cli.py
orbit_agent/evals.py
tests/test_evals_format.py

4 files changed: +239 -0 lines changed


evals/scenarios.yaml

Lines changed: 13 additions & 0 deletions
scenarios:
  - id: yc_dilution
    prompt: "I'm considering YC. Is the 7% dilution worth it?"
    playbook: playbooks/high_orbit.yaml
  - id: default_alive
    prompt: "We have $800k cash and $55k burn. Are we default-alive?"
    playbook: playbooks/high_orbit.yaml
  - id: focus_plan
    prompt: "Create a ruthless 48h focus plan to validate paid design partner demand for our B2B SaaS."
    playbook: playbooks/high_orbit.yaml
  - id: raise_round
    prompt: "Should I raise a round now or wait 6 months? Pros and cons?"
    playbook: playbooks/high_orbit.yaml

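For reference, the loader added in orbit_agent/evals.py (below) maps each entry's playbook key onto Scenario.playbook_path and falls back to ids s1, s2, ... when id is missing. A minimal sketch of loading this dataset programmatically, assuming it is run from the repo root so the relative path resolves:

# Sketch: load evals/scenarios.yaml with the new loader (run from the repo root).
from orbit_agent.evals import load_scenarios

scenarios = load_scenarios("evals/scenarios.yaml")
for sc in scenarios:
    # The YAML "playbook" key is exposed as Scenario.playbook_path
    print(sc.id, sc.playbook_path)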
orbit_agent/cli.py

Lines changed: 69 additions & 0 deletions
@@ -14,6 +14,12 @@
 from .tools.retention import calculate_cohort_retention
 from .tools.funnel import analyze_funnel, FunnelStep
 from .memory import load_context, save_context
+from .evals import (
+    load_scenarios,
+    run_evals,
+    summarize_results,
+    save_eval_results,
+)
 import os
 import subprocess

@@ -23,7 +29,9 @@

 app = typer.Typer(help="Orbit Agent CLI — brutally honest startup advisor")
 ctx_app = typer.Typer(help="Manage your Orbit context")
+eval_app = typer.Typer(help="Run evals and generate reports")
 app.add_typer(ctx_app, name="context")
+app.add_typer(eval_app, name="eval")


 @app.callback()

@@ -479,5 +487,66 @@ def context_edit(
         console.print(f"[bold red]Error:[/bold red] {e}")


+@eval_app.command("run")
+def eval_run(
+    dataset: str = typer.Option(
+        "evals/scenarios.yaml", help="Path to scenarios YAML"
+    ),
+    out: str = typer.Option(
+        ".orbit/evals/latest.jsonl", help="Path to save JSONL results"
+    ),
+):
+    """Run evals on scenarios and save JSONL results."""
+    try:
+        scenarios = load_scenarios(dataset)
+        if not scenarios:
+            console.print("[yellow]No scenarios found[/yellow]")
+            raise typer.Exit(1)
+
+        console.print(f"[dim]Loaded {len(scenarios)} scenarios[/dim]")
+        with console.status("[bold green]Evaluating..."):
+            records = run_evals(scenarios)
+        save_eval_results(records, out)
+        console.print(f"[green]Saved results to {out}[/green]")
+        summary = summarize_results(records)
+        console.print("\n[bold]Summary[/]:")
+        for k, v in summary.items():
+            console.print(f"- {k}: {v}")
+    except Exception as e:
+        logger.error(f"Eval run failed: {e}")
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise typer.Exit(1)
+
+
+@eval_app.command("report")
+def eval_report(
+    results_path: str = typer.Argument(
+        ..., help="Path to JSONL results from eval run"
+    ),
+):
+    """Summarize a JSONL results file from eval run."""
+    try:
+        p = Path(results_path)
+        if not p.exists():
+            console.print(f"[red]Not found:[/red] {results_path}")
+            raise typer.Exit(1)
+        records = []
+        for line in p.read_text().splitlines():
+            if not line.strip():
+                continue
+            records.append(json.loads(line))
+        from .evals import EvalRecord
+
+        rec_objs = [EvalRecord(**r) for r in records]
+        summary = summarize_results(rec_objs)
+        console.print("[bold]Summary[/]:")
+        for k, v in summary.items():
+            console.print(f"- {k}: {v}")
+    except Exception as e:
+        logger.error(f"Eval report failed: {e}")
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise typer.Exit(1)
+
+
 if __name__ == "__main__":
     app()

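A hedged usage sketch for the new subcommands, driving the Typer app in-process with typer.testing.CliRunner. It assumes the eval scenarios exist at the default path and that a language model is configured in the environment for the advisor call:

# Sketch: exercise "eval run" and "eval report" in-process (paths are the CLI defaults).
from typer.testing import CliRunner

from orbit_agent.cli import app

runner = CliRunner()
result = runner.invoke(
    app,
    ["eval", "run", "--dataset", "evals/scenarios.yaml", "--out", ".orbit/evals/latest.jsonl"],
)
print(result.output)

result = runner.invoke(app, ["eval", "report", ".orbit/evals/latest.jsonl"])
print(result.output)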
orbit_agent/evals.py

Lines changed: 135 additions & 0 deletions
from __future__ import annotations

import json
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Dict, Any, Tuple

import yaml

from .advisor import HighOrbitAdvisor
from .config import configure_lm


@dataclass
class Scenario:
    id: str
    prompt: str
    playbook_path: str | None = None


@dataclass
class EvalRecord:
    scenario_id: str
    prompt: str
    timestamp: float
    latency_ms: float
    advice: str
    actions: List[str]
    metric_to_watch: str
    risks: List[str]
    critic_score: int
    critic_feedback: str
    format_ok: bool
    actions_count: int
    risks_count: int


def load_scenarios(path: str | Path) -> List[Scenario]:
    p = Path(path)
    data = yaml.safe_load(p.read_text())
    scenarios: List[Scenario] = []
    for i, item in enumerate(data.get("scenarios", [])):
        scenarios.append(
            Scenario(
                id=item.get("id") or f"s{i+1}",
                prompt=item["prompt"],
                playbook_path=item.get("playbook"),
            )
        )
    return scenarios


def _split_lines(value: Any) -> List[str]:
    if isinstance(value, list):
        return [str(x).strip() for x in value if str(x).strip()]
    if isinstance(value, str):
        return [ln.strip() for ln in value.split("\n") if ln.strip()]
    return []


def _format_eval(advice: str, actions_lines: List[str], risks_lines: List[str]) -> Tuple[bool, int, int]:
    # Clean bullet prefixes
    a_clean = [ln.lstrip("123456789. -•*").strip() for ln in actions_lines if ln.strip()]
    r_clean = [ln.lstrip("123456789. -•*").strip() for ln in risks_lines if ln.strip()]
    actions_count = len(a_clean)
    risks_count = len(r_clean)
    # Format rules: 3-5 actions, exactly 3 risks, advice non-empty
    format_ok = (3 <= actions_count <= 5) and (risks_count == 3) and bool(advice and advice.strip())
    return format_ok, actions_count, risks_count


def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
    # Ensure LM is configured for online evals
    configure_lm()
    advisor = HighOrbitAdvisor()
    records: List[EvalRecord] = []

    for sc in scenarios:
        playbook = ""
        if sc.playbook_path and Path(sc.playbook_path).exists():
            playbook = Path(sc.playbook_path).read_text()

        history = [{"role": "user", "content": sc.prompt}]
        start = time.time()
        res = advisor(history=history, playbook=playbook)
        latency_ms = (time.time() - start) * 1000.0

        actions_lines = _split_lines(res.actions_48h)
        risks_lines = _split_lines(res.risks)
        format_ok, a_count, r_count = _format_eval(res.advice or "", actions_lines, risks_lines)

        records.append(
            EvalRecord(
                scenario_id=sc.id,
                prompt=sc.prompt,
                timestamp=time.time(),
                latency_ms=latency_ms,
                advice=res.advice or "",
                actions=[ln.lstrip("123456789. -•*").strip() for ln in actions_lines],
                metric_to_watch=res.metric_to_watch or "",
                risks=[ln.lstrip("123456789. -•*").strip() for ln in risks_lines],
                critic_score=int(getattr(res, "score", 0) or 0),
                critic_feedback=str(getattr(res, "critique", "") or ""),
                format_ok=format_ok,
                actions_count=a_count,
                risks_count=r_count,
            )
        )

    return records


def summarize_results(records: List[EvalRecord]) -> Dict[str, Any]:
    n = len(records)
    if n == 0:
        return {"count": 0}
    fmt_ok = sum(1 for r in records if r.format_ok)
    avg_score = sum(r.critic_score for r in records) / n
    avg_latency = sum(r.latency_ms for r in records) / n
    return {
        "count": n,
        "format_ok_rate": fmt_ok / n,
        "avg_critic_score": avg_score,
        "avg_latency_ms": avg_latency,
    }


def save_eval_results(records: List[EvalRecord], out_path: str | Path) -> None:
    p = Path(out_path)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open("w") as f:
        for r in records:
            f.write(json.dumps(asdict(r)) + "\n")

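Each line of the saved JSONL file is one EvalRecord serialized with dataclasses.asdict, so results can be reloaded outside the CLI for ad-hoc analysis. A minimal sketch, assuming the default output path used by eval run:

# Sketch: reload .orbit/evals/latest.jsonl into EvalRecord objects and summarize.
import json
from pathlib import Path

from orbit_agent.evals import EvalRecord, summarize_results

records = [
    EvalRecord(**json.loads(line))
    for line in Path(".orbit/evals/latest.jsonl").read_text().splitlines()
    if line.strip()
]
print(summarize_results(records))  # count, format_ok_rate, avg_critic_score, avg_latency_ms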
tests/test_evals_format.py

Lines changed: 22 additions & 0 deletions
from orbit_agent.evals import _format_eval


def test_format_eval_happy_path():
    advice = "Do X. Do Y."
    actions = ["1. Call 10 users", "2. Ship MVP", "3. Measure activation"]
    risks = ["No demand", "Wrong ICP", "Channel saturation"]
    ok, a_count, r_count = _format_eval(advice, actions, risks)
    assert ok is True
    assert a_count == 3
    assert r_count == 3


def test_format_eval_too_few_actions_or_risks():
    advice = "Something"
    actions = ["One"]
    risks = ["Risk1", "Risk2"]
    ok, a_count, r_count = _format_eval(advice, actions, risks)
    assert ok is False
    assert a_count == 1
    assert r_count == 2

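The string-input path of _split_lines (used when the advisor returns newline-delimited text rather than lists) is not covered by these tests; a hypothetical additional check, sketched against the implementation above and not part of this commit:

# Hypothetical extra test (not in this commit): _split_lines splits newline-delimited
# strings, drops blank entries, and leaves bullet prefixes for _format_eval to strip.
from orbit_agent.evals import _split_lines


def test_split_lines_string_and_list_input():
    assert _split_lines("1. Call users\n\n2. Ship MVP\n") == ["1. Call users", "2. Ship MVP"]
    assert _split_lines(["  a  ", ""]) == ["a"]
    assert _split_lines(None) == []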
0 commit comments
