
Commit 15709fc
Parent: a222c24

feat: persona-aware evals; export CSV/MD; provider-selectable model listing; critic LM + overlap-penalized rerank
File tree: 4 files changed (+244, -44 lines)

orbit_agent/advisor.py
Lines changed: 73 additions & 10 deletions

@@ -268,6 +268,7 @@ def forward(

         from .tools.retention import calculate_cohort_retention
         from .tools.funnel import analyze_funnel, FunnelStep
+        from .tools.finance import runway_months, expected_value

         # Extract inline JSON-ish snippet if present
         m = re.search(r"(\[.*\]|\{.*\})", history_str, re.DOTALL)
@@ -289,6 +290,32 @@ def forward(
                     res = analyze_funnel(steps)
                     rates = ", ".join(f"{r:.1f}%" for r in res.conversion_rates)
                     tool_results = f"Funnel steps={len(steps)}; Conversion: {rates}"
+                elif isinstance(data, dict) and {"cash", "burn"} <= set(
+                    data.keys()
+                ):
+                    rr = runway_months(
+                        float(data["cash"]),
+                        float(data["burn"]),
+                        float(data.get("growth", 0)),
+                    )
+                    tool_results = f"Runway≈{rr.months:.1f}m; Alive={'Yes' if rr.default_alive else 'No'}"
+                elif isinstance(data, dict) and {
+                    "p_upside",
+                    "ev_upside",
+                    "p_mid",
+                    "ev_mid",
+                    "p_down",
+                    "ev_down",
+                } <= set(data.keys()):
+                    ev = expected_value(
+                        float(data["p_upside"]),
+                        float(data["ev_upside"]),
+                        float(data["p_mid"]),
+                        float(data["ev_mid"]),
+                        float(data["p_down"]),
+                        float(data["ev_down"]),
+                    )
+                    tool_results = f"EV≈${ev:,.0f} across scenarios"
             except Exception:
                 tool_results = tool_results or ""

@@ -297,8 +324,39 @@ def forward(

         cfg = get_config()
         best_of_n = max(1, int(getattr(cfg, "best_of_n", 1)))
+        overlap_alpha = float(getattr(cfg, "overlap_alpha", 2.0) or 2.0)
         best_payload = None
-        best_score = -1
+        best_score = -1e9
+
+        # Optional separate critic LM
+        critic_lm = None
+        try:
+            import dspy as _dspy
+
+            if getattr(cfg, "critic_model", None):
+                critic_lm = _dspy.LM(model=cfg.critic_model)
+        except Exception:
+            critic_lm = None
+
+        def _ngram_set(text: str, n: int = 3) -> set[str]:
+            tokens = [t for t in text.lower().split() if t]
+            return set(
+                [
+                    " ".join(tokens[i : i + n])
+                    for i in range(0, max(0, len(tokens) - n + 1))
+                ]
+            )
+
+        def _overlap_ratio(a: str, b: str, n: int = 3) -> float:
+            if not a or not b:
+                return 0.0
+            A = _ngram_set(a, n)
+            B = _ngram_set(b, n)
+            if not A or not B:
+                return 0.0
+            inter = len(A & B)
+            union = len(A | B)
+            return inter / union

         for _ in range(best_of_n):
             logger.info("Generating advice with LLM")
@@ -310,24 +368,29 @@ def forward(
                 tool_results=tool_results or "No tools used in this session",
             )

-            # Critique with retry
+            # Critique with retry (optionally on separate critic LM)
             logger.info("Getting critique")
-            critique = self._call_llm_with_retry(
-                self.critic,
-                advice=self._clean_output(draft.advice),
-                context=context_with_history,
-            )
+            advice_clean = self._clean_output(draft.advice)
+            kwargs = dict(advice=advice_clean, context=context_with_history)
+            if critic_lm is not None:
+                critique = self._call_llm_with_retry(
+                    self.critic, **kwargs, lm=critic_lm
+                )
+            else:
+                critique = self._call_llm_with_retry(self.critic, **kwargs)

-            score = int(getattr(critique, "score", 0) or 0)
+            score_raw = int(getattr(critique, "score", 0) or 0)
+            overlap = _overlap_ratio(advice_clean, playbook)
+            score = score_raw - overlap_alpha * overlap
             if score > best_score:
                 best_score = score
                 best_payload = (
-                    self._clean_output(draft.advice),
+                    advice_clean,
                     self._clean_output(draft.actions_48h),
                     self._clean_output(draft.metric_to_watch),
                     self._clean_output(draft.risks),
                     critique.feedback,
-                    score,
+                    score_raw,
                 )

         advice, actions_48h, metric_to_watch, risks, feedback, score = best_payload
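
Note on the rerank above: each draft is scored as the critic score minus overlap_alpha times the trigram-Jaccard overlap between the draft and the playbook, so candidates that parrot the playbook get pushed down. A minimal, self-contained sketch of that rule on toy data (the texts and the 0-10 critic scale here are made up for illustration; the real critic scale is not shown in this diff):

# Toy rerank illustration: penalized = raw_score - alpha * Jaccard(trigram sets).
def ngram_set(text: str, n: int = 3) -> set[str]:
    tokens = [t for t in text.lower().split() if t]
    return {" ".join(tokens[i : i + n]) for i in range(max(0, len(tokens) - n + 1))}


def overlap_ratio(a: str, b: str, n: int = 3) -> float:
    A, B = ngram_set(a, n), ngram_set(b, n)
    return len(A & B) / len(A | B) if A and B else 0.0


playbook = "cut burn to extend runway and talk to ten customers every week"
candidates = [
    # (label, draft advice, raw critic score on a made-up 0-10 scale)
    ("parrots playbook", "cut burn to extend runway and talk to ten customers every week", 9),
    ("specific advice", "pause the rebrand for two weeks and close the three stalled enterprise deals", 8),
]
overlap_alpha = 2.0  # same default as the new ORBIT_OVERLAP_ALPHA

for label, advice, raw in candidates:
    ov = overlap_ratio(advice, playbook)
    print(f"{label}: raw={raw}, overlap={ov:.2f}, penalized={raw - overlap_alpha * ov:.2f}")
# parrots playbook -> penalized 7.00; specific advice -> penalized 8.00, so it wins the rerank.

In the diff itself the penalized value only drives best-of-N selection; the payload stores score_raw, so the score surfaced to the caller stays on the critic's own scale.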

orbit_agent/cli.py
Lines changed: 87 additions & 33 deletions

@@ -22,6 +22,9 @@
     grade_with_rubric,
     summarize_grades,
     save_grades,
+    load_eval_records,
+    export_summary_csv,
+    export_summary_md,
 )
 import os
 import subprocess
@@ -606,42 +609,93 @@ def eval_grade(
         raise typer.Exit(1)


-@models_app.command("list")
-def models_list():
-    """List models from the active provider (OpenAI only for now)."""
+@eval_app.command("summary")
+def eval_summary(
+    input_path: str = typer.Option(
+        ".orbit/evals/latest.jsonl", help="Path to JSONL results"
+    ),
+    csv_out: str = typer.Option(None, help="Write scenario summary to CSV here"),
+    md_out: str = typer.Option(None, help="Write scenario summary to Markdown here"),
+):
+    """Export per-scenario summaries to CSV/Markdown."""
     try:
-        import os
-        from openai import OpenAI
+        recs = load_eval_records(input_path)
+        if not recs:
+            console.print("[yellow]No records to summarize[/yellow]")
+            raise typer.Exit(0)
+        if csv_out:
+            export_summary_csv(recs, csv_out)
+            console.print(f"[green]CSV written:[/green] {csv_out}")
+        if md_out:
+            export_summary_md(recs, md_out)
+            console.print(f"[green]Markdown written:[/green] {md_out}")
+        if not csv_out and not md_out:
+            console.print(
+                "[yellow]No output paths specified (use --csv-out or --md-out)[/yellow]"
+            )
+    except Exception as e:
+        logger.error(f"Eval summary failed: {e}")
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise typer.Exit(1)

-        key = os.getenv("OPENAI_API_KEY")
-        if not key:
-            console.print("[red]OPENAI_API_KEY not set[/red]")
-            raise typer.Exit(1)

-        client = OpenAI(api_key=key)
-        resp = client.models.list()
-        ids = [m.id for m in resp.data]
-        # Prefer relevant chat-capable frontier models
-        preferred = [
-            i
-            for i in ids
-            if any(
-                i.startswith(p)
-                for p in (
-                    "gpt-5",
-                    "gpt-4.1",
-                    "gpt-4o",
-                    "o3",
-                )
-            )
-        ]
-        console.print("[bold]Candidate Models[/]:")
-        for mid in sorted(preferred):
-            console.print(f"- {mid}")
-        others = [i for i in ids if i not in preferred]
-        console.print("\n[dim]Other models (truncated)[/dim]")
-        for mid in sorted(others)[:20]:
-            console.print(f"- {mid}")
+@models_app.command("list")
+def models_list(
+    provider: str = typer.Option("openai", help="Provider: openai|anthropic")
+):
+    """List models from a provider (token-free)."""
+    try:
+        provider = provider.lower()
+        if provider == "openai":
+            import os
+            from openai import OpenAI
+
+            key = os.getenv("OPENAI_API_KEY")
+            if not key:
+                console.print("[red]OPENAI_API_KEY not set[/red]")
+                raise typer.Exit(1)
+            client = OpenAI(api_key=key)
+            resp = client.models.list()
+            ids = [m.id for m in resp.data]
+            preferred = [
+                i
+                for i in ids
+                if any(i.startswith(p) for p in ("gpt-5", "gpt-4.1", "gpt-4o", "o3"))
+            ]
+            console.print("[bold]OpenAI Candidate Models[/]:")
+            for mid in sorted(preferred):
+                console.print(f"- {mid}")
+            others = [i for i in ids if i not in preferred]
+            console.print("\n[dim]Other models (truncated)[/dim]")
+            for mid in sorted(others)[:20]:
+                console.print(f"- {mid}")
+        elif provider == "anthropic":
+            try:
+                import os
+                import anthropic
+
+                key = os.getenv("ANTHROPIC_API_KEY")
+                if not key:
+                    console.print("[red]ANTHROPIC_API_KEY not set[/red]")
+                    raise typer.Exit(1)
+                client = anthropic.Anthropic(api_key=key)
+                # Anthropic SDK provides a fixed set; list known public IDs if API lacks listing.
+                known = [
+                    "claude-3-5-sonnet-20241022",
+                    "claude-3-5-haiku-20241022",
+                    "claude-3-opus-20240229",
+                    "claude-3-sonnet-20240229",
+                    "claude-3-haiku-20240307",
+                ]
+                console.print("[bold]Anthropic Models (known set)[/]:")
+                for mid in known:
+                    console.print(f"- {mid}")
+            except Exception as e:
+                console.print(f"[red]Anthropic listing not available:[/red] {e}")
+                raise typer.Exit(1)
+        else:
+            console.print("[red]Unsupported provider[/red]")
+            raise typer.Exit(1)
     except Exception as e:
         logger.error(f"Model listing failed: {e}")
         console.print(f"[bold red]Error:[/bold red] {e}")

orbit_agent/config.py
Lines changed: 4 additions & 0 deletions

@@ -77,6 +77,8 @@ class AppConfig:
     # Generation quality controls
     best_of_n: int = 1
     temperature: float = 0.7
+    critic_model: Optional[str] = None
+    overlap_alpha: float = 2.0

     def __post_init__(self):
         """Validate configuration after initialization"""
@@ -184,6 +186,8 @@ def load_config() -> AppConfig:
         cost_per_1k_completion=float(os.getenv("ORBIT_COST_PER_1K_COMPLETION", "0")),
         best_of_n=int(os.getenv("ORBIT_BEST_OF_N", "1")),
         temperature=float(os.getenv("ORBIT_TEMPERATURE", "0.7")),
+        critic_model=os.getenv("ORBIT_CRITIC_LM"),
+        overlap_alpha=float(os.getenv("ORBIT_OVERLAP_ALPHA", "2.0")),
     )

     return config
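
The two new fields are driven entirely by environment variables, so a separate critic model and a softer overlap penalty need no code changes. A minimal sketch, assuming the package imports as orbit_agent and the remaining settings resolve from their defaults; the model id and alpha value below are illustrative, not recommendations:

import os

# ORBIT_CRITIC_LM and ORBIT_OVERLAP_ALPHA are the env vars read in load_config() above.
os.environ["ORBIT_CRITIC_LM"] = "openai/gpt-4o-mini"  # illustrative critic model id
os.environ["ORBIT_OVERLAP_ALPHA"] = "1.0"             # halve the default penalty weight

from orbit_agent.config import load_config

cfg = load_config()
print(cfg.critic_model, cfg.overlap_alpha)  # -> openai/gpt-4o-mini 1.0

With ORBIT_CRITIC_LM unset, advisor.py falls back to critiquing on the main LM, and overlap_alpha keeps its 2.0 default.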

orbit_agent/evals.py
Lines changed: 80 additions & 1 deletion

@@ -116,8 +116,18 @@ def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
         playbook = Path(sc.playbook_path).read_text()

         history = [{"role": "user", "content": sc.prompt}]
+        persona_ctx = []
+        if sc.persona:
+            persona_ctx.append(f"Persona: {sc.persona}")
+        if sc.stage:
+            persona_ctx.append(f"Stage: {sc.stage}")
+        if sc.rubric:
+            persona_ctx.append(
+                "Success Rubric:" + "\n" + "\n".join(f"- {r}" for r in sc.rubric)
+            )
+        ctx = "\n".join(persona_ctx) if persona_ctx else None
         start = time.time()
-        res = advisor(history=history, playbook=playbook)
+        res = advisor(history=history, playbook=playbook, context=ctx)
         latency_ms = (time.time() - start) * 1000.0

         actions_lines = _split_lines(res.actions_48h)
@@ -250,3 +260,72 @@ def save_grades(graded: List[Dict[str, Any]], out_path: str | Path) -> None:
     with p.open("w") as f:
         for g in graded:
             f.write(json.dumps(g) + "\n")
+
+
+def load_eval_records(path: str | Path) -> List[EvalRecord]:
+    p = Path(path)
+    recs: List[EvalRecord] = []
+    if not p.exists():
+        return recs
+    for line in p.read_text().splitlines():
+        if not line.strip():
+            continue
+        recs.append(EvalRecord(**json.loads(line)))
+    return recs
+
+
+def summarize_by_scenario(records: List[EvalRecord]) -> List[Dict[str, Any]]:
+    from collections import defaultdict
+
+    groups = defaultdict(list)
+    for r in records:
+        groups[r.scenario_id].append(r)
+    rows = []
+    for sid, items in groups.items():
+        n = len(items)
+        avg_score = sum(i.critic_score for i in items) / n
+        avg_overlap = sum((i.overlap_ratio or 0.0) for i in items) / n
+        avg_latency = sum(i.latency_ms for i in items) / n
+        rows.append(
+            {
+                "scenario_id": sid,
+                "count": n,
+                "avg_critic_score": avg_score,
+                "avg_overlap": avg_overlap,
+                "avg_latency_ms": avg_latency,
+            }
+        )
+    return rows
+
+
+def export_summary_csv(records: List[EvalRecord], out_path: str | Path) -> None:
+    import csv
+
+    rows = summarize_by_scenario(records)
+    with open(out_path, "w", newline="") as f:
+        writer = csv.DictWriter(
+            f,
+            fieldnames=[
+                "scenario_id",
+                "count",
+                "avg_critic_score",
+                "avg_overlap",
+                "avg_latency_ms",
+            ],
+        )
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+
+
+def export_summary_md(records: List[EvalRecord], out_path: str | Path) -> None:
+    rows = summarize_by_scenario(records)
+    lines = [
+        "| Scenario | Count | Avg Score | Overlap | Latency (ms) |",
+        "|---|---:|---:|---:|---:|",
+    ]
+    for r in rows:
+        lines.append(
+            f"| {r['scenario_id']} | {r['count']} | {r['avg_critic_score']:.2f} | {r['avg_overlap']:.2f} | {r['avg_latency_ms']:.0f} |"
+        )
+    Path(out_path).write_text("\n".join(lines) + "\n")
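
The new helpers also compose outside the CLI, which is handy in notebooks or CI steps. A short sketch, assuming results were written to the default path used by the eval summary command; the output filenames are illustrative:

from orbit_agent.evals import (
    export_summary_csv,
    export_summary_md,
    load_eval_records,
    summarize_by_scenario,
)

records = load_eval_records(".orbit/evals/latest.jsonl")  # default path from cli.py
for row in summarize_by_scenario(records):
    print(row["scenario_id"], f"avg score {row['avg_critic_score']:.2f}", f"{row['avg_latency_ms']:.0f} ms")

export_summary_csv(records, "eval_summary.csv")
export_summary_md(records, "eval_summary.md")

A missing results file yields an empty list, and the CLI path above prints a warning and exits cleanly instead of failing.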
