Skip to content

Commit a222c24

Browse files
committed
feat(evals): add rubric grade CLI and summary; tool-aware analysis; formatting fixes
1 parent 9db18fe commit a222c24

File tree

3 files changed

+105
-2
lines changed

3 files changed

+105
-2
lines changed

orbit_agent/advisor.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,37 @@ def forward(
261261

262262
context_with_history = context + recent_context
263263

264+
# Attempt tool-augmented analysis for structured snippets
265+
tool_results = ""
266+
try:
267+
import re
268+
269+
from .tools.retention import calculate_cohort_retention
270+
from .tools.funnel import analyze_funnel, FunnelStep
271+
272+
# Extract inline JSON-ish snippet if present
273+
m = re.search(r"(\[.*\]|\{.*\})", history_str, re.DOTALL)
274+
if m:
275+
snippet = m.group(1)
276+
import json
277+
278+
data = json.loads(snippet)
279+
if isinstance(data, list) and data and isinstance(data[0], list):
280+
# Retention cohorts
281+
res = calculate_cohort_retention(data)
282+
tool_results = (
283+
f"Cohorts={len(res)}; Example retention for first cohort: "
284+
f"{', '.join(f'{r:.1f}%' for r in res[0].retention_rates[:4])}"
285+
)
286+
elif isinstance(data, list) and data and isinstance(data[0], dict):
287+
# Funnel steps
288+
steps = [FunnelStep(**s) for s in data]
289+
res = analyze_funnel(steps)
290+
rates = ", ".join(f"{r:.1f}%" for r in res.conversion_rates)
291+
tool_results = f"Funnel steps={len(steps)}; Conversion: {rates}"
292+
except Exception:
293+
tool_results = tool_results or ""
294+
264295
# Best-of-N generation and rerank by critic
265296
from .config import get_config
266297

@@ -276,7 +307,7 @@ def forward(
276307
history=history_str,
277308
playbook=playbook,
278309
context=context_with_history,
279-
tool_results="No tools used in this session",
310+
tool_results=tool_results or "No tools used in this session",
280311
)
281312

282313
# Critique with retry

orbit_agent/cli.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
run_evals,
2020
summarize_results,
2121
save_eval_results,
22+
grade_with_rubric,
23+
summarize_grades,
24+
save_grades,
2225
)
2326
import os
2427
import subprocess
@@ -560,6 +563,49 @@ def eval_report(
560563
raise typer.Exit(1)
561564

562565

566+
@eval_app.command("grade")
def eval_grade(
    dataset: str = typer.Option(
        "evals/scenarios_personas.yaml", help="Path to scenarios YAML with rubrics"
    ),
    results_path: str = typer.Option(
        ".orbit/evals/latest.jsonl", help="Path to JSONL results to grade"
    ),
    out: str = typer.Option(
        ".orbit/evals/grades.jsonl", help="Where to write rubric grades JSONL"
    ),
):
    """Run rubric-based grading on prior eval results and summarize.

    Loads scenarios (with rubrics) from *dataset*, reads prior eval records
    from *results_path* (JSONL, one EvalRecord per line), grades them with
    grade_with_rubric, writes the grades to *out*, and prints a summary.

    Exits 1 if the results file is missing or grading fails; exits 0 (with a
    notice) when nothing could be graded.
    """
    try:
        scns = load_scenarios(dataset)
        p = Path(results_path)
        if not p.exists():
            console.print(f"[red]Not found:[/red] {results_path}")
            raise typer.Exit(1)
        from .evals import EvalRecord

        rec_objs = []
        for line in p.read_text().splitlines():
            # Skip blank lines so trailing newlines don't break json.loads.
            if not line.strip():
                continue
            rec_objs.append(EvalRecord(**json.loads(line)))
        graded = grade_with_rubric(rec_objs, scns)
        if not graded:
            console.print(
                "[yellow]No rubric-graded items (no rubrics present or LLM unavailable)[/yellow]"
            )
            raise typer.Exit(0)
        save_grades(graded, out)
        summary = summarize_grades(graded)
        console.print("[bold]Rubric Summary[/]:")
        for k, v in summary.items():
            console.print(f"- {k}: {v}")
    except typer.Exit:
        # typer.Exit inherits from Exception (click's Exit subclasses
        # RuntimeError), so without this clause the intentional exits above
        # would be caught below, logged as failures, and Exit(0) would be
        # turned into Exit(1). Re-raise to preserve the intended exit code.
        raise
    except Exception as e:
        logger.error(f"Eval grade failed: {e}")
        console.print(f"[bold red]Error:[/bold red] {e}")
        raise typer.Exit(1)
607+
608+
563609
@models_app.command("list")
564610
def models_list():
565611
"""List models from the active provider (OpenAI only for now)."""

orbit_agent/evals.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import time
55
from dataclasses import dataclass, asdict
66
from pathlib import Path
7-
from typing import List, Dict, Any, Tuple
7+
from typing import List, Dict, Any, Tuple, Optional
88

99
import yaml
1010

@@ -224,3 +224,29 @@ def save_eval_results(records: List[EvalRecord], out_path: str | Path) -> None:
224224
with p.open("w") as f:
225225
for r in records:
226226
f.write(json.dumps(asdict(r)) + "\n")
227+
228+
229+
def parse_grade_json(text: str) -> Optional[Dict[str, Any]]:
    """Parse an LLM grade response into a dict, or None if it cannot be used.

    Returns None both for malformed JSON and for valid JSON whose top-level
    value is not an object (e.g. a bare list or number), so the return value
    always matches the Optional[Dict] annotation and callers can rely on it.
    """
    try:
        data = json.loads(text)
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError covers json.JSONDecodeError.
        return None
    # Enforce the declared contract: only dicts are meaningful grades.
    return data if isinstance(data, dict) else None
234+
235+
236+
def summarize_grades(graded: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize rubric grades (expects items from grade_with_rubric).

    Parses each item's "grade_json" payload and keeps only dict grades that
    carry an "overall" score. Returns {"count": 0} when nothing usable was
    found, otherwise the count and the mean of the "overall" scores.
    """
    usable: List[Dict[str, Any]] = []
    for item in graded:
        grade = parse_grade_json(item.get("grade_json", ""))
        if isinstance(grade, dict) and "overall" in grade:
            usable.append(grade)
    if not usable:
        return {"count": 0}
    total = sum(float(g.get("overall", 0)) for g in usable)
    return {"count": len(usable), "avg_overall": total / len(usable)}
245+
246+
247+
def save_grades(graded: List[Dict[str, Any]], out_path: str | Path) -> None:
248+
p = Path(out_path)
249+
p.parent.mkdir(parents=True, exist_ok=True)
250+
with p.open("w") as f:
251+
for g in graded:
252+
f.write(json.dumps(g) + "\n")

0 commit comments

Comments (0)