Skip to content

Commit a222c24

Browse files
committed
feat(evals): add rubric grade CLI and summary; tool-aware analysis; formatting fixes
1 parent 9db18fe commit a222c24

File tree

3 files changed

+105
-2
lines changed

3 files changed

+105
-2
lines changed

orbit_agent/advisor.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,37 @@ def forward(
261261

262262
context_with_history = context + recent_context
263263

264+
# Attempt tool-augmented analysis for structured snippets
265+
tool_results = ""
266+
try:
267+
import re
268+
269+
from .tools.retention import calculate_cohort_retention
270+
from .tools.funnel import analyze_funnel, FunnelStep
271+
272+
# Extract inline JSON-ish snippet if present
273+
m = re.search(r"(\[.*\]|\{.*\})", history_str, re.DOTALL)
274+
if m:
275+
snippet = m.group(1)
276+
import json
277+
278+
data = json.loads(snippet)
279+
if isinstance(data, list) and data and isinstance(data[0], list):
280+
# Retention cohorts
281+
res = calculate_cohort_retention(data)
282+
tool_results = (
283+
f"Cohorts={len(res)}; Example retention for first cohort: "
284+
f"{', '.join(f'{r:.1f}%' for r in res[0].retention_rates[:4])}"
285+
)
286+
elif isinstance(data, list) and data and isinstance(data[0], dict):
287+
# Funnel steps
288+
steps = [FunnelStep(**s) for s in data]
289+
res = analyze_funnel(steps)
290+
rates = ", ".join(f"{r:.1f}%" for r in res.conversion_rates)
291+
tool_results = f"Funnel steps={len(steps)}; Conversion: {rates}"
292+
except Exception:
293+
tool_results = tool_results or ""
294+
264295
# Best-of-N generation and rerank by critic
265296
from .config import get_config
266297

@@ -276,7 +307,7 @@ def forward(
276307
history=history_str,
277308
playbook=playbook,
278309
context=context_with_history,
279-
tool_results="No tools used in this session",
310+
tool_results=tool_results or "No tools used in this session",
280311
)
281312

282313
# Critique with retry

orbit_agent/cli.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
run_evals,
2020
summarize_results,
2121
save_eval_results,
22+
grade_with_rubric,
23+
summarize_grades,
24+
save_grades,
2225
)
2326
import os
2427
import subprocess
@@ -560,6 +563,49 @@ def eval_report(
560563
raise typer.Exit(1)
561564

562565

566+
@eval_app.command("grade")
def eval_grade(
    dataset: str = typer.Option(
        "evals/scenarios_personas.yaml", help="Path to scenarios YAML with rubrics"
    ),
    results_path: str = typer.Option(
        ".orbit/evals/latest.jsonl", help="Path to JSONL results to grade"
    ),
    out: str = typer.Option(
        ".orbit/evals/grades.jsonl", help="Where to write rubric grades JSONL"
    ),
):
    """Run rubric-based grading on prior eval results and summarize.

    Loads scenarios (with rubrics) from *dataset*, reads prior eval records
    from *results_path* (JSONL, one EvalRecord per line), grades them with
    grade_with_rubric, writes the grades to *out*, and prints a summary.

    Exits 1 if the results file is missing or grading fails; exits 0 (with a
    notice) when nothing could be graded.
    """
    try:
        scns = load_scenarios(dataset)
        p = Path(results_path)
        if not p.exists():
            console.print(f"[red]Not found:[/red] {results_path}")
            raise typer.Exit(1)
        from .evals import EvalRecord

        rec_objs = []
        for line in p.read_text().splitlines():
            # Skip blank lines so trailing newlines don't break json.loads.
            if not line.strip():
                continue
            rec_objs.append(EvalRecord(**json.loads(line)))
        graded = grade_with_rubric(rec_objs, scns)
        if not graded:
            console.print(
                "[yellow]No rubric-graded items (no rubrics present or LLM unavailable)[/yellow]"
            )
            raise typer.Exit(0)
        save_grades(graded, out)
        summary = summarize_grades(graded)
        console.print("[bold]Rubric Summary[/]:")
        for k, v in summary.items():
            console.print(f"- {k}: {v}")
    except typer.Exit:
        # typer.Exit inherits from Exception (click's Exit subclasses
        # RuntimeError), so without this clause the intentional exits above
        # would be caught below, logged as failures, and Exit(0) would be
        # turned into Exit(1). Re-raise to preserve the intended exit code.
        raise
    except Exception as e:
        logger.error(f"Eval grade failed: {e}")
        console.print(f"[bold red]Error:[/bold red] {e}")
        raise typer.Exit(1)
607+
608+
563609
@models_app.command("list")
564610
def models_list():
565611
"""List models from the active provider (OpenAI only for now)."""

orbit_agent/evals.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import time
55
from dataclasses import dataclass, asdict
66
from pathlib import Path
7-
from typing import List, Dict, Any, Tuple
7+
from typing import List, Dict, Any, Tuple, Optional
88

99
import yaml
1010

@@ -224,3 +224,29 @@ def save_eval_results(records: List[EvalRecord], out_path: str | Path) -> None:
224224
with p.open("w") as f:
225225
for r in records:
226226
f.write(json.dumps(asdict(r)) + "\n")
227+
228+
229+
def parse_grade_json(text: str) -> Optional[Dict[str, Any]]:
    """Parse an LLM grade response into a dict, or None if it cannot be used.

    Returns None both for malformed JSON and for valid JSON whose top-level
    value is not an object (e.g. a bare list or number), so the return value
    always matches the Optional[Dict] annotation and callers can rely on it.
    """
    try:
        data = json.loads(text)
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError covers json.JSONDecodeError.
        return None
    # Enforce the declared contract: only dicts are meaningful grades.
    return data if isinstance(data, dict) else None
234+
235+
236+
def summarize_grades(graded: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize rubric grades (expects items from grade_with_rubric).

    Parses each item's "grade_json" payload and keeps only dict grades that
    carry an "overall" score. Returns {"count": 0} when nothing usable was
    found, otherwise the count and the mean of the "overall" scores.
    """
    usable: List[Dict[str, Any]] = []
    for item in graded:
        grade = parse_grade_json(item.get("grade_json", ""))
        if isinstance(grade, dict) and "overall" in grade:
            usable.append(grade)
    if not usable:
        return {"count": 0}
    total = sum(float(g.get("overall", 0)) for g in usable)
    return {"count": len(usable), "avg_overall": total / len(usable)}
245+
246+
247+
def save_grades(graded: List[Dict[str, Any]], out_path: str | Path) -> None:
248+
p = Path(out_path)
249+
p.parent.mkdir(parents=True, exist_ok=True)
250+
with p.open("w") as f:
251+
for g in graded:
252+
f.write(json.dumps(g) + "\n")

0 commit comments

Comments (0)