
Commit 17c2394

feat(evals): add eval runner, CLI commands, and scenarios; add format tests
1 parent a2184e8 commit 17c2394

File tree

evals/scenarios.yaml
orbit_agent/cli.py
orbit_agent/evals.py
tests/test_evals_format.py

4 files changed: +239 -0 lines changed


evals/scenarios.yaml

Lines changed: 13 additions & 0 deletions
scenarios:
  - id: yc_dilution
    prompt: "I'm considering YC. Is the 7% dilution worth it?"
    playbook: playbooks/high_orbit.yaml
  - id: default_alive
    prompt: "We have $800k cash and $55k burn. Are we default-alive?"
    playbook: playbooks/high_orbit.yaml
  - id: focus_plan
    prompt: "Create a ruthless 48h focus plan to validate paid design partner demand for our B2B SaaS."
    playbook: playbooks/high_orbit.yaml
  - id: raise_round
    prompt: "Should I raise a round now or wait 6 months? Pros and cons?"
    playbook: playbooks/high_orbit.yaml

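For reference, the loader added in orbit_agent/evals.py (below) maps each entry's playbook key onto Scenario.playbook_path and falls back to ids s1, s2, ... when id is missing. A minimal sketch of loading this dataset programmatically, assuming it is run from the repo root so the relative path resolves:

# Sketch: load evals/scenarios.yaml with the new loader (run from the repo root).
from orbit_agent.evals import load_scenarios

scenarios = load_scenarios("evals/scenarios.yaml")
for sc in scenarios:
    # The YAML "playbook" key is exposed as Scenario.playbook_path
    print(sc.id, sc.playbook_path)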
orbit_agent/cli.py

Lines changed: 69 additions & 0 deletions
@@ -14,6 +14,12 @@
 from .tools.retention import calculate_cohort_retention
 from .tools.funnel import analyze_funnel, FunnelStep
 from .memory import load_context, save_context
+from .evals import (
+    load_scenarios,
+    run_evals,
+    summarize_results,
+    save_eval_results,
+)
 import os
 import subprocess

@@ -23,7 +29,9 @@

 app = typer.Typer(help="Orbit Agent CLI — brutally honest startup advisor")
 ctx_app = typer.Typer(help="Manage your Orbit context")
+eval_app = typer.Typer(help="Run evals and generate reports")
 app.add_typer(ctx_app, name="context")
+app.add_typer(eval_app, name="eval")


 @app.callback()

@@ -479,5 +487,66 @@ def context_edit(
         console.print(f"[bold red]Error:[/bold red] {e}")


+@eval_app.command("run")
+def eval_run(
+    dataset: str = typer.Option(
+        "evals/scenarios.yaml", help="Path to scenarios YAML"
+    ),
+    out: str = typer.Option(
+        ".orbit/evals/latest.jsonl", help="Path to save JSONL results"
+    ),
+):
+    """Run evals on scenarios and save JSONL results."""
+    try:
+        scenarios = load_scenarios(dataset)
+        if not scenarios:
+            console.print("[yellow]No scenarios found[/yellow]")
+            raise typer.Exit(1)
+
+        console.print(f"[dim]Loaded {len(scenarios)} scenarios[/dim]")
+        with console.status("[bold green]Evaluating..."):
+            records = run_evals(scenarios)
+        save_eval_results(records, out)
+        console.print(f"[green]Saved results to {out}[/green]")
+        summary = summarize_results(records)
+        console.print("\n[bold]Summary[/]:")
+        for k, v in summary.items():
+            console.print(f"- {k}: {v}")
+    except Exception as e:
+        logger.error(f"Eval run failed: {e}")
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise typer.Exit(1)
+
+
+@eval_app.command("report")
+def eval_report(
+    results_path: str = typer.Argument(
+        ..., help="Path to JSONL results from eval run"
+    ),
+):
+    """Summarize a JSONL results file from eval run."""
+    try:
+        p = Path(results_path)
+        if not p.exists():
+            console.print(f"[red]Not found:[/red] {results_path}")
+            raise typer.Exit(1)
+        records = []
+        for line in p.read_text().splitlines():
+            if not line.strip():
+                continue
+            records.append(json.loads(line))
+        from .evals import EvalRecord
+
+        rec_objs = [EvalRecord(**r) for r in records]
+        summary = summarize_results(rec_objs)
+        console.print("[bold]Summary[/]:")
+        for k, v in summary.items():
+            console.print(f"- {k}: {v}")
+    except Exception as e:
+        logger.error(f"Eval report failed: {e}")
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise typer.Exit(1)
+
+
 if __name__ == "__main__":
     app()

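A hedged usage sketch for the new subcommands, driving the Typer app in-process with typer.testing.CliRunner. It assumes the eval scenarios exist at the default path and that a language model is configured in the environment for the advisor call:

# Sketch: exercise "eval run" and "eval report" in-process (paths are the CLI defaults).
from typer.testing import CliRunner

from orbit_agent.cli import app

runner = CliRunner()
result = runner.invoke(
    app,
    ["eval", "run", "--dataset", "evals/scenarios.yaml", "--out", ".orbit/evals/latest.jsonl"],
)
print(result.output)

result = runner.invoke(app, ["eval", "report", ".orbit/evals/latest.jsonl"])
print(result.output)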
orbit_agent/evals.py

Lines changed: 135 additions & 0 deletions
from __future__ import annotations

import json
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Dict, Any, Tuple

import yaml

from .advisor import HighOrbitAdvisor
from .config import configure_lm


@dataclass
class Scenario:
    id: str
    prompt: str
    playbook_path: str | None = None


@dataclass
class EvalRecord:
    scenario_id: str
    prompt: str
    timestamp: float
    latency_ms: float
    advice: str
    actions: List[str]
    metric_to_watch: str
    risks: List[str]
    critic_score: int
    critic_feedback: str
    format_ok: bool
    actions_count: int
    risks_count: int


def load_scenarios(path: str | Path) -> List[Scenario]:
    p = Path(path)
    data = yaml.safe_load(p.read_text())
    scenarios: List[Scenario] = []
    for i, item in enumerate(data.get("scenarios", [])):
        scenarios.append(
            Scenario(
                id=item.get("id") or f"s{i+1}",
                prompt=item["prompt"],
                playbook_path=item.get("playbook"),
            )
        )
    return scenarios


def _split_lines(value: Any) -> List[str]:
    if isinstance(value, list):
        return [str(x).strip() for x in value if str(x).strip()]
    if isinstance(value, str):
        return [ln.strip() for ln in value.split("\n") if ln.strip()]
    return []


def _format_eval(advice: str, actions_lines: List[str], risks_lines: List[str]) -> Tuple[bool, int, int]:
    # Clean bullet prefixes
    a_clean = [ln.lstrip("123456789. -•*").strip() for ln in actions_lines if ln.strip()]
    r_clean = [ln.lstrip("123456789. -•*").strip() for ln in risks_lines if ln.strip()]
    actions_count = len(a_clean)
    risks_count = len(r_clean)
    # Format rules: 3-5 actions, exactly 3 risks, advice non-empty
    format_ok = (3 <= actions_count <= 5) and (risks_count == 3) and bool(advice and advice.strip())
    return format_ok, actions_count, risks_count


def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
    # Ensure LM is configured for online evals
    configure_lm()
    advisor = HighOrbitAdvisor()
    records: List[EvalRecord] = []

    for sc in scenarios:
        playbook = ""
        if sc.playbook_path and Path(sc.playbook_path).exists():
            playbook = Path(sc.playbook_path).read_text()

        history = [{"role": "user", "content": sc.prompt}]
        start = time.time()
        res = advisor(history=history, playbook=playbook)
        latency_ms = (time.time() - start) * 1000.0

        actions_lines = _split_lines(res.actions_48h)
        risks_lines = _split_lines(res.risks)
        format_ok, a_count, r_count = _format_eval(res.advice or "", actions_lines, risks_lines)

        records.append(
            EvalRecord(
                scenario_id=sc.id,
                prompt=sc.prompt,
                timestamp=time.time(),
                latency_ms=latency_ms,
                advice=res.advice or "",
                actions=[ln.lstrip("123456789. -•*").strip() for ln in actions_lines],
                metric_to_watch=res.metric_to_watch or "",
                risks=[ln.lstrip("123456789. -•*").strip() for ln in risks_lines],
                critic_score=int(getattr(res, "score", 0) or 0),
                critic_feedback=str(getattr(res, "critique", "") or ""),
                format_ok=format_ok,
                actions_count=a_count,
                risks_count=r_count,
            )
        )

    return records


def summarize_results(records: List[EvalRecord]) -> Dict[str, Any]:
    n = len(records)
    if n == 0:
        return {"count": 0}
    fmt_ok = sum(1 for r in records if r.format_ok)
    avg_score = sum(r.critic_score for r in records) / n
    avg_latency = sum(r.latency_ms for r in records) / n
    return {
        "count": n,
        "format_ok_rate": fmt_ok / n,
        "avg_critic_score": avg_score,
        "avg_latency_ms": avg_latency,
    }


def save_eval_results(records: List[EvalRecord], out_path: str | Path) -> None:
    p = Path(out_path)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open("w") as f:
        for r in records:
            f.write(json.dumps(asdict(r)) + "\n")

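Each line of the saved JSONL file is one EvalRecord serialized with dataclasses.asdict, so results can be reloaded outside the CLI for ad-hoc analysis. A minimal sketch, assuming the default output path used by eval run:

# Sketch: reload .orbit/evals/latest.jsonl into EvalRecord objects and summarize.
import json
from pathlib import Path

from orbit_agent.evals import EvalRecord, summarize_results

records = [
    EvalRecord(**json.loads(line))
    for line in Path(".orbit/evals/latest.jsonl").read_text().splitlines()
    if line.strip()
]
print(summarize_results(records))  # count, format_ok_rate, avg_critic_score, avg_latency_ms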
tests/test_evals_format.py

Lines changed: 22 additions & 0 deletions
from orbit_agent.evals import _format_eval


def test_format_eval_happy_path():
    advice = "Do X. Do Y."
    actions = ["1. Call 10 users", "2. Ship MVP", "3. Measure activation"]
    risks = ["No demand", "Wrong ICP", "Channel saturation"]
    ok, a_count, r_count = _format_eval(advice, actions, risks)
    assert ok is True
    assert a_count == 3
    assert r_count == 3


def test_format_eval_too_few_actions_or_risks():
    advice = "Something"
    actions = ["One"]
    risks = ["Risk1", "Risk2"]
    ok, a_count, r_count = _format_eval(advice, actions, risks)
    assert ok is False
    assert a_count == 1
    assert r_count == 2

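The string-input path of _split_lines (used when the advisor returns newline-delimited text rather than lists) is not covered by these tests; a hypothetical additional check, sketched against the implementation above and not part of this commit:

# Hypothetical extra test (not in this commit): _split_lines splits newline-delimited
# strings, drops blank entries, and leaves bullet prefixes for _format_eval to strip.
from orbit_agent.evals import _split_lines


def test_split_lines_string_and_list_input():
    assert _split_lines("1. Call users\n\n2. Ship MVP\n") == ["1. Call users", "2. Ship MVP"]
    assert _split_lines(["  a  ", ""]) == ["a"]
    assert _split_lines(None) == []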
0 commit comments
