Commit 33c16b4

feat(evals): personas + rubric support, grading stub; CLI eval run/report; model listing; advisor prompt anti-templating
1 parent 17c2394 commit 33c16b4

4 files changed: +157, -3 lines


evals/scenarios_personas.yaml

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+scenarios:
+  - id: seed_ceo_focus
+    persona: "Founder-CEO, 2-person team, zero revenue"
+    stage: "Pre-seed"
+    prompt: "We have 10 interviews suggesting pain but no commitments. How do we get 3 paid design partners in 14 days?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Actionability: concrete steps with owners and deadlines"
+      - "Specific outreach: named segments or job titles, with scripts"
+      - "Measurement: leading indicator within 1-2 weeks"
+      - "Courageous tradeoffs: deprioritizes comfort features for decisive tests"
+
+  - id: series_a_ceo_retention
+    persona: "Founder-CEO, 15 FTE, $100k MRR"
+    stage: "Early growth"
+    prompt: "Logo churn ticked to 3% last month. Which cohorts should we diagnose first and what actions in the next 48h?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Cohort thinking: segment by ICP/use-case, not averages"
+      - "Hypothesis-driven: proposes concrete causes and tests"
+      - "Cross-functional: product + CS + pricing interplay"
+      - "Immediate actions: 48h calendar-blocked tasks"
+
+  - id: consumer_app_activation
+    persona: "PM, Consumer app, 20k WAU"
+    stage: "Product-market search"
+    prompt: "Activation from signup to first value is 17%. Propose a 1-week plan to hit 25% and what to cut to get there."
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Activation metric clarity: precise step definition and baseline"
+      - "Small-bets: 3-5 experiments with measurable deltas"
+      - "Qual + quant: combines funnel data with user calls"
+      - "Cut list: explicit deprioritizations for bandwidth"
+
+  - id: enterprise_pipeline
+    persona: "Founder, Enterprise B2B, 6-month cycles"
+    stage: "Pilot to contract"
+    prompt: "We have 4 late-stage deals stuck on security review. What should I do this week to force a decision?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Deal mechanics: identifies blockers and stakeholders"
+      - "Mutual close plan: steps, owners, dates"
+      - "Value re-anchoring: ROI or risk framing"
+      - "Escalation: exec alignment or withdraw strategy"
+
+  - id: pricing_reframe
+    persona: "Go-to-market lead"
+    stage: "Post-MVP"
+    prompt: "Our per-seat plan stalls at 30 seats. How to reframe pricing to expand accounts in 2 weeks?"
+    playbook: playbooks/high_orbit.yaml
+    rubric:
+      - "Wedge expansion: module/usage or value metrics"
+      - "Pilot offer: specific terms and success criteria"
+      - "Objection-handling: equips AE with scripts"
+      - "Analysis: before/after unit economics"

orbit_agent/advisor.py

Lines changed: 5 additions & 2 deletions
@@ -51,10 +51,13 @@ class HighOrbitAdvice(dspy.Signature):
     """You are a brutally honest startup advisor in the YC tradition.

     Give specific, actionable advice that optimizes for $1B vs $0 outcomes.
-    Be concrete and direct. Avoid generic platitudes.
+    Be concrete, direct, and tailored to the user's context (persona, stage,
+    constraints). Treat the playbook as heuristics, not as text to repeat.
+    Do not regurgitate playbook lines; instead, synthesize novel guidance and
+    tie it to the user's scenario with clear reasoning.

     Output format requirements:
-    - Advice: 2-3 paragraphs of specific, actionable guidance
+    - Advice: 2-3 paragraphs of specific, actionable guidance (no bullets)
     - Actions: List exactly 3-5 specific tasks, one per line, no formatting
     - Metric: One clear metric to track progress
     - Risks: List exactly 3 risks, one per line, no formatting
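
Side note: a dspy.Signature docstring like this becomes the instruction prompt when the signature is run through a predictor. A minimal sketch of the wiring; the field names used here (context, question, advice) are assumptions, since the signature's fields sit outside this hunk:

    import dspy

    from orbit_agent.advisor import HighOrbitAdvice

    # Hypothetical wiring: HighOrbitAdvice's real input/output field names are
    # not shown in this diff, so context/question/advice are placeholders.
    dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # any configured LM works
    advise = dspy.Predict(HighOrbitAdvice)
    result = advise(
        context=open("playbooks/high_orbit.yaml").read(),
        question="Activation from signup to first value is 17%. How do we hit 25%?",
    )
    print(result.advice)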

orbit_agent/cli.py

Lines changed: 44 additions & 0 deletions
@@ -32,6 +32,8 @@
 eval_app = typer.Typer(help="Run evals and generate reports")
 app.add_typer(ctx_app, name="context")
 app.add_typer(eval_app, name="eval")
+models_app = typer.Typer(help="List available models for the active provider")
+app.add_typer(models_app, name="models")


 @app.callback()
@@ -548,5 +550,47 @@ def eval_report(
         raise typer.Exit(1)


+@models_app.command("list")
+def models_list():
+    """List models from the active provider (OpenAI only for now)."""
+    try:
+        import os
+        from openai import OpenAI
+
+        key = os.getenv("OPENAI_API_KEY")
+        if not key:
+            console.print("[red]OPENAI_API_KEY not set[/red]")
+            raise typer.Exit(1)
+
+        client = OpenAI(api_key=key)
+        resp = client.models.list()
+        ids = [m.id for m in resp.data]
+        # Prefer relevant chat-capable frontier models
+        preferred = [
+            i
+            for i in ids
+            if any(
+                i.startswith(p)
+                for p in (
+                    "gpt-5",
+                    "gpt-4.1",
+                    "gpt-4o",
+                    "o3",
+                )
+            )
+        ]
+        console.print("[bold]Candidate Models[/]:")
+        for mid in sorted(preferred):
+            console.print(f"- {mid}")
+        others = [i for i in ids if i not in preferred]
+        console.print("\n[dim]Other models (truncated)[/dim]")
+        for mid in sorted(others)[:20]:
+            console.print(f"- {mid}")
+    except Exception as e:
+        logger.error(f"Model listing failed: {e}")
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise typer.Exit(1)


 if __name__ == "__main__":
     app()
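
With OPENAI_API_KEY set, the new subcommand runs as `python -m orbit_agent.cli models list` (module invocation; this diff does not show a console-script name). The prefix filter itself can be exercised standalone; a small sketch with made-up model ids:

    # Standalone sketch of the prefix filter used by models_list; the ids are
    # illustrative, not a real API response.
    ids = ["gpt-4o-mini", "gpt-4.1", "o3-mini", "text-embedding-3-small", "whisper-1"]
    prefixes = ("gpt-5", "gpt-4.1", "gpt-4o", "o3")
    preferred = [i for i in ids if any(i.startswith(p) for p in prefixes)]
    print(sorted(preferred))  # ['gpt-4.1', 'gpt-4o-mini', 'o3-mini']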

orbit_agent/evals.py

Lines changed: 53 additions & 1 deletion
@@ -17,6 +17,9 @@ class Scenario:
     id: str
     prompt: str
     playbook_path: str | None = None
+    persona: str | None = None
+    stage: str | None = None
+    rubric: List[str] | None = None


 @dataclass
@@ -46,6 +49,9 @@ def load_scenarios(path: str | Path) -> List[Scenario]:
                 id=item.get("id") or f"s{i+1}",
                 prompt=item["prompt"],
                 playbook_path=item.get("playbook"),
+                persona=item.get("persona"),
+                stage=item.get("stage"),
+                rubric=item.get("rubric"),
             )
         )
     return scenarios
@@ -126,10 +132,56 @@ def summarize_results(records: List[EvalRecord]) -> Dict[str, Any]:
     }


+# Optional: experimental grading with an explicit rubric using LLM
+try:
+    import dspy
+
+    class RubricGrade(dspy.Signature):
+        """You are grading a startup advice response against a rubric.
+        Return a JSON object strictly with fields: overall (0-10), feedback, criteria.
+        'criteria' maps each rubric item to pass/fail and a brief note.
+        """
+
+        persona: str = dspy.InputField()
+        stage: str = dspy.InputField()
+        prompt: str = dspy.InputField()
+        rubric: str = dspy.InputField(desc="Bulleted list of rubric items")
+        advice: str = dspy.InputField()
+
+        grade_json: str = dspy.OutputField()
+
+    _rubric_grader = dspy.Predict(RubricGrade)
+
+    def grade_with_rubric(
+        records: List[EvalRecord], scenarios: List[Scenario]
+    ) -> List[Dict[str, Any]]:
+        id_to_scn = {s.id: s for s in scenarios}
+        graded: List[Dict[str, Any]] = []
+        for r in records:
+            sc = id_to_scn.get(r.scenario_id)
+            if not sc or not sc.rubric:
+                continue
+            rubric_text = "\n".join(f"- {item}" for item in sc.rubric)
+            out = _rubric_grader(
+                persona=sc.persona or "",
+                stage=sc.stage or "",
+                prompt=sc.prompt,
+                rubric=rubric_text,
+                advice=r.advice,
+            )
+            graded.append({
+                "scenario_id": r.scenario_id,
+                "grade_json": out.grade_json,
+            })
+        return graded
+except Exception:  # pragma: no cover
+    def grade_with_rubric(records: List[EvalRecord], scenarios: List[Scenario]):
+        return []


 def save_eval_results(records: List[EvalRecord], out_path: str | Path) -> None:
     p = Path(out_path)
     p.parent.mkdir(parents=True, exist_ok=True)
     with p.open("w") as f:
         for r in records:
             f.write(json.dumps(asdict(r)) + "\n")
-
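
End-to-end, grading a finished run looks roughly like the sketch below. It assumes a dspy LM is already configured, and EvalRecord's full field list is not shown in this diff, so the hand-made record may need extra required fields. grade_json is raw model output, so the json.loads call can fail and likely needs guarding in practice:

    import json

    from orbit_agent.evals import EvalRecord, grade_with_rubric, load_scenarios

    scenarios = load_scenarios("evals/scenarios_personas.yaml")
    # One hand-made record for illustration; real runs produce these via the
    # eval CLI.
    records = [EvalRecord(scenario_id="seed_ceo_focus", advice="...advice text...")]
    for g in grade_with_rubric(records, scenarios):
        grade = json.loads(g["grade_json"])  # raw LLM output; may not parse cleanly
        print(g["scenario_id"], grade.get("overall"), grade.get("feedback"))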
