Commit 9db18fe

feat: best-of-N rerank; add playbook overlap metric; expand eval report; default model openai/gpt-4.1
1 parent 01366b8

File tree

- orbit_agent/advisor.py
- orbit_agent/cli.py
- orbit_agent/config.py
- orbit_agent/evals.py

4 files changed (+89, -24 lines)

orbit_agent/advisor.py

Lines changed: 44 additions & 23 deletions
```diff
@@ -261,34 +261,55 @@ def forward(
 
             context_with_history = context + recent_context
 
-            # Generate advice with retry
-            logger.info("Generating advice with LLM")
-            draft = self._call_llm_with_retry(
-                self.generate,
-                history=history_str,
-                playbook=playbook,
-                context=context_with_history,
-                tool_results="No tools used in this session",
-            )
+            # Best-of-N generation and rerank by critic
+            from .config import get_config
+
+            cfg = get_config()
+            best_of_n = max(1, int(getattr(cfg, "best_of_n", 1)))
+            best_payload = None
+            best_score = -1
+
+            for _ in range(best_of_n):
+                logger.info("Generating advice with LLM")
+                draft = self._call_llm_with_retry(
+                    self.generate,
+                    history=history_str,
+                    playbook=playbook,
+                    context=context_with_history,
+                    tool_results="No tools used in this session",
+                )
 
-            # Critique with retry
-            logger.info("Getting critique")
-            critique = self._call_llm_with_retry(
-                self.critic,
-                advice=self._clean_output(draft.advice),
-                context=context_with_history,
-            )
+                # Critique with retry
+                logger.info("Getting critique")
+                critique = self._call_llm_with_retry(
+                    self.critic,
+                    advice=self._clean_output(draft.advice),
+                    context=context_with_history,
+                )
+
+                score = int(getattr(critique, "score", 0) or 0)
+                if score > best_score:
+                    best_score = score
+                    best_payload = (
+                        self._clean_output(draft.advice),
+                        self._clean_output(draft.actions_48h),
+                        self._clean_output(draft.metric_to_watch),
+                        self._clean_output(draft.risks),
+                        critique.feedback,
+                        score,
+                    )
 
+            advice, actions_48h, metric_to_watch, risks, feedback, score = best_payload
             result = dspy.Prediction(
-                advice=self._clean_output(draft.advice),
-                actions_48h=self._clean_output(draft.actions_48h),
-                metric_to_watch=self._clean_output(draft.metric_to_watch),
-                risks=self._clean_output(draft.risks),
-                critique=critique.feedback,
-                score=critique.score,
+                advice=advice,
+                actions_48h=actions_48h,
+                metric_to_watch=metric_to_watch,
+                risks=risks,
+                critique=feedback,
+                score=score,
             )
 
-            logger.info(f"Advice generated successfully, score: {critique.score}")
+            logger.info(f"Advice generated successfully, score: {score}")
             return result
 
         except Exception as e:
```
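
Review note: the new loop is a standard best-of-N rerank: sample up to best_of_n drafts, score each with the critic, keep the argmax. Below is a minimal sketch of the pattern in isolation; the generate and score callables are hypothetical stand-ins for self.generate and self.critic, not the project's actual signatures.

```python
from typing import Callable, Optional, Tuple

def best_of_n_rerank(
    generate: Callable[[], str],  # hypothetical: draws one candidate draft
    score: Callable[[str], int],  # hypothetical: critic score, higher is better
    n: int = 1,
) -> Tuple[Optional[str], int]:
    """Sample up to n candidates and keep the one the critic scores highest."""
    best, best_score = None, -1
    for _ in range(max(1, n)):  # guard against n < 1, as the diff does
        candidate = generate()
        s = score(candidate)
        if s > best_score:
            best, best_score = candidate, s
    return best, best_score
```

With best_of_n=1 this reduces to a single draft, so the old behavior remains the default. Repeated samples only differ when the LM temperature is above zero, which is presumably why the commit adds a temperature setting alongside best_of_n.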

orbit_agent/cli.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -540,6 +540,20 @@ def eval_report(
         console.print("[bold]Summary[/]:")
         for k, v in summary.items():
             console.print(f"- {k}: {v}")
+        # Print per-scenario brief
+        console.print("\n[bold]By Scenario[/]:")
+        from collections import defaultdict
+
+        by_id = defaultdict(list)
+        for r in rec_objs:
+            by_id[r.scenario_id].append(r)
+        for sid, items in by_id.items():
+            cs = sum(i.critic_score for i in items) / len(items)
+            ov = sum((i.overlap_ratio or 0.0) for i in items) / len(items)
+            lat = sum(i.latency_ms for i in items) / len(items)
+            console.print(
+                f"- {sid}: score={cs:.2f}, overlap={ov:.2f}, latency_ms={lat:.0f}"
+            )
     except Exception as e:
         logger.error(f"Eval report failed: {e}")
         console.print(f"[bold red]Error:[/bold red] {e}")
```
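
The per-scenario brief is a plain group-by-then-average over the eval records. The same aggregation as a standalone, testable function; the Record dataclass here is a hypothetical stand-in for EvalRecord:

```python
from collections import defaultdict
from dataclasses import dataclass

@dataclass
class Record:  # hypothetical stand-in for EvalRecord
    scenario_id: str
    critic_score: int
    latency_ms: float
    overlap_ratio: float | None = None

def by_scenario(records: list[Record]) -> dict[str, dict[str, float]]:
    """Group records by scenario_id, then average each metric within a group."""
    groups: defaultdict[str, list[Record]] = defaultdict(list)
    for r in records:
        groups[r.scenario_id].append(r)
    return {
        sid: {
            "score": sum(r.critic_score for r in items) / len(items),
            "overlap": sum((r.overlap_ratio or 0.0) for r in items) / len(items),
            "latency_ms": sum(r.latency_ms for r in items) / len(items),
        }
        for sid, items in groups.items()
    }
```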

orbit_agent/config.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -15,7 +15,7 @@
 logger = logging.getLogger(__name__)
 
 # Default model configurations
-DEFAULT_OPENAI = "openai/gpt-4o-mini"
+DEFAULT_OPENAI = "openai/gpt-4.1"
 DEFAULT_ANTHROPIC = "anthropic/claude-3-5-sonnet-20240620"
 DEFAULT_OLLAMA = "ollama_chat/llama3.2"
 
@@ -74,6 +74,9 @@ class AppConfig:
     track_usage: bool = False
     cost_per_1k_prompt: float = 0.0
     cost_per_1k_completion: float = 0.0
+    # Generation quality controls
+    best_of_n: int = 1
+    temperature: float = 0.7
 
     def __post_init__(self):
         """Validate configuration after initialization"""
@@ -179,6 +182,8 @@ def load_config() -> AppConfig:
         track_usage=os.getenv("ORBIT_TRACK_USAGE", "false").lower() == "true",
         cost_per_1k_prompt=float(os.getenv("ORBIT_COST_PER_1K_PROMPT", "0")),
         cost_per_1k_completion=float(os.getenv("ORBIT_COST_PER_1K_COMPLETION", "0")),
+        best_of_n=int(os.getenv("ORBIT_BEST_OF_N", "1")),
+        temperature=float(os.getenv("ORBIT_TEMPERATURE", "0.7")),
     )
 
     return config
```
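
Both new knobs follow the existing env-var pattern in load_config(). A sketch of how they would be exercised, assuming the orbit_agent package is importable; the values are examples only:

```python
import os

# Example values only; the variable names come from load_config() above.
os.environ["ORBIT_BEST_OF_N"] = "3"      # sample three drafts per request
os.environ["ORBIT_TEMPERATURE"] = "0.9"  # >0 so repeated samples actually differ

from orbit_agent.config import load_config

cfg = load_config()
assert cfg.best_of_n == 3
assert abs(cfg.temperature - 0.9) < 1e-9
```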

orbit_agent/evals.py

Lines changed: 25 additions & 0 deletions
```diff
@@ -37,6 +37,7 @@ class EvalRecord:
     format_ok: bool
     actions_count: int
     risks_count: int
+    overlap_ratio: float | None = None
 
 
 def load_scenarios(path: str | Path) -> List[Scenario]:
@@ -84,6 +85,25 @@ def _format_eval(
     return format_ok, actions_count, risks_count
 
 
+def _ngram_set(text: str, n: int = 3) -> set[str]:
+    tokens = [t for t in text.lower().split() if t]
+    return set(
+        [" ".join(tokens[i : i + n]) for i in range(0, max(0, len(tokens) - n + 1))]
+    )
+
+
+def _overlap_ratio(a: str, b: str, n: int = 3) -> float:
+    if not a or not b:
+        return 0.0
+    A = _ngram_set(a, n)
+    B = _ngram_set(b, n)
+    if not A or not B:
+        return 0.0
+    inter = len(A & B)
+    union = len(A | B)
+    return inter / union
+
+
 def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
     # Ensure LM is configured for online evals
     configure_lm()
@@ -106,6 +126,8 @@ def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
             res.advice or "", actions_lines, risks_lines
         )
 
+        overlap = _overlap_ratio(res.advice or "", playbook) if playbook else 0.0
+
         records.append(
             EvalRecord(
                 scenario_id=sc.id,
@@ -121,6 +143,7 @@ def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
                 format_ok=format_ok,
                 actions_count=a_count,
                 risks_count=r_count,
+                overlap_ratio=overlap,
             )
         )
 
@@ -134,11 +157,13 @@ def summarize_results(records: List[EvalRecord]) -> Dict[str, Any]:
     fmt_ok = sum(1 for r in records if r.format_ok)
     avg_score = sum(r.critic_score for r in records) / n
     avg_latency = sum(r.latency_ms for r in records) / n
+    avg_overlap = sum((r.overlap_ratio or 0.0) for r in records) / n
     return {
         "count": n,
         "format_ok_rate": fmt_ok / n,
         "avg_critic_score": avg_score,
         "avg_latency_ms": avg_latency,
+        "avg_playbook_overlap": avg_overlap,
     }
 
 
```
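
The new metric is Jaccard similarity over word trigrams: the size of the intersection of the two shingle sets divided by the size of their union, 1.0 for identical text and 0.0 when either side is empty. A worked check of the arithmetic (a standalone reimplementation; the numbers follow from the definition):

```python
def ngram_set(text: str, n: int = 3) -> set[str]:
    # Mirrors _ngram_set: lowercase, whitespace-split, n-word shingles.
    tokens = text.lower().split()
    return {" ".join(tokens[i : i + n]) for i in range(max(0, len(tokens) - n + 1))}

def overlap_ratio(a: str, b: str, n: int = 3) -> float:
    # Jaccard over the two shingle sets, 0.0 when either side is empty.
    A, B = ngram_set(a, n), ngram_set(b, n)
    if not A or not B:
        return 0.0
    return len(A & B) / len(A | B)

advice = "ship a fix today and watch churn"
playbook = "ship a fix today then monitor retention"
# Shared trigrams: "ship a fix", "a fix today"  -> intersection = 2
# Each side has 5 trigrams                      -> union = 5 + 5 - 2 = 8
print(overlap_ratio(advice, playbook))  # 0.25
```

One caveat for review: the measure is symmetric and length-sensitive, so long advice against a short playbook keeps the ratio low even when every playbook phrase is echoed; a containment variant (intersection over the playbook's shingle count) would rank such cases higher.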