Commit 9db18fe

feat: best-of-N rerank; add playbook overlap metric; expand eval report; default model openai/gpt-4.1
1 parent 01366b8

File tree

- orbit_agent/advisor.py
- orbit_agent/cli.py
- orbit_agent/config.py
- orbit_agent/evals.py

4 files changed (+89, -24 lines)

orbit_agent/advisor.py

Lines changed: 44 additions & 23 deletions
```diff
@@ -261,34 +261,55 @@ def forward(
 
             context_with_history = context + recent_context
 
-            # Generate advice with retry
-            logger.info("Generating advice with LLM")
-            draft = self._call_llm_with_retry(
-                self.generate,
-                history=history_str,
-                playbook=playbook,
-                context=context_with_history,
-                tool_results="No tools used in this session",
-            )
+            # Best-of-N generation and rerank by critic
+            from .config import get_config
+
+            cfg = get_config()
+            best_of_n = max(1, int(getattr(cfg, "best_of_n", 1)))
+            best_payload = None
+            best_score = -1
+
+            for _ in range(best_of_n):
+                logger.info("Generating advice with LLM")
+                draft = self._call_llm_with_retry(
+                    self.generate,
+                    history=history_str,
+                    playbook=playbook,
+                    context=context_with_history,
+                    tool_results="No tools used in this session",
+                )
 
-            # Critique with retry
-            logger.info("Getting critique")
-            critique = self._call_llm_with_retry(
-                self.critic,
-                advice=self._clean_output(draft.advice),
-                context=context_with_history,
-            )
+                # Critique with retry
+                logger.info("Getting critique")
+                critique = self._call_llm_with_retry(
+                    self.critic,
+                    advice=self._clean_output(draft.advice),
+                    context=context_with_history,
+                )
+
+                score = int(getattr(critique, "score", 0) or 0)
+                if score > best_score:
+                    best_score = score
+                    best_payload = (
+                        self._clean_output(draft.advice),
+                        self._clean_output(draft.actions_48h),
+                        self._clean_output(draft.metric_to_watch),
+                        self._clean_output(draft.risks),
+                        critique.feedback,
+                        score,
+                    )
 
+            advice, actions_48h, metric_to_watch, risks, feedback, score = best_payload
             result = dspy.Prediction(
-                advice=self._clean_output(draft.advice),
-                actions_48h=self._clean_output(draft.actions_48h),
-                metric_to_watch=self._clean_output(draft.metric_to_watch),
-                risks=self._clean_output(draft.risks),
-                critique=critique.feedback,
-                score=critique.score,
+                advice=advice,
+                actions_48h=actions_48h,
+                metric_to_watch=metric_to_watch,
+                risks=risks,
+                critique=feedback,
+                score=score,
             )
 
-            logger.info(f"Advice generated successfully, score: {critique.score}")
+            logger.info(f"Advice generated successfully, score: {score}")
             return result
 
         except Exception as e:
```
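
Review note: the new loop is a standard best-of-N rerank: sample up to best_of_n drafts, score each with the critic, keep the argmax. Below is a minimal sketch of the pattern in isolation; the generate and score callables are hypothetical stand-ins for self.generate and self.critic, not the project's actual signatures.

```python
from typing import Callable, Optional, Tuple

def best_of_n_rerank(
    generate: Callable[[], str],  # hypothetical: draws one candidate draft
    score: Callable[[str], int],  # hypothetical: critic score, higher is better
    n: int = 1,
) -> Tuple[Optional[str], int]:
    """Sample up to n candidates and keep the one the critic scores highest."""
    best, best_score = None, -1
    for _ in range(max(1, n)):  # guard against n < 1, as the diff does
        candidate = generate()
        s = score(candidate)
        if s > best_score:
            best, best_score = candidate, s
    return best, best_score
```

With best_of_n=1 this reduces to a single draft, so the old behavior remains the default. Repeated samples only differ when the LM temperature is above zero, which is presumably why the commit adds a temperature setting alongside best_of_n.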

orbit_agent/cli.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -540,6 +540,20 @@ def eval_report(
         console.print("[bold]Summary[/]:")
         for k, v in summary.items():
             console.print(f"- {k}: {v}")
+        # Print per-scenario brief
+        console.print("\n[bold]By Scenario[/]:")
+        from collections import defaultdict
+
+        by_id = defaultdict(list)
+        for r in rec_objs:
+            by_id[r.scenario_id].append(r)
+        for sid, items in by_id.items():
+            cs = sum(i.critic_score for i in items) / len(items)
+            ov = sum((i.overlap_ratio or 0.0) for i in items) / len(items)
+            lat = sum(i.latency_ms for i in items) / len(items)
+            console.print(
+                f"- {sid}: score={cs:.2f}, overlap={ov:.2f}, latency_ms={lat:.0f}"
+            )
     except Exception as e:
         logger.error(f"Eval report failed: {e}")
         console.print(f"[bold red]Error:[/bold red] {e}")
```
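
The per-scenario brief is a plain group-by-then-average over the eval records. The same aggregation as a standalone, testable function; the Record dataclass here is a hypothetical stand-in for EvalRecord:

```python
from collections import defaultdict
from dataclasses import dataclass

@dataclass
class Record:  # hypothetical stand-in for EvalRecord
    scenario_id: str
    critic_score: int
    latency_ms: float
    overlap_ratio: float | None = None

def by_scenario(records: list[Record]) -> dict[str, dict[str, float]]:
    """Group records by scenario_id, then average each metric within a group."""
    groups: defaultdict[str, list[Record]] = defaultdict(list)
    for r in records:
        groups[r.scenario_id].append(r)
    return {
        sid: {
            "score": sum(r.critic_score for r in items) / len(items),
            "overlap": sum((r.overlap_ratio or 0.0) for r in items) / len(items),
            "latency_ms": sum(r.latency_ms for r in items) / len(items),
        }
        for sid, items in groups.items()
    }
```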

orbit_agent/config.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -15,7 +15,7 @@
 logger = logging.getLogger(__name__)
 
 # Default model configurations
-DEFAULT_OPENAI = "openai/gpt-4o-mini"
+DEFAULT_OPENAI = "openai/gpt-4.1"
 DEFAULT_ANTHROPIC = "anthropic/claude-3-5-sonnet-20240620"
 DEFAULT_OLLAMA = "ollama_chat/llama3.2"
 
@@ -74,6 +74,9 @@ class AppConfig:
     track_usage: bool = False
     cost_per_1k_prompt: float = 0.0
     cost_per_1k_completion: float = 0.0
+    # Generation quality controls
+    best_of_n: int = 1
+    temperature: float = 0.7
 
     def __post_init__(self):
         """Validate configuration after initialization"""
@@ -179,6 +182,8 @@ def load_config() -> AppConfig:
         track_usage=os.getenv("ORBIT_TRACK_USAGE", "false").lower() == "true",
         cost_per_1k_prompt=float(os.getenv("ORBIT_COST_PER_1K_PROMPT", "0")),
         cost_per_1k_completion=float(os.getenv("ORBIT_COST_PER_1K_COMPLETION", "0")),
+        best_of_n=int(os.getenv("ORBIT_BEST_OF_N", "1")),
+        temperature=float(os.getenv("ORBIT_TEMPERATURE", "0.7")),
     )
 
     return config
```
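
Both new knobs follow the existing env-var pattern in load_config(). A sketch of how they would be exercised, assuming the orbit_agent package is importable; the values are examples only:

```python
import os

# Example values only; the variable names come from load_config() above.
os.environ["ORBIT_BEST_OF_N"] = "3"      # sample three drafts per request
os.environ["ORBIT_TEMPERATURE"] = "0.9"  # >0 so repeated samples actually differ

from orbit_agent.config import load_config

cfg = load_config()
assert cfg.best_of_n == 3
assert abs(cfg.temperature - 0.9) < 1e-9
```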

orbit_agent/evals.py

Lines changed: 25 additions & 0 deletions
```diff
@@ -37,6 +37,7 @@ class EvalRecord:
     format_ok: bool
     actions_count: int
     risks_count: int
+    overlap_ratio: float | None = None
 
 
 def load_scenarios(path: str | Path) -> List[Scenario]:
@@ -84,6 +85,25 @@ def _format_eval(
     return format_ok, actions_count, risks_count
 
 
+def _ngram_set(text: str, n: int = 3) -> set[str]:
+    tokens = [t for t in text.lower().split() if t]
+    return set(
+        [" ".join(tokens[i : i + n]) for i in range(0, max(0, len(tokens) - n + 1))]
+    )
+
+
+def _overlap_ratio(a: str, b: str, n: int = 3) -> float:
+    if not a or not b:
+        return 0.0
+    A = _ngram_set(a, n)
+    B = _ngram_set(b, n)
+    if not A or not B:
+        return 0.0
+    inter = len(A & B)
+    union = len(A | B)
+    return inter / union
+
+
 def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
     # Ensure LM is configured for online evals
     configure_lm()
@@ -106,6 +126,8 @@ def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
             res.advice or "", actions_lines, risks_lines
         )
 
+        overlap = _overlap_ratio(res.advice or "", playbook) if playbook else 0.0
+
         records.append(
             EvalRecord(
                 scenario_id=sc.id,
@@ -121,6 +143,7 @@ def run_evals(scenarios: List[Scenario]) -> List[EvalRecord]:
                 format_ok=format_ok,
                 actions_count=a_count,
                 risks_count=r_count,
+                overlap_ratio=overlap,
             )
         )
 
@@ -134,11 +157,13 @@ def summarize_results(records: List[EvalRecord]) -> Dict[str, Any]:
     fmt_ok = sum(1 for r in records if r.format_ok)
     avg_score = sum(r.critic_score for r in records) / n
     avg_latency = sum(r.latency_ms for r in records) / n
+    avg_overlap = sum((r.overlap_ratio or 0.0) for r in records) / n
     return {
         "count": n,
         "format_ok_rate": fmt_ok / n,
         "avg_critic_score": avg_score,
         "avg_latency_ms": avg_latency,
+        "avg_playbook_overlap": avg_overlap,
     }
 
 
```
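
The new metric is Jaccard similarity over word trigrams: the size of the intersection of the two shingle sets divided by the size of their union, 1.0 for identical text and 0.0 when either side is empty. A worked check of the arithmetic (a standalone reimplementation; the numbers follow from the definition):

```python
def ngram_set(text: str, n: int = 3) -> set[str]:
    # Mirrors _ngram_set: lowercase, whitespace-split, n-word shingles.
    tokens = text.lower().split()
    return {" ".join(tokens[i : i + n]) for i in range(max(0, len(tokens) - n + 1))}

def overlap_ratio(a: str, b: str, n: int = 3) -> float:
    # Jaccard over the two shingle sets, 0.0 when either side is empty.
    A, B = ngram_set(a, n), ngram_set(b, n)
    if not A or not B:
        return 0.0
    return len(A & B) / len(A | B)

advice = "ship a fix today and watch churn"
playbook = "ship a fix today then monitor retention"
# Shared trigrams: "ship a fix", "a fix today"  -> intersection = 2
# Each side has 5 trigrams                      -> union = 5 + 5 - 2 = 8
print(overlap_ratio(advice, playbook))  # 0.25
```

One caveat for review: the measure is symmetric and length-sensitive, so long advice against a short playbook keeps the ratio low even when every playbook phrase is echoed; a containment variant (intersection over the playbook's shingle count) would rank such cases higher.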