Changes from all commits (47 commits)
13b0603  initial commit (tkattkat, Sep 22, 2025)
c097fba  update agent types (tkattkat, Sep 22, 2025)
ae974a5  update logger type (tkattkat, Sep 22, 2025)
4b26bf1  update log levels (tkattkat, Sep 22, 2025)
d6434d6  remove logger helper, and use inline (tkattkat, Sep 22, 2025)
a4b277e  extract changes (tkattkat, Sep 22, 2025)
0f376a2  remove unnecessary return values from tools (tkattkat, Sep 22, 2025)
5801b85  clean up action handler types (tkattkat, Sep 22, 2025)
9ad0e6d  remove aria tree caching (tkattkat, Sep 22, 2025)
5dc0d1d  move system prompt to prompt.ts (tkattkat, Sep 22, 2025)
edce0cc  Merge remote-tracking branch 'origin/main' into stagehand-agent-impro… (tkattkat, Sep 22, 2025)
b2742dd  remove unnecessary type casting (tkattkat, Sep 22, 2025)
ab1ec2d  remove more unnecessary type casting (tkattkat, Sep 22, 2025)
01a7de3  update aria tool (tkattkat, Sep 23, 2025)
47de9ea  update hasSearch (tkattkat, Sep 23, 2025)
2a5a5b6  improve tool return values (tkattkat, Sep 23, 2025)
4ff715b  remove unnecessary params (tkattkat, Sep 23, 2025)
4760b07  add store actions flag (tkattkat, Sep 23, 2025)
44be5e3  Merge remote-tracking branch 'origin/main' into stagehand-agent-impro… (tkattkat, Sep 23, 2025)
a8eddcc  merge changes (tkattkat, Sep 23, 2025)
61cede9  update goto (tkattkat, Sep 23, 2025)
a76407d  temp remove store actions flag (tkattkat, Sep 23, 2025)
c321896  add execution model to eval runner (tkattkat, Sep 23, 2025)
8a33ebc  add store actions flag (tkattkat, Sep 23, 2025)
843b60d  update prompt (tkattkat, Sep 23, 2025)
793c3d1  replace any types with proper typing (tkattkat, Sep 23, 2025)
6754ead  remove execution model (tkattkat, Sep 23, 2025)
4fd7bca  add exa dependency (tkattkat, Sep 23, 2025)
8bcf453  update docs (tkattkat, Sep 24, 2025)
4b3239d  changeset (tkattkat, Sep 24, 2025)
887e5d5  update prompt on drag and drop (tkattkat, Sep 24, 2025)
7fbbbf6  temp disable tool filtering (tkattkat, Sep 24, 2025)
966d92b  temp disable model routing (tkattkat, Sep 24, 2025)
529f226  add back model routing (tkattkat, Sep 24, 2025)
f8fdd5c  add move (tkattkat, Sep 24, 2025)
0125390  prompt changes (tkattkat, Sep 24, 2025)
a7bf3a7  update prompts (tkattkat, Sep 24, 2025)
97cba02  adjust prompt (tkattkat, Sep 25, 2025)
eb18df9  increase checkpoint interval (tkattkat, Sep 25, 2025)
937b378  update prompt (tkattkat, Sep 25, 2025)
ab941b1  update prompt (tkattkat, Sep 25, 2025)
a77eeea  change scroll to be percentage based (tkattkat, Sep 26, 2025)
d1821cb  adjust prompt (tkattkat, Sep 26, 2025)
ff0d942  update screenshot tool (tkattkat, Sep 26, 2025)
919eb3b  custom tool schemas based on model (tkattkat, Sep 29, 2025)
057bdca  update tool routing (tkattkat, Sep 29, 2025)
9a8ee76  update scroll (tkattkat, Sep 29, 2025)
5 changes: 5 additions & 0 deletions .changeset/six-oranges-report.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Enhanced the Stagehand agent with smart model routing, an expanded toolset, and robust context management. For more information, see the [Stagehand agent docs](https://docs.stagehand.dev/basics/agent)
30 changes: 29 additions & 1 deletion docs/basics/agent.mdx
@@ -62,13 +62,41 @@ await agent.execute("apply for a job at Browserbase")

Use the agent without specifying a provider to utilize any model or LLM provider:

<Note>Non CUA agents are currently only supported in TypeScript</Note>
<Note>Stagehand agent is currently only supported in TypeScript</Note>

```typescript TypeScript
// Basic usage
const agent = stagehand.agent();
await agent.execute("apply for a job at Browserbase")
```

#### Recommended Configuration

For optimal performance, we recommend Claude 4 Sonnet as the agent model with Gemini 2.5 Flash as the execution model:

```typescript TypeScript
const agent = stagehand.agent({
model: "anthropic/claude-4-20250514", // Reliable reasoning and planning for the agent
executionModel: "google/gemini-2.5-flash", // Fast and reliable execution for stagehand primitives (act, extract, observe)
instructions: "You are a helpful assistant that can use a web browser.",
});

// Enable Claude-specific optimizations for best performance
await agent.execute({
instruction: "apply for a job at Browserbase",
storeActions: false, // Unlocks claude-specific tools
maxSteps: 25
});
```

<Tip>
**Why this configuration?** Claude 4 provides excellent reasoning and planning, while Gemini 2.5 Flash offers fast execution for Stagehand primitives. Setting `storeActions: false` enables coordinate-based tools for a hybrid approach that combines Stagehand primitives with coordinate-based actions, but removes the ability to turn your agent runs into repeatable, deterministic scripts.
</Tip>

<Note>
All configuration options are optional. The agent works well with default settings, but the configuration above delivers the best performance.
</Note>


## MCP Integrations

3 changes: 2 additions & 1 deletion evals/tasks/agent/onlineMind2Web.ts
@@ -44,7 +44,8 @@ export const onlineMind2Web: EvalFunction = async ({
}

await stagehand.page.goto(params.website, {
timeout: 75_000,
timeout: 120_000,
waitUntil: "commit",
});

const provider =
146 changes: 146 additions & 0 deletions lib/agent/contextManager/checkpoints.ts
@@ -0,0 +1,146 @@
import { CoreAssistantMessage, CoreMessage } from "ai";
import { isToolCallPart, messagesToText } from ".";
import type { LLMClient } from "../../llm/LLMClient";
import { RECENT_MESSAGES_TO_KEEP_IN_SUMMARY } from "./constants";

export interface CheckpointPlan {
  messagesToCheckpoint: CoreMessage[];
  recentMessages: CoreMessage[];
  checkpointCount: number;
}

export function planCheckpoint(
  prompt: CoreMessage[],
  systemMsgIndex: number,
  toolCount: number,
  recentToolsToKeep: number,
  checkpointInterval: number,
): CheckpointPlan | null {
  if (toolCount < checkpointInterval) return null;

  const checkpointCount = Math.floor(toolCount / checkpointInterval);
  const toolsToKeep = toolCount - checkpointCount * checkpointInterval;
  const recentToolsStart = Math.max(
    0,
    toolCount - Math.max(recentToolsToKeep, toolsToKeep),
  );

  const messagesToCheckpoint: CoreMessage[] = [];
  const recentMessages: CoreMessage[] = [];
  let currentToolCount = 0;

  prompt.forEach((msg, idx) => {
    if (idx <= systemMsgIndex) return;
    const msgToolCount = countToolsInMessage(msg);
    if (currentToolCount < recentToolsStart) messagesToCheckpoint.push(msg);
    else recentMessages.push(msg);
    currentToolCount += msgToolCount;
  });

  if (messagesToCheckpoint.length === 0) return null;
  return { messagesToCheckpoint, recentMessages, checkpointCount };
}

export async function generateCheckpointSummary(
  messages: CoreMessage[],
  checkpointCount: number,
  llmClient: LLMClient,
): Promise<string> {
  const conversationText = messagesToText(messages);
  const model = llmClient.getLanguageModel?.();
  if (!model) {
    return `[Checkpoint Summary - ${checkpointCount} checkpoints]\n[Summary generation failed: LLM not available]`;
  }

  const { text } = await llmClient.generateText({
    model,
    messages: [
      {
        role: "user",
        content: `Create a concise checkpoint summary of this browser automation conversation segment.

Focus on:
1. What browser actions were performed
2. What was accomplished
3. Current state/context
4. Any errors or issues

Conversation segment:
${conversationText}

Provide a brief summary (max 200 words) that preserves essential context for continuing the automation task:`,
      },
    ],
    maxTokens: 300,
    temperature: 0.3,
  });

  return `[Checkpoint Summary - ${checkpointCount} checkpoints]\n${text}`;
}

export async function summarizeConversation(
  prompt: CoreMessage[],
  systemMsgIndex: number,
  llmClient: LLMClient,
): Promise<{
  summaryMessage: CoreAssistantMessage;
  recentMessages: CoreMessage[];
}> {
  const recentMessages = prompt.slice(-RECENT_MESSAGES_TO_KEEP_IN_SUMMARY);
  const summary = await generateConversationSummary(
    prompt.slice(systemMsgIndex + 1),
    llmClient,
  );
  const summaryMessage: CoreAssistantMessage = {
    role: "assistant",
    content: `[Previous Conversation Summary]\n\n${summary}\n\n[End of Summary - Continuing conversation from this point]`,
  };
  return { summaryMessage, recentMessages };
}

export async function generateConversationSummary(
  messages: CoreMessage[],
  llmClient: LLMClient,
): Promise<string> {
  const conversationText = messagesToText(messages);
  const model = llmClient.getLanguageModel?.();
  if (!model) return "[Summary generation failed: LLM not available]";

  const { text } = await llmClient.generateText({
    model,
    messages: [
      {
        role: "user",
        content: `Analyze this browser automation conversation and create a comprehensive summary that preserves all important context.

Conversation:
${conversationText}

Create a summary that:
1. Captures all key browser actions and their outcomes
2. Preserves important technical details
3. Maintains context about what was accomplished
4. Notes the current page/state
5. Includes any pending tasks or issues
6. Summarizes data extracted or forms filled

Provide a thorough summary that will allow continuation of the automation task:`,
      },
    ],
    maxTokens: 500,
    temperature: 0.3,
  });

  return text;
}

export function countToolsInMessage(msg: CoreMessage): number {
  if (msg.role === "tool") return 1;
  if (msg.role === "assistant" && typeof msg.content !== "string") {
    return msg.content.filter((part) => isToolCallPart(part)).length;
  }
  return 0;
}
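As a quick sanity check on the partitioning arithmetic in `planCheckpoint`, here is a self-contained sketch: a simplified `Msg` type stands in for `CoreMessage`, and the values mirror `CHECKPOINT_INTERVAL = 50` and `RECENT_TOOLS_TO_KEEP = 10` from `constants.ts` (this is an illustration, not the shipped code):

```typescript
// Minimal re-implementation of planCheckpoint's partitioning arithmetic.
// Simplified message type; the real code operates on CoreMessage from "ai".
type Msg = { id: number; toolCalls: number };

function splitForCheckpoint(
  messages: Msg[],
  recentToolsToKeep: number,
  checkpointInterval: number,
) {
  const toolCount = messages.reduce((n, m) => n + m.toolCalls, 0);
  if (toolCount < checkpointInterval) return null;

  const checkpointCount = Math.floor(toolCount / checkpointInterval);
  const toolsToKeep = toolCount - checkpointCount * checkpointInterval;
  const recentToolsStart = Math.max(
    0,
    toolCount - Math.max(recentToolsToKeep, toolsToKeep),
  );

  const toCheckpoint: Msg[] = [];
  const recent: Msg[] = [];
  let seen = 0;
  for (const m of messages) {
    // Messages before the recent-tools boundary get summarized away.
    if (seen < recentToolsStart) toCheckpoint.push(m);
    else recent.push(m);
    seen += m.toolCalls;
  }
  return { toCheckpoint, recent, checkpointCount };
}

// 60 one-tool messages with interval 50 and 10 recent tools kept:
const msgs = Array.from({ length: 60 }, (_, i) => ({ id: i, toolCalls: 1 }));
const plan = splitForCheckpoint(msgs, 10, 50)!;
console.log(plan.checkpointCount, plan.toCheckpoint.length, plan.recent.length);
// → 1 50 10
```

So one checkpoint summary replaces the first 50 tool-bearing messages while the 10 most recent stay verbatim, matching the `toolCount < checkpointInterval → null` early exit for short conversations.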
134 changes: 134 additions & 0 deletions lib/agent/contextManager/compression.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import { CoreMessage, ToolContent } from "ai";
import {
  compressToolResultContent,
  isImageContentPart,
  isToolResultContentPart,
} from ".";
import {
  DEFAULT_TRUNCATE_TEXT_OVER,
  SCREENSHOT_TEXT_PLACEHOLDER,
  TOOL_RESULT_AGE_MESSAGES_TO_CONSIDER_OLD,
  MAX_PREVIOUS_SAME_TOOL_RESULTS_TO_KEEP,
} from "./constants";
import { LogLevel } from "@/types/log";

export function compressToolResults(
  prompt: CoreMessage[],
  logger?: (message: string, level: LogLevel) => void,
): CoreMessage[] {
  const processed = [...prompt];
  const toolPositions = new Map<string, number[]>();
  let replacedOldToolResults = 0;
  let replacedOldScreenshots = 0;
  let replacedOldAriaTrees = 0;
  let imagesConvertedToText = 0;
  let truncatedLongToolResults = 0;

  // First pass: record, per tool, the message indices of its results.
  prompt.forEach((msg, idx) => {
    if (msg.role === "tool") {
      const toolMessage = msg;
      toolMessage.content.forEach((item) => {
        if (isToolResultContentPart(item)) {
          const positions = toolPositions.get(item.toolName) || [];
          positions.push(idx);
          toolPositions.set(item.toolName, positions);
        }
      });
    }
  });

  const mapped = processed.map((msg, idx) => {
    if (msg.role === "tool") {
      const toolMessage = msg;
      const processedContent: ToolContent = toolMessage.content.map((item) => {
        if (isToolResultContentPart(item)) {
          const positions = toolPositions.get(item.toolName) || [];
          const currentPos = positions.indexOf(idx);
          const isOldByAge =
            prompt.length - idx > TOOL_RESULT_AGE_MESSAGES_TO_CONSIDER_OLD;
          const isOldByCount =
            currentPos >= 0 &&
            positions.length - currentPos >
              MAX_PREVIOUS_SAME_TOOL_RESULTS_TO_KEEP;
          const isOld = isOldByAge || isOldByCount;
          if (isOld) {
            if (item.toolName === "screenshot") {
              replacedOldToolResults++;
              replacedOldScreenshots++;
              logger?.(
                `[compression] Replaced old screenshot tool-result at message index ${idx} (reason: ${[
                  isOldByAge ? "age" : "",
                  isOldByCount ? "prior-results" : "",
                ]
                  .filter(Boolean)
                  .join("+")})`,
                2,
              );
              return {
                type: "tool-result",
                toolCallId: item.toolCallId,
                toolName: item.toolName,
                result: "Screenshot taken",
              };
            } else if (item.toolName === "ariaTree") {
              replacedOldToolResults++;
              replacedOldAriaTrees++;
              logger?.(
                `[compression] Compressed old ariaTree tool-result at message index ${idx} (reason: ${[
                  isOldByAge ? "age" : "",
                  isOldByCount ? "prior-results" : "",
                ]
                  .filter(Boolean)
                  .join("+")})`,
                2,
              );
              return {
                type: "tool-result",
                toolCallId: item.toolCallId,
                toolName: item.toolName,
                result: {
                  success: true,
                  content: "Aria tree retrieved (compressed)",
                },
              };
            }
          }
        }
        // Convert screenshot image content to text
        if (isImageContentPart(item)) {
          imagesConvertedToText++;
          return {
            type: "text",
            text: SCREENSHOT_TEXT_PLACEHOLDER,
          } as unknown as ToolContent[number];
        }

        if (isToolResultContentPart(item)) {
          const compressed = compressToolResultContent(item, {
            truncateTextOver: DEFAULT_TRUNCATE_TEXT_OVER,
          });
          if (compressed !== item) truncatedLongToolResults++;
          return compressed;
        }

        return item;
      });

      return { ...toolMessage, content: processedContent };
    }
    return msg;
  });

  if (
    replacedOldToolResults > 0 ||
    imagesConvertedToText > 0 ||
    truncatedLongToolResults > 0
  ) {
    logger?.(
      `[compression] Summary: replaced old tool-results=${replacedOldToolResults} (screenshots=${replacedOldScreenshots}, ariaTree=${replacedOldAriaTrees}); images→text=${imagesConvertedToText}; truncated long tool results=${truncatedLongToolResults}`,
      2,
    );
  }

  return mapped;
}
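The staleness test inside `compressToolResults` combines an age check with a same-tool-count check. A standalone sketch of just that predicate (hypothetical helper name; the constants mirror `constants.ts`):

```typescript
// Sketch of the two staleness conditions used when deciding whether to
// compress a tool result. positions: message indices at which this tool
// produced results; idx: index of the result under examination.
const TOOL_RESULT_AGE_MESSAGES_TO_CONSIDER_OLD = 7;
const MAX_PREVIOUS_SAME_TOOL_RESULTS_TO_KEEP = 2;

function isOldToolResult(
  positions: number[],
  idx: number,
  promptLength: number,
): boolean {
  const currentPos = positions.indexOf(idx);
  // Old by age: the result sits more than 7 messages back in the prompt.
  const isOldByAge =
    promptLength - idx > TOOL_RESULT_AGE_MESSAGES_TO_CONSIDER_OLD;
  // Old by count: more than 2 newer results exist for the same tool.
  const isOldByCount =
    currentPos >= 0 &&
    positions.length - currentPos > MAX_PREVIOUS_SAME_TOOL_RESULTS_TO_KEEP;
  return isOldByAge || isOldByCount;
}

// A lone screenshot at index 2 of a 20-message prompt is old by age:
console.log(isOldToolResult([2], 2, 20)); // → true
// The first of three recent screenshots is old by count:
console.log(isOldToolResult([14, 16, 18], 14, 20)); // → true
// The latest of those three is kept:
console.log(isOldToolResult([14, 16, 18], 18, 20)); // → false
```

Either condition alone triggers replacement with a lightweight placeholder, so only the two most recent results per tool, within the last seven messages, survive uncompressed.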
23 changes: 23 additions & 0 deletions lib/agent/contextManager/constants.ts
@@ -0,0 +1,23 @@
// Compression thresholds
export const TOOL_RESULT_AGE_MESSAGES_TO_CONSIDER_OLD = 7;
export const MAX_PREVIOUS_SAME_TOOL_RESULTS_TO_KEEP = 2;
export const DEFAULT_TRUNCATE_TEXT_OVER = 4000;

// Token estimation defaults
export const DEFAULT_TOKENS_PER_IMAGE = 2000;
export const DEFAULT_TOKENS_PER_TOOL_CALL = 50;
export const DEFAULT_TOKENS_FOR_UNKNOWN_TOOL_CONTENT = 200;

// Summaries and previews
export const ARIA_TREE_PREVIEW_CHARS = 100;
export const GENERIC_RESULT_PREVIEW_CHARS = 50;
export const RECENT_MESSAGES_TO_KEEP_IN_SUMMARY = 10;

// Text placeholders
export const SCREENSHOT_TEXT_PLACEHOLDER = "[screenshot]";
export const IMAGE_TEXT_PLACEHOLDER = "[image]";

// Context manager thresholds
export const CHECKPOINT_INTERVAL = 50;
export const RECENT_TOOLS_TO_KEEP = 10;
export const SUMMARIZATION_THRESHOLD = 120000;
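For intuition on the last two thresholds, here is an illustrative sketch of how a context manager might consult them. The wiring below is an assumption (the consuming code is not part of this diff), though the `>=` gate matches the `toolCount < checkpointInterval → null` early exit in `planCheckpoint`, and a strict `>` is assumed for the token cap:

```typescript
// Illustrative only: hypothetical gates built on the constants above.
const CHECKPOINT_INTERVAL = 50;
const SUMMARIZATION_THRESHOLD = 120000;

// Checkpointing fires once the tool-call count reaches the interval.
const shouldCheckpoint = (toolCount: number): boolean =>
  toolCount >= CHECKPOINT_INTERVAL;

// Full summarization fires once estimated tokens exceed the cap.
const shouldSummarize = (estimatedTokens: number): boolean =>
  estimatedTokens > SUMMARIZATION_THRESHOLD;

console.log(shouldCheckpoint(49), shouldCheckpoint(50)); // → false true
console.log(shouldSummarize(120000), shouldSummarize(120001)); // → false true
```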