browserbase · tkattkat · Aug 10, 2025 · Aug 11, 2025 · Aug 12, 2025 · Aug 12, 2025
diff --git a/.changeset/rich-colts-march.md b/.changeset/rich-colts-march.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Add more evals for stagehand agent
diff --git a/evals/evals.config.json b/evals/evals.config.json
@@ -323,6 +323,34 @@
       "name": "agent/google_flights",
       "categories": ["agent"]
     },
+    {
+      "name": "agent/github_react_version",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/steam_games",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/ubereats",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/kith",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/apple_tv",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/apple_trade_in",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/arxiv_gpt_report",
+      "categories": ["agent"]
+    },
     {
       "name": "agent/sf_library_card",
       "categories": ["agent"]
@@ -331,6 +359,14 @@
       "name": "agent/sf_library_card_multiple",
       "categories": ["agent"]
     },
+    {
+      "name": "agent/hugging_face",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_maps_3",
+      "categories": ["agent"]
+    },
     {
       "name": "login",
       "categories": ["act", "regression"]
@@ -423,5 +459,26 @@
       "name": "namespace_xpath",
       "categories": ["act"]
     }
+    ,
+    {
+      "name": "agent/nba_trades",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/hotel_booking",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/github",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/all_recipes",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_shopping",
+      "categories": ["agent"]
+    }
   ]
 }
diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts
@@ -18,6 +18,7 @@ import {
 } from "@browserbasehq/stagehand";
 import { EvalLogger } from "./logger";
 import type { StagehandInitResult } from "@/types/evals";
+import type { AgentConfig } from "@/dist";
 import { AvailableModel } from "@browserbasehq/stagehand";
 
 /**
@@ -104,12 +105,26 @@ export const initStagehand = async ({
   // Set navigation timeout to 60 seconds for evaluations
   stagehand.context.setDefaultNavigationTimeout(60_000);
 
+  const isCUAModel = (model: string): boolean =>
+    model.includes("computer-use-preview") || model.startsWith("claude");
+
+  let agentConfig: AgentConfig | undefined;
+  if (isCUAModel(modelName)) {
+    agentConfig = {
+      model: modelName,
+      provider: modelName.startsWith("claude") ? "anthropic" : "openai",
+    } as AgentConfig;
+  }
+
+  const agent = stagehand.agent(agentConfig);
+
   return {
     stagehand,
     stagehandConfig: config,
     logger,
     debugUrl,
     sessionUrl,
     modelName,
+    agent,
   };
 };
diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
@@ -106,7 +106,7 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
 
 const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
   ? process.env.EVAL_AGENT_MODELS.split(",")
-  : ["computer-use-preview-2025-03-11", "claude-3-7-sonnet-latest"];
+  : ["computer-use-preview-2025-03-11", "claude-sonnet-4-20250514"];
 
 /**
  * getModelList:

diff --git a/evals/tasks/agent/all_recipes.ts b/evals/tasks/agent/all_recipes.ts
@@ -0,0 +1,61 @@
+import { Evaluator } from "@/evals/evaluator";
+import { EvalFunction } from "@/types/evals";
+
+/**
+ * Agent eval: search Allrecipes for a highly rated Beef Wellington recipe.
+ *
+ * Starts on the Allrecipes home page, hands the task to the agent, then asks
+ * the evaluator whether a Beef Wellington recipe was found. The eval passes
+ * only when the agent reports success, the evaluator answers "YES", and the
+ * page URL is exactly the expected recipe URL.
+ * @returns `_success` plus debug/session URLs and logs; failures also carry a
+ *   `message` (evaluator reasoning, or the caught error's text).
+ */
+export const all_recipes: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.allrecipes.com/");
+    const evaluator = new Evaluator(stagehand);
+    const agentResult = await agent.execute({
+      instruction:
+        "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
+      maxSteps: 20,
+    });
+
+    const { evaluation, reasoning } = await evaluator.evaluate({
+      question: "Did the agent find a recipe for Beef Wellington",
+    });
+
+    logger.log(agentResult);
+
+    const success =
+      agentResult.success &&
+      evaluation === "YES" &&
+      stagehand.page.url() ===
+        "https://www.allrecipes.com/recipe/16899/beef-wellington/";
+
+    const report = { debugUrl, sessionUrl, logs: logger.getLogs() };
+    return success
+      ? { _success: true, ...report }
+      : { _success: false, message: reasoning, ...report };
+  } catch (error) {
+    return {
+      _success: false,
+      // Thrown values are not guaranteed to be Error instances.
+      message: error instanceof Error ? error.message : String(error),
+      // The raw error is kept alongside the readable message.
+      error,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    // Always close Stagehand, pass or fail.
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/apple_trade_in.ts b/evals/tasks/agent/apple_trade_in.ts
@@ -0,0 +1,60 @@
+// This eval is expected to fail due to issues scrolling within the trade-in dialog.
+import { EvalFunction } from "@/types/evals";
+import { z } from "zod";
+
+/**
+ * Agent eval: look up the iPhone 13 Pro Max (good condition) trade-in value.
+ *
+ * After the agent runs, the value is extracted from the page and the eval
+ * passes only when the agent reports success, the extracted value is 360,
+ * and the page URL still contains the Apple trade-in path.
+ *
+ * @returns `_success` plus debug/session URLs and logs; failures also carry a
+ *   `message` (the agent's message, or the caught error's text).
+ */
+export const apple_trade_in: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.apple.com/shop/trade-in");
+    const agentResult = await agent.execute({
+      instruction:
+        "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
+      maxSteps: 30,
+    });
+
+    const { tradeInValue } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction:
+        "Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. it will be inside this text: 'Get x trade-in credit toward a new iPhone', provide just the number",
+      schema: z.object({
+        tradeInValue: z.number(),
+      }),
+    });
+
+    const success =
+      agentResult.success &&
+      tradeInValue === 360 &&
+      stagehand.page.url().includes("https://www.apple.com/shop/trade-in");
+
+    const report = { debugUrl, sessionUrl, logs: logger.getLogs() };
+    return success
+      ? { _success: true, ...report }
+      : { _success: false, message: agentResult.message, ...report };
+  } catch (error) {
+    return {
+      _success: false,
+      // Thrown values are not guaranteed to be Error instances.
+      message: error instanceof Error ? error.message : String(error),
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/apple_tv.ts b/evals/tasks/agent/apple_tv.ts
@@ -0,0 +1,61 @@
+import { EvalFunction } from "@/types/evals";
+import { z } from "zod";
+
+/**
+ * Agent eval: find the Apple TV 4K dimensions on apple.com.
+ *
+ * After the agent runs, height and width are extracted from the page and the
+ * eval passes only when the agent reports success, the values are 1.2 and
+ * 3.66, and the page URL contains the Apple TV 4K specs path.
+ *
+ * @returns `_success` plus debug/session URLs and logs; failures also carry a
+ *   `message` (the agent's message, or the caught error's text).
+ */
+export const apple_tv: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://www.apple.com/");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
+      maxSteps: 30,
+    });
+
+    const { height, width } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction: "Extract the height and width of the Apple TV 4K",
+      schema: z.object({
+        height: z.number().describe("The height of the Apple TV 4K in inches"),
+        width: z.number().describe("The width of the Apple TV 4K in inches"),
+      }),
+    });
+
+    const success =
+      agentResult.success &&
+      height === 1.2 &&
+      width === 3.66 &&
+      stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/");
+
+    const report = { debugUrl, sessionUrl, logs: logger.getLogs() };
+    return success
+      ? { _success: true, ...report }
+      : { _success: false, message: agentResult.message, ...report };
+  } catch (error) {
+    return {
+      _success: false,
+      // Thrown values are not guaranteed to be Error instances.
+      message: error instanceof Error ? error.message : String(error),
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/evals/tasks/agent/arxiv_gpt_report.ts b/evals/tasks/agent/arxiv_gpt_report.ts
@@ -0,0 +1,60 @@
+// The agent often fails on this eval.
+import { EvalFunction } from "@/types/evals";
+import { z } from "zod";
+/**
+ * Agent eval: find when v3 of the "GPT-4 Technical Report" was submitted.
+ *
+ * After the agent runs, the v3 date is extracted from the page and the eval
+ * passes only when the agent reports success and the date is "03-27-2023".
+ *
+ * @returns `_success` plus debug/session URLs and logs; failures also carry a
+ *   `message` (the agent's message, or the caught error's text).
+ */
+export const arxiv_gpt_report: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+  agent,
+}) => {
+  try {
+    await stagehand.page.goto("https://arxiv.org/");
+
+    const agentResult = await agent.execute({
+      instruction:
+        "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
+      maxSteps: 20,
+    });
+
+    // Expected answer: v3 was submitted Mon, 27 Mar 2023 17:46:54 UTC.
+    const { date } = await stagehand.page.extract({
+      modelName: "google/gemini-2.5-flash",
+      instruction:
+        "Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'",
+      schema: z.object({
+        date: z.string().describe("The date of the v3 submission history"),
+      }),
+    });
+
+    console.log(`date: ${date}`);
+
+    const success = agentResult.success && date === "03-27-2023";
+
+    const report = { debugUrl, sessionUrl, logs: logger.getLogs() };
+    return success
+      ? { _success: true, ...report }
+      : { _success: false, message: agentResult.message, ...report };
+  } catch (error) {
+    return {
+      _success: false,
+      // Thrown values are not guaranteed to be Error instances.
+      message: error instanceof Error ? error.message : String(error),
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  } finally {
+    // Always close Stagehand, pass or fail.
+    await stagehand.close();
+  }
+};