Skip to content
Merged
5 changes: 5 additions & 0 deletions .changeset/rich-colts-march.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Add more evals for stagehand agent
57 changes: 57 additions & 0 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,34 @@
"name": "agent/google_flights",
"categories": ["agent"]
},
{
"name": "agent/github_react_version",
"categories": ["agent"]
},
{
"name": "agent/steam_games",
"categories": ["agent"]
},
{
"name": "agent/ubereats",
"categories": ["agent"]
},
{
"name": "agent/kith",
"categories": ["agent"]
},
{
"name": "agent/apple_tv",
"categories": ["agent"]
},
{
"name": "agent/apple_trade_in",
"categories": ["agent"]
},
{
"name": "agent/arxiv_gpt_report",
"categories": ["agent"]
},
{
"name": "agent/sf_library_card",
"categories": ["agent"]
Expand All @@ -331,6 +359,14 @@
"name": "agent/sf_library_card_multiple",
"categories": ["agent"]
},
{
"name": "agent/hugging_face",
"categories": ["agent"]
},
{
"name": "agent/google_maps_3",
"categories": ["agent"]
},
{
"name": "login",
"categories": ["act", "regression"]
Expand Down Expand Up @@ -423,5 +459,26 @@
"name": "namespace_xpath",
"categories": ["act"]
}
,
{
"name": "agent/nba_trades",
"categories": ["agent"]
},
{
"name": "agent/hotel_booking",
"categories": ["agent"]
},
{
"name": "agent/github",
"categories": ["agent"]
},
{
"name": "agent/all_recipes",
"categories": ["agent"]
},
{
"name": "agent/google_shopping",
"categories": ["agent"]
}
]
}
15 changes: 15 additions & 0 deletions evals/initStagehand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {
} from "@browserbasehq/stagehand";
import { EvalLogger } from "./logger";
import type { StagehandInitResult } from "@/types/evals";
import type { AgentConfig } from "@/dist";
import { AvailableModel } from "@browserbasehq/stagehand";

/**
Expand Down Expand Up @@ -104,12 +105,26 @@ export const initStagehand = async ({
// Set navigation timeout to 60 seconds for evaluations
stagehand.context.setDefaultNavigationTimeout(60_000);

/**
 * Reports whether a model name is treated as a CUA model here: any name that
 * starts with "claude" or contains "computer-use-preview".
 */
const isCUAModel = (model: string): boolean => {
  if (model.startsWith("claude")) {
    return true;
  }
  return model.includes("computer-use-preview");
};

let agentConfig: AgentConfig | undefined;
if (isCUAModel(modelName)) {
agentConfig = {
model: modelName,
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
} as AgentConfig;
}

const agent = stagehand.agent(agentConfig);

return {
stagehand,
stagehandConfig: config,
logger,
debugUrl,
sessionUrl,
modelName,
agent,
};
};
2 changes: 1 addition & 1 deletion evals/taskConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS

const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
? process.env.EVAL_AGENT_MODELS.split(",")
: ["computer-use-preview-2025-03-11", "claude-3-7-sonnet-latest"];
: ["computer-use-preview-2025-03-11", "claude-sonnet-4-20250514"];

/**
* getModelList:
Expand Down
59 changes: 59 additions & 0 deletions evals/tasks/agent/all_recipes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import { Evaluator } from "@/evals/evaluator";
import { EvalFunction } from "@/types/evals";

/**
 * Agent eval: find a Beef Wellington recipe on Allrecipes.
 *
 * Opens https://www.allrecipes.com/, lets the agent work for up to 20 steps,
 * then asks an `Evaluator` whether a Beef Wellington recipe was found. The eval
 * passes only when the agent reports success, the evaluator answers "YES", and
 * the browser ended on the expected recipe URL.
 *
 * @returns A result record with `_success`, `debugUrl`, `sessionUrl` and
 *   `logs`; a failed check adds the evaluator's reasoning as `message`, and a
 *   thrown value is returned as `error`.
 */
export const all_recipes: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  // Every outcome carries the same session metadata plus a log snapshot taken
  // at the moment the outcome is produced.
  const report = <T extends { _success: boolean }>(outcome: T) => ({
    ...outcome,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  });

  try {
    await stagehand.page.goto("https://www.allrecipes.com/");
    const judge = new Evaluator(stagehand);

    const agentResult = await agent.execute({
      instruction:
        "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
      maxSteps: 20,
    });

    const { evaluation: verdict, reasoning: rationale } = await judge.evaluate({
      question: "Did the agent find a recipe for Beef Wellington",
    });

    logger.log(agentResult);

    // The URL is only read once the two cheaper checks have passed.
    const passed =
      agentResult.success &&
      verdict === "YES" &&
      stagehand.page.url() ===
        "https://www.allrecipes.com/recipe/16899/beef-wellington/";

    if (passed) {
      return report({ _success: true });
    }

    return report({ _success: false, message: rationale });
  } catch (error) {
    return report({ _success: false, error });
  } finally {
    await stagehand.close();
  }
};
60 changes: 60 additions & 0 deletions evals/tasks/agent/apple_trade_in.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// This eval is expected to fail due to issues scrolling within the trade-in dialog.
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Agent eval: look up the trade-in value of an iPhone 13 Pro Max in good
 * condition on Apple's trade-in page.
 *
 * Opens https://www.apple.com/shop/trade-in, lets the agent work for up to 30
 * steps, then extracts the trade-in value from the page. The eval passes only
 * when the agent reports success, the extracted value is exactly 360, and the
 * current URL still contains the trade-in address.
 *
 * @returns A result record with `_success`, `debugUrl`, `sessionUrl` and
 *   `logs`; failures add a `message` (the agent's message for a failed check,
 *   or a description of the thrown value when something throws).
 */
export const apple_trade_in: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/shop/trade-in");
    const agentResult = await agent.execute({
      instruction:
        "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
      maxSteps: 30,
    });

    // Read the value back from the page instead of trusting the agent's text.
    const { tradeInValue } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction:
        "Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. it will be inside this text : Get x trade-in credit toward a new iPhone', provide just the number",
      schema: z.object({
        tradeInValue: z.number(),
      }),
    });

    const success =
      agentResult.success &&
      tradeInValue === 360 &&
      stagehand.page.url().includes("https://www.apple.com/shop/trade-in");

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error: reading `.message` on
    // null/undefined would throw from inside this handler and reject the eval
    // instead of reporting a failure, so narrow before reading it.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Always release the browser session, whatever the outcome.
    await stagehand.close();
  }
};
61 changes: 61 additions & 0 deletions evals/tasks/agent/apple_tv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Agent eval: find the Apple TV 4K specifications starting from apple.com.
 *
 * Opens https://www.apple.com/, lets the agent work for up to 30 steps, then
 * extracts the device's height and width from the page. The eval passes only
 * when the agent reports success, the extracted height is 1.2 and width is
 * 3.66 (inches, per the schema descriptions), and the current URL contains the
 * Apple TV 4K specs address.
 *
 * Note: the agent instruction also asks for weight and Siri Remote features,
 * but only height, width and the final URL are verified here.
 *
 * @returns A result record with `_success`, `debugUrl`, `sessionUrl` and
 *   `logs`; failures add a `message` (the agent's message for a failed check,
 *   or a description of the thrown value when something throws).
 */
export const apple_tv: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/");

    const agentResult = await agent.execute({
      instruction:
        "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
      maxSteps: 30,
    });

    // Read the dimensions back from the page the agent ended on.
    const { height, width } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction: "Extract the size and weight of the Apple TV 4K",
      schema: z.object({
        height: z.number().describe("The height of the Apple TV 4K in inches"),
        width: z.number().describe("The width of the Apple TV 4K in inches"),
      }),
    });

    const success =
      agentResult.success &&
      height === 1.2 &&
      width === 3.66 &&
      stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/");

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error: reading `.message` on
    // null/undefined would throw from inside this handler and reject the eval
    // instead of reporting a failure, so narrow before reading it.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Always release the browser session, whatever the outcome.
    await stagehand.close();
  }
};
60 changes: 60 additions & 0 deletions evals/tasks/agent/arxiv_gpt_report.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// The agent often fails on this one.
import { EvalFunction } from "@/types/evals";
import { z } from "zod";
/**
 * Agent eval: find when v3 of the "GPT-4 Technical Report" was submitted to
 * arXiv.
 *
 * Opens https://arxiv.org/, lets the agent work for up to 20 steps, then
 * extracts the v3 submission date from the page as an 'MM-DD-YYYY' string. The
 * eval passes only when the agent reports success and the extracted date is
 * exactly "03-27-2023".
 *
 * @returns A result record with `_success`, `debugUrl`, `sessionUrl` and
 *   `logs`; failures add a `message` (the agent's message for a failed check,
 *   or a description of the thrown value when something throws).
 */
export const arxiv_gpt_report: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://arxiv.org/");

    const agentResult = await agent.execute({
      instruction:
        "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
      maxSteps: 20,
    });

    // Expected v3 submission timestamp: Mon, 27 Mar 2023 17:46:54 UTC
    const { date } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction:
        "Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'",
      schema: z.object({
        date: z.string().describe("The date of the v3 submission history"),
      }),
    });

    console.log(`date: ${date}`);

    const success = agentResult.success && date === "03-27-2023";

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error: reading `.message` on
    // null/undefined would throw from inside this handler and reject the eval
    // instead of reporting a failure, so narrow before reading it.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Always release the browser session, whatever the outcome.
    await stagehand.close();
  }
};
Loading