Skip to content

Add more agent evals #961

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/rich-colts-march.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Add more evals for stagehand agent
57 changes: 57 additions & 0 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,34 @@
"name": "agent/google_flights",
"categories": ["agent"]
},
{
"name": "agent/github_react_version",
"categories": ["agent"]
},
{
"name": "agent/steam_games",
"categories": ["agent"]
},
{
"name": "agent/ubereats",
"categories": ["agent"]
},
{
"name": "agent/kith",
"categories": ["agent"]
},
{
"name": "agent/apple_tv",
"categories": ["agent"]
},
{
"name": "agent/apple_trade_in",
"categories": ["agent"]
},
{
"name": "agent/arxiv_gpt_report",
"categories": ["agent"]
},
{
"name": "agent/sf_library_card",
"categories": ["agent"]
Expand All @@ -331,6 +359,14 @@
"name": "agent/sf_library_card_multiple",
"categories": ["agent"]
},
{
"name": "agent/hugging_face",
"categories": ["agent"]
},
{
"name": "agent/google_maps_3",
"categories": ["agent"]
},
{
"name": "login",
"categories": ["act", "regression"]
Expand Down Expand Up @@ -423,5 +459,26 @@
"name": "namespace_xpath",
"categories": ["act"]
}
,
{
"name": "agent/nba_trades",
"categories": ["agent"]
},
{
"name": "agent/hotel_booking",
"categories": ["agent"]
},
{
"name": "agent/github",
"categories": ["agent"]
},
{
"name": "agent/all_recipes",
"categories": ["agent"]
},
{
"name": "agent/google_shopping",
"categories": ["agent"]
}
]
}
15 changes: 15 additions & 0 deletions evals/initStagehand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {
} from "@browserbasehq/stagehand";
import { EvalLogger } from "./logger";
import type { StagehandInitResult } from "@/types/evals";
import type { AgentConfig } from "@/dist";
import { AvailableModel } from "@browserbasehq/stagehand";

/**
Expand Down Expand Up @@ -104,12 +105,26 @@ export const initStagehand = async ({
// Set navigation timeout to 60 seconds for evaluations
stagehand.context.setDefaultNavigationTimeout(60_000);

/**
 * Reports whether a model name should be driven through a computer-use agent
 * configuration: any name starting with "claude", or any name containing
 * "computer-use-preview".
 */
const isCUAModel = (model: string): boolean => {
  if (model.startsWith("claude")) {
    return true;
  }
  return model.includes("computer-use-preview");
};

let agentConfig: AgentConfig | undefined;
if (isCUAModel(modelName)) {
agentConfig = {
model: modelName,
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
} as AgentConfig;
}

const agent = stagehand.agent(agentConfig);

return {
stagehand,
stagehandConfig: config,
logger,
debugUrl,
sessionUrl,
modelName,
agent,
};
};
2 changes: 1 addition & 1 deletion evals/taskConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS

const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
? process.env.EVAL_AGENT_MODELS.split(",")
: ["computer-use-preview-2025-03-11", "claude-3-7-sonnet-latest"];
: ["computer-use-preview-2025-03-11", "claude-sonnet-4-20250514"];

/**
* getModelList:
Expand Down
61 changes: 61 additions & 0 deletions evals/tasks/agent/all_recipes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { Evaluator } from "@/evals/evaluator";
import { EvalFunction } from "@/types/evals";

/**
 * Agent eval: find a Beef Wellington recipe on Allrecipes.
 *
 * Opens the Allrecipes home page, runs the agent with the search instruction
 * (at most 20 steps), then asks an `Evaluator` whether the agent found a Beef
 * Wellington recipe. The eval passes only when the agent reports success, the
 * evaluator answers "YES", and the final page URL is exactly the expected
 * recipe URL.
 *
 * @returns A result object carrying `_success`, `debugUrl`, `sessionUrl` and
 * `logs`. A failed check also carries `message` (the evaluator's reasoning);
 * a thrown error also carries the raw `error` and a string `message`.
 */
export const all_recipes: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.allrecipes.com/");
    const evaluator = new Evaluator(stagehand);
    const agentResult = await agent.execute({
      instruction:
        "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
      maxSteps: 20,
    });

    const { evaluation, reasoning } = await evaluator.evaluate({
      question: "Did the agent find a recipe for Beef Wellington",
    });

    logger.log(agentResult);

    // All three signals must agree: agent status, evaluator verdict, final URL.
    const success =
      agentResult.success &&
      evaluation === "YES" &&
      stagehand.page.url() ===
        "https://www.allrecipes.com/recipe/16899/beef-wellington/";

    if (!success) {
      return {
        _success: false,
        message: reasoning,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // Anything can be thrown, so narrow before reading `.message`. The raw
    // value is still returned under `error` for callers that inspect it.
    return {
      _success: false,
      error,
      message: error instanceof Error ? error.message : String(error),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path so the browser session is always released.
    await stagehand.close();
  }
};
60 changes: 60 additions & 0 deletions evals/tasks/agent/apple_trade_in.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// This eval is expected to fail due to issues scrolling within the trade-in dialog.
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Agent eval: look up the iPhone 13 Pro Max trade-in value on apple.com.
 *
 * Opens the Apple trade-in page, runs the agent with the lookup instruction
 * (at most 30 steps), then extracts the displayed trade-in value from the page.
 * The eval passes only when the agent reports success, the extracted value
 * equals the expected amount, and the page URL is still under the trade-in
 * path.
 *
 * @returns A result object carrying `_success`, `debugUrl`, `sessionUrl` and
 * `logs`, plus `message` on failure (the agent's message, or the description
 * of the thrown value).
 */
export const apple_trade_in: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/shop/trade-in");
    const agentResult = await agent.execute({
      instruction:
        "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
      maxSteps: 30,
    });

    const { tradeInValue } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction:
        "Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. it will be inside this text : Get x trade-in credit toward a new iPhone', provide just the number",
      schema: z.object({
        tradeInValue: z.number(),
      }),
    });

    // NOTE(review): 360 is a hard-coded expected value; presumably it must be
    // updated whenever the site's quoted credit changes — confirm.
    const success =
      agentResult.success &&
      tradeInValue === 360 &&
      stagehand.page.url().includes("https://www.apple.com/shop/trade-in");

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // Anything can be thrown (including strings or null), so narrow before
    // reading `.message`; otherwise the catch itself could throw and the eval
    // would reject instead of reporting a failure.
    return {
      _success: false,
      message: error instanceof Error ? error.message : String(error),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path so the browser session is always released.
    await stagehand.close();
  }
};
61 changes: 61 additions & 0 deletions evals/tasks/agent/apple_tv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { EvalFunction } from "@/types/evals";
import { z } from "zod";

/**
 * Agent eval: find the Apple TV 4K dimensions on apple.com.
 *
 * Opens the Apple home page, runs the agent with the lookup instruction (at
 * most 30 steps), then extracts the height and width from the resulting page.
 * The eval passes only when the agent reports success, the extracted height
 * and width equal the expected values, and the page URL contains the Apple TV
 * 4K specs path.
 *
 * @returns A result object carrying `_success`, `debugUrl`, `sessionUrl` and
 * `logs`, plus `message` on failure (the agent's message, or the description
 * of the thrown value).
 */
export const apple_tv: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/");

    const agentResult = await agent.execute({
      instruction:
        "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
      maxSteps: 30,
    });

    // NOTE(review): the instruction below mentions "size and weight" but the
    // schema only requests height and width — confirm this is intended.
    const { height, width } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction: "Extract the size and weight of the Apple TV 4K",
      schema: z.object({
        height: z.number().describe("The height of the Apple TV 4K in inches"),
        width: z.number().describe("The width of the Apple TV 4K in inches"),
      }),
    });

    const success =
      agentResult.success &&
      height === 1.2 &&
      width === 3.66 &&
      stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/");

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // Anything can be thrown (including strings or null), so narrow before
    // reading `.message`; otherwise the catch itself could throw and the eval
    // would reject instead of reporting a failure.
    return {
      _success: false,
      message: error instanceof Error ? error.message : String(error),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path so the browser session is always released.
    await stagehand.close();
  }
};
60 changes: 60 additions & 0 deletions evals/tasks/agent/arxiv_gpt_report.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// The agent often fails on this one.
import { EvalFunction } from "@/types/evals";
import { z } from "zod";
/**
 * Agent eval: find when v3 of the "GPT-4 Technical Report" was submitted to
 * arXiv.
 *
 * Opens the arXiv home page, runs the agent with the lookup instruction (at
 * most 20 steps), then extracts the v3 submission date from the resulting page
 * in `MM-DD-YYYY` form. The eval passes only when the agent reports success
 * and the extracted date equals the expected one.
 *
 * @returns A result object carrying `_success`, `debugUrl`, `sessionUrl` and
 * `logs`, plus `message` on failure (the agent's message, or the description
 * of the thrown value).
 */
export const arxiv_gpt_report: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://arxiv.org/");

    const agentResult = await agent.execute({
      instruction:
        "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
      maxSteps: 20,
    });

    // Expected submission timestamp: Mon, 27 Mar 2023 17:46:54 UTC
    const { date } = await stagehand.page.extract({
      modelName: "google/gemini-2.5-flash",
      instruction:
        "Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'",
      schema: z.object({
        date: z.string().describe("The date of the v3 submission history"),
      }),
    });

    console.log(`date: ${date}`);

    const success = agentResult.success && date === "03-27-2023";

    if (!success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // Anything can be thrown (including strings or null), so narrow before
    // reading `.message`; otherwise the catch itself could throw and the eval
    // would reject instead of reporting a failure.
    return {
      _success: false,
      message: error instanceof Error ? error.message : String(error),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path so the browser session is always released.
    await stagehand.close();
  }
};
Loading