Skip to content
Merged
5 changes: 5 additions & 0 deletions .changeset/rich-colts-march.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Add more evals for stagehand agent
65 changes: 65 additions & 0 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,38 @@
"name": "agent/google_flights",
"categories": ["agent"]
},
{
"name": "agent/github_react_version",
"categories": ["agent"]
},
{
"name": "agent/steam_games",
"categories": ["agent"]
},
{
"name": "agent/ubereats",
"categories": ["agent"]
},
{
"name": "agent/kith",
"categories": ["agent"]
},
{
"name": "agent/apple_tv",
"categories": ["agent"]
},
{
"name": "agent/apple_trade_in",
"categories": ["agent"]
},
{
"name": "agent/arxiv_gpt_report",
"categories": ["agent"]
},
{
"name": "agent/wolframalpha_weight_loss",
"categories": ["agent"]
},
{
"name": "agent/sf_library_card",
"categories": ["agent"]
Expand All @@ -331,6 +363,14 @@
"name": "agent/sf_library_card_multiple",
"categories": ["agent"]
},
{
"name": "agent/hugging_face",
"categories": ["agent"]
},
{
"name": "agent/google_maps_3",
"categories": ["agent"]
},
{
"name": "login",
"categories": ["act", "regression"]
Expand Down Expand Up @@ -423,5 +463,30 @@
"name": "namespace_xpath",
"categories": ["act"]
}
,
{
"name": "agent/nba_trades",
"categories": ["agent"]
},
{
"name": "agent/hotel_booking",
"categories": ["agent"]
},
{
"name": "agent/github",
"categories": ["agent"]
},
{
"name": "agent/all_recipes",
"categories": ["agent"]
},
{
"name": "agent/amazon_shoes",
"categories": ["agent"]
},
{
"name": "agent/google_shopping",
"categories": ["agent"]
}
]
}
15 changes: 15 additions & 0 deletions evals/initStagehand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {
} from "@browserbasehq/stagehand";
import { EvalLogger } from "./logger";
import type { StagehandInitResult } from "@/types/evals";
import type { AgentConfig } from "@/dist";
import { AvailableModel } from "@browserbasehq/stagehand";

/**
Expand Down Expand Up @@ -104,12 +105,26 @@ export const initStagehand = async ({
// Set navigation timeout to 60 seconds for evaluations
stagehand.context.setDefaultNavigationTimeout(60_000);

const isCUAModel = (model: string): boolean =>
model.includes("computer-use-preview") || model.startsWith("claude");

let agentConfig: AgentConfig | undefined;
if (isCUAModel(modelName)) {
agentConfig = {
model: modelName,
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
} as AgentConfig;
}

const agent = stagehand.agent(agentConfig);

return {
stagehand,
stagehandConfig: config,
logger,
debugUrl,
sessionUrl,
modelName,
agent,
};
};
51 changes: 51 additions & 0 deletions evals/tasks/agent/all_recipes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import { EvalFunction } from "@/types/evals";

/**
 * Agent eval: opens allrecipes.com and asks the agent to find a Beef
 * Wellington recipe matching the review/rating constraints in the instruction.
 *
 * The eval passes when the agent reports `success`. On an agent-reported
 * failure the agent's own `message` is returned; on a thrown value both the
 * raw `error` and a string `message` are returned.
 *
 * @returns An eval result carrying `_success`, `debugUrl`, `sessionUrl` and
 *   the collected logs, plus `message` (and `error` for thrown values) on
 *   failure.
 */
export const all_recipes: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.allrecipes.com/");

    const agentResult = await agent.execute({
      instruction:
        "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
      maxSteps: 20,
    });
    logger.log(agentResult);

    if (!agentResult.success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`; non-Error
    // values (strings, null, ...) are stringified instead.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      // `error` is kept for callers that already read it; `message` matches
      // the failure shape returned by the other agent evals.
      error,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path, including the early returns above.
    await stagehand.close();
  }
};
47 changes: 47 additions & 0 deletions evals/tasks/agent/amazon_shoes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { EvalFunction } from "@/types/evals";

/**
 * Agent eval: opens amazon.com and asks the agent to find running shoes
 * matching the constraints in the instruction and add them to the cart.
 *
 * The eval passes when the agent reports `success`. On failure the returned
 * `message` is the agent's own message, or a string form of whatever was
 * thrown.
 *
 * @returns An eval result carrying `_success`, `debugUrl`, `sessionUrl` and
 *   the collected logs, plus `message` on failure.
 */
export const amazon_shoes: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.amazon.com/");

    const agentResult = await agent.execute({
      instruction:
        "Find a pair of mens running shoes in black, size 7, 4+ stars and under $50 and add them to my cart on Amazon.",
      maxSteps: 18,
    });

    if (!agentResult.success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`; reading it
    // off `null`/`undefined` would throw again from inside this handler.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path, including the early returns above.
    await stagehand.close();
  }
};
47 changes: 47 additions & 0 deletions evals/tasks/agent/apple_trade_in.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { EvalFunction } from "@/types/evals";

/**
 * Agent eval: opens apple.com and asks the agent to find the trade-in value
 * for the device described in the instruction.
 *
 * The eval passes when the agent reports `success`. On failure the returned
 * `message` is the agent's own message, or a string form of whatever was
 * thrown.
 *
 * @returns An eval result carrying `_success`, `debugUrl`, `sessionUrl` and
 *   the collected logs, plus `message` on failure.
 */
export const apple_trade_in: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/");

    const agentResult = await agent.execute({
      instruction:
        "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
      maxSteps: 30,
    });

    if (!agentResult.success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`; reading it
    // off `null`/`undefined` would throw again from inside this handler.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path, including the early returns above.
    await stagehand.close();
  }
};
47 changes: 47 additions & 0 deletions evals/tasks/agent/apple_tv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { EvalFunction } from "@/types/evals";

/**
 * Agent eval: opens apple.com and asks the agent to report the Apple TV 4K
 * details requested in the instruction.
 *
 * The eval passes when the agent reports `success`. On failure the returned
 * `message` is the agent's own message, or a string form of whatever was
 * thrown.
 *
 * @returns An eval result carrying `_success`, `debugUrl`, `sessionUrl` and
 *   the collected logs, plus `message` on failure.
 */
export const apple_tv: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://www.apple.com/");

    const agentResult = await agent.execute({
      instruction:
        "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
      maxSteps: 30,
    });

    if (!agentResult.success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`; reading it
    // off `null`/`undefined` would throw again from inside this handler.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path, including the early returns above.
    await stagehand.close();
  }
};
47 changes: 47 additions & 0 deletions evals/tasks/agent/arxiv_gpt_report.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { EvalFunction } from "@/types/evals";

/**
 * Agent eval: opens arxiv.org and asks the agent to find the paper named in
 * the instruction and answer when its v3 was submitted.
 *
 * The eval passes when the agent reports `success`. On failure the returned
 * `message` is the agent's own message, or a string form of whatever was
 * thrown.
 *
 * @returns An eval result carrying `_success`, `debugUrl`, `sessionUrl` and
 *   the collected logs, plus `message` on failure.
 */
export const arxiv_gpt_report: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
  agent,
}) => {
  try {
    await stagehand.page.goto("https://arxiv.org/");

    const agentResult = await agent.execute({
      instruction:
        "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
      maxSteps: 30,
    });

    if (!agentResult.success) {
      return {
        _success: false,
        message: agentResult.message,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`; reading it
    // off `null`/`undefined` would throw again from inside this handler.
    const message = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    // Runs on every path, including the early returns above.
    await stagehand.close();
  }
};
Loading