diff --git a/.changeset/rich-colts-march.md b/.changeset/rich-colts-march.md new file mode 100644 index 00000000..dc1ddc6d --- /dev/null +++ b/.changeset/rich-colts-march.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Add more evals for stagehand agent diff --git a/evals/evals.config.json b/evals/evals.config.json index 9d3c7ccb..e5641a01 100644 --- a/evals/evals.config.json +++ b/evals/evals.config.json @@ -323,6 +323,34 @@ "name": "agent/google_flights", "categories": ["agent"] }, + { + "name": "agent/github_react_version", + "categories": ["agent"] + }, + { + "name": "agent/steam_games", + "categories": ["agent"] + }, + { + "name": "agent/ubereats", + "categories": ["agent"] + }, + { + "name": "agent/kith", + "categories": ["agent"] + }, + { + "name": "agent/apple_tv", + "categories": ["agent"] + }, + { + "name": "agent/apple_trade_in", + "categories": ["agent"] + }, + { + "name": "agent/arxiv_gpt_report", + "categories": ["agent"] + }, { "name": "agent/sf_library_card", "categories": ["agent"] @@ -331,6 +359,14 @@ "name": "agent/sf_library_card_multiple", "categories": ["agent"] }, + { + "name": "agent/hugging_face", + "categories": ["agent"] + }, + { + "name": "agent/google_maps_3", + "categories": ["agent"] + }, { "name": "login", "categories": ["act", "regression"] @@ -423,5 +459,26 @@ "name": "namespace_xpath", "categories": ["act"] } + , + { + "name": "agent/nba_trades", + "categories": ["agent"] + }, + { + "name": "agent/hotel_booking", + "categories": ["agent"] + }, + { + "name": "agent/github", + "categories": ["agent"] + }, + { + "name": "agent/all_recipes", + "categories": ["agent"] + }, + { + "name": "agent/google_shopping", + "categories": ["agent"] + } ] } diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts index 16ff63da..cf8de26c 100644 --- a/evals/initStagehand.ts +++ b/evals/initStagehand.ts @@ -18,6 +18,7 @@ import { } from "@browserbasehq/stagehand"; import { EvalLogger } from "./logger"; import type { 
StagehandInitResult } from "@/types/evals"; +import type { AgentConfig } from "@/dist"; import { AvailableModel } from "@browserbasehq/stagehand"; /** @@ -104,6 +105,19 @@ export const initStagehand = async ({ // Set navigation timeout to 60 seconds for evaluations stagehand.context.setDefaultNavigationTimeout(60_000); + const isCUAModel = (model: string): boolean => + model.includes("computer-use-preview") || model.startsWith("claude"); + + let agentConfig: AgentConfig | undefined; + if (isCUAModel(modelName)) { + agentConfig = { + model: modelName, + provider: modelName.startsWith("claude") ? "anthropic" : "openai", + } as AgentConfig; + } + + const agent = stagehand.agent(agentConfig); + return { stagehand, stagehandConfig: config, @@ -111,5 +125,6 @@ export const initStagehand = async ({ debugUrl, sessionUrl, modelName, + agent, }; }; diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts index 238f849c..d67fbebc 100644 --- a/evals/taskConfig.ts +++ b/evals/taskConfig.ts @@ -106,7 +106,7 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS ? 
process.env.EVAL_AGENT_MODELS.split(",") - : ["computer-use-preview-2025-03-11", "claude-3-7-sonnet-latest"]; + : ["computer-use-preview-2025-03-11", "claude-sonnet-4-20250514"]; /** * getModelList: diff --git a/evals/tasks/agent/all_recipes.ts b/evals/tasks/agent/all_recipes.ts new file mode 100644 index 00000000..c416b4c5 --- /dev/null +++ b/evals/tasks/agent/all_recipes.ts @@ -0,0 +1,61 @@ +import { Evaluator } from "@/evals/evaluator"; +import { EvalFunction } from "@/types/evals"; + +export const all_recipes: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://www.allrecipes.com/"); + const evaluator = new Evaluator(stagehand); + const agentResult = await agent.execute({ + instruction: + "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.", + maxSteps: 20, + }); + + const { evaluation, reasoning } = await evaluator.evaluate({ + question: "Did the agent find a recipe for Beef Wellington", + }); + + logger.log(agentResult); + + const success = + agentResult.success && + evaluation === "YES" && + stagehand.page.url() === + "https://www.allrecipes.com/recipe/16899/beef-wellington/"; + + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + } as unknown as ReturnType extends Promise + ? 
R + : never; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/apple_trade_in.ts b/evals/tasks/agent/apple_trade_in.ts new file mode 100644 index 00000000..0cd25154 --- /dev/null +++ b/evals/tasks/agent/apple_trade_in.ts @@ -0,0 +1,60 @@ +//this eval is expected to fail due to issues scrolling within the trade in dialog +import { EvalFunction } from "@/types/evals"; +import { z } from "zod"; + +export const apple_trade_in: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://www.apple.com/shop/trade-in"); + const agentResult = await agent.execute({ + instruction: + "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.", + maxSteps: 30, + }); + + const { tradeInValue } = await stagehand.page.extract({ + modelName: "google/gemini-2.5-flash", + instruction: + "Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. 
it will be inside this text : Get x trade-in credit toward a new iPhone', provide just the number", + schema: z.object({ + tradeInValue: z.number(), + }), + }); + + const success = + agentResult.success && + tradeInValue === 360 && + stagehand.page.url().includes("https://www.apple.com/shop/trade-in"); + + if (!success) { + return { + _success: false, + message: agentResult.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/apple_tv.ts b/evals/tasks/agent/apple_tv.ts new file mode 100644 index 00000000..7cab4c3a --- /dev/null +++ b/evals/tasks/agent/apple_tv.ts @@ -0,0 +1,61 @@ +import { EvalFunction } from "@/types/evals"; +import { z } from "zod"; + +export const apple_tv: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://www.apple.com/"); + + const agentResult = await agent.execute({ + instruction: + "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.", + maxSteps: 30, + }); + + const { height, width } = await stagehand.page.extract({ + modelName: "google/gemini-2.5-flash", + instruction: "Extract the size and weight of the Apple TV 4K", + schema: z.object({ + height: z.number().describe("The height of the Apple TV 4K in inches"), + width: z.number().describe("The width of the Apple TV 4K in inches"), + }), + }); + + const success = + agentResult.success && + height === 1.2 && + width === 3.66 && + stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/"); + + if (!success) { + return { + _success: false, + message: agentResult.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { 
+ _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/arxiv_gpt_report.ts b/evals/tasks/agent/arxiv_gpt_report.ts new file mode 100644 index 00000000..578bbb53 --- /dev/null +++ b/evals/tasks/agent/arxiv_gpt_report.ts @@ -0,0 +1,60 @@ +// agent often fails on this one +import { EvalFunction } from "@/types/evals"; +import { z } from "zod"; +export const arxiv_gpt_report: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://arxiv.org/"); + + const agentResult = await agent.execute({ + instruction: + "Find the paper 'GPT-4 Technical Report', when was v3 submitted?", + maxSteps: 20, + }); + + // Mon, 27 Mar 2023 17:46:54 UTC + const { date } = await stagehand.page.extract({ + modelName: "google/gemini-2.5-flash", + instruction: + "Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'", + schema: z.object({ + date: z.string().describe("The date of the v3 submission history"), + }), + }); + + console.log(`date: ${date}`); + + const success = agentResult.success && date === "03-27-2023"; + + if (!success) { + return { + _success: false, + message: agentResult.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/github.ts b/evals/tasks/agent/github.ts new file mode 100644 index 00000000..795ec670 --- /dev/null +++ b/evals/tasks/agent/github.ts @@ -0,0 +1,56 @@ +import { EvalFunction } from "@/types/evals"; +import { 
Evaluator } from "@/evals/evaluator"; +export const github: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://github.com/"); + const evaluator = new Evaluator(stagehand); + const agentResult = await agent.execute({ + instruction: + "Find a Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.", + maxSteps: 14, + }); + logger.log(agentResult); + + const { evaluation, reasoning } = await evaluator.evaluate({ + question: + "Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.", + }); + + const success = agentResult.success && evaluation === "YES"; + + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + } as unknown as ReturnType extends Promise + ? R + : never; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/github_react_version.ts b/evals/tasks/agent/github_react_version.ts new file mode 100644 index 00000000..cbd29001 --- /dev/null +++ b/evals/tasks/agent/github_react_version.ts @@ -0,0 +1,52 @@ +import { EvalFunction } from "@/types/evals"; +import { Evaluator } from "@/evals/evaluator"; +export const github_react_version: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + const evaluator = new Evaluator(stagehand); + await stagehand.page.goto("https://github.com/"); + await agent.execute({ + instruction: + "Check the latest release version of React and the date it was published. 
", + maxSteps: 20, + }); + const { evaluation, reasoning } = await evaluator.evaluate({ + question: + "Does the page show the latest version of react and the date it was published", + }); + console.log(`evaluation: ${evaluation}`); + console.log(`reasoning: ${reasoning}`); + // rely only on the evaluator verdict for now, as using extract on the version is prone to breaking in future + const success = evaluation === "YES"; + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/google_flights.ts b/evals/tasks/agent/google_flights.ts index a5785d73..b507ef1d 100644 --- a/evals/tasks/agent/google_flights.ts +++ b/evals/tasks/agent/google_flights.ts @@ -6,17 +6,11 @@ export const google_flights: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://google.com/travel/flights"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}. 
The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "Search for flights from San Francisco to New York for next weekend", diff --git a/evals/tasks/agent/google_maps.ts b/evals/tasks/agent/google_maps.ts index 13d7389e..9ac140e3 100644 --- a/evals/tasks/agent/google_maps.ts +++ b/evals/tasks/agent/google_maps.ts @@ -6,17 +6,11 @@ export const google_maps: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://maps.google.com"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "How long does it take to get from San Francisco to New York driving?", diff --git a/evals/tasks/agent/google_maps_2.ts b/evals/tasks/agent/google_maps_2.ts index e80478fa..05ce5dbd 100644 --- a/evals/tasks/agent/google_maps_2.ts +++ b/evals/tasks/agent/google_maps_2.ts @@ -7,17 +7,11 @@ export const google_maps_2: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://maps.google.com"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. 
The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "Search for the fastest walking route from La Puerta de Alcalá to La Puerta del Sol", diff --git a/evals/tasks/agent/google_maps_3.ts b/evals/tasks/agent/google_maps_3.ts new file mode 100644 index 00000000..fe47b4ec --- /dev/null +++ b/evals/tasks/agent/google_maps_3.ts @@ -0,0 +1,52 @@ +import { EvalFunction } from "@/types/evals"; +import { Evaluator } from "@/evals/evaluator"; +export const google_maps_3: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://maps.google.com/"); + const evaluator = new Evaluator(stagehand); + const agentResult = await agent.execute({ + instruction: + "Search for locksmiths open now but not open 24 hours in Texas City.", + maxSteps: 30, + }); + + const { evaluation, reasoning } = await evaluator.evaluate({ + question: + "Does the page show a locksmiths open now but not open 24 hours in Texas City?", + }); + + const success = agentResult.success && evaluation === "YES"; + + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/google_shopping.ts b/evals/tasks/agent/google_shopping.ts new file mode 100644 index 00000000..b557f4cf --- /dev/null +++ b/evals/tasks/agent/google_shopping.ts @@ -0,0 +1,58 @@ +import { Evaluator } from "@/evals/evaluator"; +import { EvalFunction } from "@/types/evals"; + +export const google_shopping: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await 
stagehand.page.goto("https://www.google.com/shopping"); + + const agentResult = await agent.execute({ + instruction: + "Find a drip coffee maker that is on sale and within $25-60 and has a black finish", + maxSteps: 20, + }); + logger.log(agentResult); + + const evaluator = new Evaluator(stagehand); + const { evaluation, reasoning } = await evaluator.evaluate({ + question: + "Does the page show a drip coffee maker that is on sale and within $25-60 and has a black finish?", + }); + + const success = agentResult.success && evaluation === "YES"; + + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + } as unknown as ReturnType extends Promise + ? R + : never; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/hotel_booking.ts b/evals/tasks/agent/hotel_booking.ts new file mode 100644 index 00000000..292b691e --- /dev/null +++ b/evals/tasks/agent/hotel_booking.ts @@ -0,0 +1,58 @@ +//this eval is expected to fail. 
+import { EvalFunction } from "@/types/evals"; +import { Evaluator } from "@/evals/evaluator"; +export const hotel_booking: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://www.booking.com/"); + + const agentResult = await agent.execute({ + instruction: + "Find a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025.", + maxSteps: 20, + }); + logger.log(agentResult); + + const evaluator = new Evaluator(stagehand); + const { evaluation, reasoning } = await evaluator.evaluate({ + question: + "Does the page show a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025?", + }); + + const success = agentResult.success && evaluation === "YES"; + + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + } as unknown as ReturnType extends Promise + ? 
R + : never; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/hugging_face.ts b/evals/tasks/agent/hugging_face.ts new file mode 100644 index 00000000..d7e1e26a --- /dev/null +++ b/evals/tasks/agent/hugging_face.ts @@ -0,0 +1,54 @@ +import { EvalFunction } from "@/types/evals"; +import { z } from "zod"; + +export const hugging_face: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://huggingface.co/"); + const agentResult = await agent.execute({ + instruction: + "Search for a model on Hugging Face with an Apache-2.0 license that has received the highest number of likes.", + maxSteps: 15, + }); + + const { modelName } = await stagehand.page.extract({ + modelName: "google/gemini-2.5-flash", + instruction: "Extract the name of the model", + schema: z.object({ + modelName: z.string(), + }), + }); + console.log(`modelName: ${modelName}`); + const success = agentResult.success && modelName === "Kokoro-82M"; + if (!success) { + return { + _success: false, + message: agentResult.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/iframe_form.ts b/evals/tasks/agent/iframe_form.ts index a24057f5..056668be 100644 --- a/evals/tasks/agent/iframe_form.ts +++ b/evals/tasks/agent/iframe_form.ts @@ -6,16 +6,11 @@ export const iframe_form: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/"); - const agent = stagehand.agent({ - provider: "anthropic", - model: modelName, - }); - const agentResult = await agent.execute({ instruction: "Fill in the 
form name with 'John Smith'", maxSteps: 3, diff --git a/evals/tasks/agent/iframe_form_multiple.ts b/evals/tasks/agent/iframe_form_multiple.ts index 0b2f4854..cad7c9a3 100644 --- a/evals/tasks/agent/iframe_form_multiple.ts +++ b/evals/tasks/agent/iframe_form_multiple.ts @@ -6,16 +6,11 @@ export const iframe_form_multiple: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/"); - const agent = stagehand.agent({ - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - model: modelName, - }); - const agentResult = await agent.execute({ instruction: "Fill in the form name with 'John Smith', the email with 'john.smith@example.com', and select the 'Are you the domain owner?' option as 'No'", diff --git a/evals/tasks/agent/kayak.ts b/evals/tasks/agent/kayak.ts index e176a241..464ae3e1 100644 --- a/evals/tasks/agent/kayak.ts +++ b/evals/tasks/agent/kayak.ts @@ -6,18 +6,12 @@ export const kayak: EvalFunction = async ({ sessionUrl, stagehand, logger, + agent, }) => { try { const evaluator = new Evaluator(stagehand); await stagehand.page.goto("https://www.kayak.com"); - const agent = stagehand.agent({ - provider: "openai", - model: "computer-use-preview", - instructions: `You are a helpful assistant that can help me find flights. DON'T ASK FOLLOW UP QUESTIONS UNTIL YOU HAVE FULFILLED THE USER'S REQUEST. 
Today is ${new Date().toLocaleDateString()}.`, - options: { - apiKey: process.env.OPENAI_API_KEY, - }, - }); + await agent.execute({ instruction: "Find flights from San Francisco to Tokyo next week", maxSteps: 15, diff --git a/evals/tasks/agent/kith.ts b/evals/tasks/agent/kith.ts new file mode 100644 index 00000000..d796374c --- /dev/null +++ b/evals/tasks/agent/kith.ts @@ -0,0 +1,78 @@ +import { EvalFunction } from "@/types/evals"; +import { Evaluator } from "@/evals/evaluator"; + +export const kith: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + const evaluator = new Evaluator(stagehand); + await stagehand.page.goto( + "https://kith.com/collections/nike-air-force-1/products/nkcw2288-111?variant=19439468707968", + ); + + await agent.execute({ + instruction: + "add the shoes to cart, go to checkout, and fill the delivery information", + maxSteps: 25, + }); + + const { evaluation, reasoning } = await evaluator.evaluate({ + question: "Did the agent fill the delivery information", + }); + + const success = evaluation === "YES"; + + if (success) { + await agent.execute({ + instruction: "fill the credit card information", + maxSteps: 10, + }); + + const { evaluation: evaluation2, reasoning: reasoning2 } = + await evaluator.evaluate({ + question: "Did the agent fill the payment information", + }); + + const success2 = evaluation2 === "YES"; + + if (success2) { + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } else { + return { + _success: false, + message: reasoning2, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + } else { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/nba_trades.ts 
b/evals/tasks/agent/nba_trades.ts new file mode 100644 index 00000000..861f1d99 --- /dev/null +++ b/evals/tasks/agent/nba_trades.ts @@ -0,0 +1,59 @@ +import { EvalFunction } from "@/types/evals"; +import { Evaluator } from "@/evals/evaluator"; +export const nba_trades: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + const evaluator = new Evaluator(stagehand); + await stagehand.page.goto("https://www.espn.com/"); + + const agentResult = await agent.execute({ + instruction: + "Find the latest Team transaction in the NBA within the past week.", + maxSteps: 20, + }); + logger.log(agentResult); + + const { evaluation, reasoning } = await evaluator.evaluate({ + question: "Did the agent make it to the nba transactions page?", + }); + + const success = + agentResult.success && + stagehand.page.url() === "https://www.espn.com/nba/transactions" && + evaluation === "YES"; + + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + } as unknown as ReturnType extends Promise + ? R + : never; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/sf_library_card.ts b/evals/tasks/agent/sf_library_card.ts index a31fb1cd..e29c9d14 100644 --- a/evals/tasks/agent/sf_library_card.ts +++ b/evals/tasks/agent/sf_library_card.ts @@ -6,17 +6,11 @@ export const sf_library_card: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://sflib1.sfpl.org/selfreg"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. 
You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "Fill in the 'Residential Address' field with '166 Geary St'", diff --git a/evals/tasks/agent/sf_library_card_multiple.ts b/evals/tasks/agent/sf_library_card_multiple.ts index 367ff763..8ea952fe 100644 --- a/evals/tasks/agent/sf_library_card_multiple.ts +++ b/evals/tasks/agent/sf_library_card_multiple.ts @@ -6,17 +6,11 @@ export const sf_library_card_multiple: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://sflib1.sfpl.org/selfreg"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "Fill in ALL the required fields with mock data. DO NOT submit the form", diff --git a/evals/tasks/agent/sign_in.ts b/evals/tasks/agent/sign_in.ts index d03ae211..487cc760 100644 --- a/evals/tasks/agent/sign_in.ts +++ b/evals/tasks/agent/sign_in.ts @@ -5,17 +5,11 @@ export const sign_in: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://v0-modern-login-flow.vercel.app/"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. 
The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "Sign in with the email address 'test@browserbaser.com' and the password 'stagehand=goated' ", diff --git a/evals/tasks/agent/steam_games.ts b/evals/tasks/agent/steam_games.ts new file mode 100644 index 00000000..a7ad41da --- /dev/null +++ b/evals/tasks/agent/steam_games.ts @@ -0,0 +1,50 @@ +import { EvalFunction } from "@/types/evals"; + +export const steam_games: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + await stagehand.page.goto("https://store.steampowered.com/"); + + const agentResult = await agent.execute({ + instruction: + "Show most played games in Steam. And tell me the number of players in In game at this time", + maxSteps: 30, + }); + + // URL check only, no extract, because the top games / player counts can vary. NOTE(review): the start URL already satisfies this check. + const success = + agentResult.success && + stagehand.page.url().includes("https://store.steampowered.com/"); + + if (!success) { + return { + _success: false, + message: agentResult.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/trivago.ts b/evals/tasks/agent/trivago.ts index 3b71c80f..847bb77d 100644 --- a/evals/tasks/agent/trivago.ts +++ b/evals/tasks/agent/trivago.ts @@ -5,17 +5,11 @@ export const trivago: EvalFunction = async ({ sessionUrl, stagehand, logger, - modelName, + agent, }) => { try { await stagehand.page.goto("https://www.trivago.com/"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. 
You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}.The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "Find the cheapest room in the hotel H10 Tribeca in Madrid next weekend. Stop at the trivago page showing the results", diff --git a/evals/tasks/agent/ubereats.ts b/evals/tasks/agent/ubereats.ts new file mode 100644 index 00000000..ee5d6720 --- /dev/null +++ b/evals/tasks/agent/ubereats.ts @@ -0,0 +1,54 @@ +import { EvalFunction } from "@/types/evals"; +import { Evaluator } from "@/evals/evaluator"; + +export const ubereats: EvalFunction = async ({ + debugUrl, + sessionUrl, + stagehand, + logger, + agent, +}) => { + try { + const evaluator = new Evaluator(stagehand); + await stagehand.page.goto("https://www.ubereats.com/"); + + await agent.execute({ + instruction: + "Order a pizza from ubereats to 639 geary st in sf, call the task complete once the login page is shown after adding pizza and viewing the cart", + maxSteps: 30, + }); + + const { evaluation, reasoning } = await evaluator.evaluate({ + question: "Did the agent make it to the login page?", + }); + + const success = + evaluation === "YES" && + stagehand.page.url().includes("https://auth.uber.com/"); + if (!success) { + return { + _success: false, + message: reasoning, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } + return { + _success: true, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + message: error.message, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await stagehand.close(); + } +}; diff --git a/evals/tasks/agent/youtube.ts b/evals/tasks/agent/youtube.ts index f3b755bc..02d10020 100644 --- a/evals/tasks/agent/youtube.ts +++ b/evals/tasks/agent/youtube.ts @@ -5,17 +5,11 @@ export const youtube: EvalFunction = async ({ sessionUrl, stagehand, logger, - 
modelName, + agent, }) => { try { await stagehand.page.goto("https://youtube.com"); - const agent = stagehand.agent({ - model: modelName, - provider: modelName.startsWith("claude") ? "anthropic" : "openai", - instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. The current page is ${await stagehand.page.title()}`, - }); - const agentResult = await agent.execute({ instruction: "Search for Keinemusik's set under some very famous pointy landmarks", diff --git a/types/evals.ts b/types/evals.ts index d97e2796..51742896 100644 --- a/types/evals.ts +++ b/types/evals.ts @@ -13,6 +13,7 @@ export type StagehandInitResult = { sessionUrl: string; stagehandConfig: ConstructorParams; modelName: AvailableModel; + agent: ReturnType; }; export type EvalFunction = (taskInput: StagehandInitResult) => Promise<{