diff --git a/evals/evaluator.ts b/evals/evaluator.ts index e94ebe5d5..5625b3d69 100644 --- a/evals/evaluator.ts +++ b/evals/evaluator.ts @@ -64,7 +64,9 @@ export class Evaluator { systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer. Return your response as a JSON object with the following format: { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" } - Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`, + Be critical about the question and the answer, the slightest detail might be the difference between yes and no. + the current date is ${new Date().toISOString()} + `, screenshotDelayMs = 1000, } = options; @@ -138,7 +140,8 @@ export class Evaluator { systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer. Return your response as a JSON array, where each object corresponds to a question and has the following format: { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" } - Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`, + Be critical about the question and the answer, the slightest detail might be the difference between yes and no. + the current date is ${new Date().toISOString()}`, screenshotDelayMs = 1000, } = options; diff --git a/evals/tasks/agent/all_recipes.ts b/evals/tasks/agent/all_recipes.ts index c23709472..51abcbda7 100644 --- a/evals/tasks/agent/all_recipes.ts +++ b/evals/tasks/agent/all_recipes.ts @@ -24,7 +24,6 @@ export const all_recipes: EvalFunction = async ({ logger.log(agentResult); const success = - agentResult.success && evaluation === "YES" && stagehand.page.url() === "https://www.allrecipes.com/recipe/16899/beef-wellington/"; diff --git a/evals/tasks/agent/apple_trade_in.ts b/evals/tasks/agent/apple_trade_in.ts index 0cd25154e..f1d8efd58 100644 --- a/evals/tasks/agent/apple_trade_in.ts +++ b/evals/tasks/agent/apple_trade_in.ts @@ -27,7 +27,6 @@ export const apple_trade_in: EvalFunction = async ({ }); const success = - agentResult.success && tradeInValue === 360 && stagehand.page.url().includes("https://www.apple.com/shop/trade-in"); diff --git a/evals/tasks/agent/apple_tv.ts b/evals/tasks/agent/apple_tv.ts index 7cab4c3a4..8393cf5ed 100644 --- a/evals/tasks/agent/apple_tv.ts +++ b/evals/tasks/agent/apple_tv.ts @@ -27,7 +27,6 @@ export const apple_tv: EvalFunction = async ({ }); const success = - agentResult.success && height === 1.2 && width === 3.66 && stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/"); diff --git a/evals/tasks/agent/arxiv_gpt_report.ts b/evals/tasks/agent/arxiv_gpt_report.ts index 578bbb53f..c2f934e22 100644 --- a/evals/tasks/agent/arxiv_gpt_report.ts +++ b/evals/tasks/agent/arxiv_gpt_report.ts @@ -29,7 +29,7 @@ export const arxiv_gpt_report: EvalFunction = async ({ console.log(`date: ${date}`); - const success = agentResult.success && date === "03-27-2023"; + const success = date === "03-27-2023"; if (!success) { return { diff --git a/evals/tasks/agent/github.ts b/evals/tasks/agent/github.ts index 589aa8f02..f8febeab5 100644 --- a/evals/tasks/agent/github.ts +++ b/evals/tasks/agent/github.ts @@ -22,7 +22,7 @@ export const github: EvalFunction = async ({ "Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.", }); - const success = agentResult.success && evaluation === "YES"; + const success = evaluation === "YES"; if (!success) { return { diff --git a/evals/tasks/agent/google_maps_3.ts b/evals/tasks/agent/google_maps_3.ts index fe47b4ec2..3ee322d99 100644 --- a/evals/tasks/agent/google_maps_3.ts +++ b/evals/tasks/agent/google_maps_3.ts @@ -10,7 +10,7 @@ export const google_maps_3: EvalFunction = async ({ try { await stagehand.page.goto("https://maps.google.com/"); const evaluator = new Evaluator(stagehand); - const agentResult = await agent.execute({ + agent.execute({ instruction: "Search for locksmiths open now but not open 24 hours in Texas City.", maxSteps: 30, @@ -21,7 +21,7 @@ export const google_maps_3: EvalFunction = async ({ "Does the page show a locksmiths open now but not open 24 hours in Texas City?", }); - const success = agentResult.success && evaluation === "YES"; + const success = evaluation === "YES"; if (!success) { return { diff --git a/evals/tasks/agent/google_shopping.ts b/evals/tasks/agent/google_shopping.ts index e388db791..697edfba0 100644 --- a/evals/tasks/agent/google_shopping.ts +++ b/evals/tasks/agent/google_shopping.ts @@ -24,7 +24,7 @@ export const google_shopping: EvalFunction = async ({ "Does the page show a drip coffee maker that is on sale and within $25-60 and has a black finish?", }); - const success = agentResult.success && evaluation === "YES"; + const success = evaluation === "YES"; if (!success) { return { diff --git a/evals/tasks/agent/hotel_booking.ts b/evals/tasks/agent/hotel_booking.ts index e2a974e91..7663998ed 100644 --- a/evals/tasks/agent/hotel_booking.ts +++ b/evals/tasks/agent/hotel_booking.ts @@ -24,7 +24,7 @@ export const hotel_booking: EvalFunction = async ({ "Does the page show a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025?", }); - const success = agentResult.success && evaluation === "YES"; + const success = evaluation === "YES"; if (!success) { return { diff --git a/evals/tasks/agent/hugging_face.ts b/evals/tasks/agent/hugging_face.ts index d7e1e26a8..0f04166e8 100644 --- a/evals/tasks/agent/hugging_face.ts +++ b/evals/tasks/agent/hugging_face.ts @@ -24,7 +24,7 @@ export const hugging_face: EvalFunction = async ({ }), }); console.log(`modelName: ${modelName}`); - const success = agentResult.success && modelName === "Kokoro-82M"; + const success = modelName === "Kokoro-82M"; if (!success) { return { _success: false, diff --git a/evals/tasks/agent/nba_trades.ts b/evals/tasks/agent/nba_trades.ts index c0e920804..d874fb2c2 100644 --- a/evals/tasks/agent/nba_trades.ts +++ b/evals/tasks/agent/nba_trades.ts @@ -23,7 +23,6 @@ export const nba_trades: EvalFunction = async ({ }); const success = - agentResult.success && stagehand.page.url() === "https://www.espn.com/nba/transactions" && evaluation === "YES"; diff --git a/evals/tasks/agent/steam_games.ts b/evals/tasks/agent/steam_games.ts index a7ad41da0..cf5e46fcb 100644 --- a/evals/tasks/agent/steam_games.ts +++ b/evals/tasks/agent/steam_games.ts @@ -1,5 +1,5 @@ import { EvalFunction } from "@/types/evals"; - +import { Evaluator } from "@/evals/evaluator"; export const steam_games: EvalFunction = async ({ debugUrl, sessionUrl, @@ -10,21 +10,24 @@ export const steam_games: EvalFunction = async ({ try { await stagehand.page.goto("https://store.steampowered.com/"); - const agentResult = await agent.execute({ + agent.execute({ instruction: "Show most played games in Steam. And tell me the number of players in In game at this time", maxSteps: 30, }); - + const evaluator = new Evaluator(stagehand); + const { evaluation, reasoning } = await evaluator.evaluate({ + question: "Did the agent make it to the steam games page?", + }); //strictly used url check and no extract as the top games / players can vary const success = - agentResult.success && - stagehand.page.url().includes("https://store.steampowered.com/"); + stagehand.page.url().includes("https://store.steampowered.com/") && + evaluation === "YES"; if (!success) { return { _success: false, - message: agentResult.message, + message: reasoning, debugUrl, sessionUrl, logs: logger.getLogs(),