Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions evals/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ export class Evaluator {
systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer.
Return your response as a JSON object with the following format:
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.
the current date is ${new Date().toISOString()}
`,
screenshotDelayMs = 1000,
} = options;

Expand Down Expand Up @@ -138,7 +140,8 @@ export class Evaluator {
systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer.
Return your response as a JSON array, where each object corresponds to a question and has the following format:
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.
the current date is ${new Date().toISOString()}`,
screenshotDelayMs = 1000,
} = options;

Expand Down
1 change: 0 additions & 1 deletion evals/tasks/agent/all_recipes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ export const all_recipes: EvalFunction = async ({
logger.log(agentResult);

const success =
agentResult.success &&
evaluation === "YES" &&
stagehand.page.url() ===
"https://www.allrecipes.com/recipe/16899/beef-wellington/";
Expand Down
1 change: 0 additions & 1 deletion evals/tasks/agent/apple_trade_in.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ export const apple_trade_in: EvalFunction = async ({
});

const success =
agentResult.success &&
tradeInValue === 360 &&
stagehand.page.url().includes("https://www.apple.com/shop/trade-in");

Expand Down
1 change: 0 additions & 1 deletion evals/tasks/agent/apple_tv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ export const apple_tv: EvalFunction = async ({
});

const success =
agentResult.success &&
height === 1.2 &&
width === 3.66 &&
stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/");
Expand Down
2 changes: 1 addition & 1 deletion evals/tasks/agent/arxiv_gpt_report.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ export const arxiv_gpt_report: EvalFunction = async ({

console.log(`date: ${date}`);

const success = agentResult.success && date === "03-27-2023";
const success = date === "03-27-2023";

if (!success) {
return {
Expand Down
2 changes: 1 addition & 1 deletion evals/tasks/agent/github.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export const github: EvalFunction = async ({
"Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
});

const success = agentResult.success && evaluation === "YES";
const success = evaluation === "YES";

if (!success) {
return {
Expand Down
4 changes: 2 additions & 2 deletions evals/tasks/agent/google_maps_3.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ export const google_maps_3: EvalFunction = async ({
try {
await stagehand.page.goto("https://maps.google.com/");
const evaluator = new Evaluator(stagehand);
const agentResult = await agent.execute({
agent.execute({
instruction:
"Search for locksmiths open now but not open 24 hours in Texas City.",
maxSteps: 30,
Expand All @@ -21,7 +21,7 @@ export const google_maps_3: EvalFunction = async ({
"Does the page show a locksmiths open now but not open 24 hours in Texas City?",
});

const success = agentResult.success && evaluation === "YES";
const success = evaluation === "YES";

if (!success) {
return {
Expand Down
2 changes: 1 addition & 1 deletion evals/tasks/agent/google_shopping.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ export const google_shopping: EvalFunction = async ({
"Does the page show a drip coffee maker that is on sale and within $25-60 and has a black finish?",
});

const success = agentResult.success && evaluation === "YES";
const success = evaluation === "YES";

if (!success) {
return {
Expand Down
2 changes: 1 addition & 1 deletion evals/tasks/agent/hotel_booking.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ export const hotel_booking: EvalFunction = async ({
"Does the page show a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025?",
});

const success = agentResult.success && evaluation === "YES";
const success = evaluation === "YES";

if (!success) {
return {
Expand Down
2 changes: 1 addition & 1 deletion evals/tasks/agent/hugging_face.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ export const hugging_face: EvalFunction = async ({
}),
});
console.log(`modelName: ${modelName}`);
const success = agentResult.success && modelName === "Kokoro-82M";
const success = modelName === "Kokoro-82M";
if (!success) {
return {
_success: false,
Expand Down
1 change: 0 additions & 1 deletion evals/tasks/agent/nba_trades.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ export const nba_trades: EvalFunction = async ({
});

const success =
agentResult.success &&
stagehand.page.url() === "https://www.espn.com/nba/transactions" &&
evaluation === "YES";

Expand Down
15 changes: 9 additions & 6 deletions evals/tasks/agent/steam_games.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { EvalFunction } from "@/types/evals";

import { Evaluator } from "@/evals/evaluator";
export const steam_games: EvalFunction = async ({
debugUrl,
sessionUrl,
Expand All @@ -10,21 +10,24 @@ export const steam_games: EvalFunction = async ({
try {
await stagehand.page.goto("https://store.steampowered.com/");

const agentResult = await agent.execute({
agent.execute({
instruction:
"Show most played games in Steam. And tell me the number of players in In game at this time",
maxSteps: 30,
});
Comment on lines +13 to 17
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logic: Missing `await` keyword — the call to `agent.execute()` is not awaited, so the evaluation will run before the agent has completed its task.

Suggested change
agent.execute({
instruction:
"Show most played games in Steam. And tell me the number of players in In game at this time",
maxSteps: 30,
});
const agentResult = await agent.execute({
instruction:
"Show most played games in Steam. And tell me the number of players in In game at this time",
maxSteps: 30,
});


const evaluator = new Evaluator(stagehand);
const { evaluation, reasoning } = await evaluator.evaluate({
question: "Did the agent make it to the steam games page?",
});
//strictly used url check and no extract as the top games / players can vary
const success =
agentResult.success &&
stagehand.page.url().includes("https://store.steampowered.com/");
stagehand.page.url().includes("https://store.steampowered.com/") &&
evaluation === "YES";

if (!success) {
return {
_success: false,
message: agentResult.message,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down