add agent reasoning to evaluator, improve prompts & parameterize max … (#1050)

tkattkat · web-flow · commit 444da198b5f2 · 2025-09-04T14:25:31.000-07:00
…steps

# why

- Evaluator often is too strict in its evaluation, resulting in false
positives
- max step limits in evals are very brittle and can cause false
negatives on tasks that exceed the current limit
- Evaluator can sometimes view screenshots as a single source of truth,
resulting in false negatives. Providing the agent's reasoning alongside
the screenshots helps mitigate this.
# what changed

- parameterized max steps to allow for easy configuration across all
evals through the `AGENT_EVAL_MAX_STEPS` env variable
- adjusted evaluator prompting 
- added "agent reasoning" param which can be used to pass in the agent's
reasoning alongside bulk screenshots in a format the evaluator can
understand well

# test plan
tested locally
diff --git a/.env.example b/.env.example
@@ -9,4 +9,5 @@ ENABLE_CACHING=false
 EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest"
 EXPERIMENTAL_EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest,o1-mini,o1-preview"
 EVAL_CATEGORIES="observe,act,combination,extract,experimental"
+AGENT_EVAL_MAX_STEPS=50
 STAGEHAND_API_URL="http://localhost:80"
diff --git a/evals/evaluator.ts b/evals/evaluator.ts
@@ -55,6 +55,7 @@ export class Evaluator {
       screenshot = true,
       systemPrompt,
       screenshotDelayMs = 250,
+      agentReasoning,
     } = options;
     if (!question) {
       throw new Error("Question cannot be an empty string");
@@ -69,12 +70,12 @@ export class Evaluator {
         question,
         screenshots: screenshot,
         systemPrompt,
+        agentReasoning,
       });
     }
 
     // Single screenshot case (existing logic)
-    const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
-          Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
+    const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to  ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.
           Today's date is ${new Date().toLocaleDateString()}`;
 
     await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
@@ -97,7 +98,12 @@ export class Evaluator {
           {
             role: "user",
             content: [
-              { type: "text", text: question },
+              {
+                type: "text",
+                text: agentReasoning
+                  ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
+                  : question,
+              },
               ...(screenshot
                 ? [
                     {
@@ -153,8 +159,7 @@ export class Evaluator {
     const {
       questions,
       screenshot = true,
-      systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
-           Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
+      systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task based on the original goal. You have access to  ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.
           Today's date is ${new Date().toLocaleDateString()}`,
       screenshotDelayMs = 1000,
     } = options;
@@ -260,14 +265,17 @@ export class Evaluator {
     question: string;
     screenshots: Buffer[];
     systemPrompt?: string;
+    agentReasoning?: string;
   }): Promise<EvaluationResult> {
     const {
       question,
       screenshots,
+      agentReasoning,
       systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
+        ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
         Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
         Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
-        Be critical about the question but consider the ENTIRE sequence when making your determination.
+        ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
         Today's date is ${new Date().toLocaleDateString()}`,
     } = options;
 
@@ -303,7 +311,9 @@ export class Evaluator {
             content: [
               {
                 type: "text",
-                text: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
+                text: agentReasoning
+                  ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
+                  : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
               },
               ...imageContents,
             ],
diff --git a/evals/tasks/agent/all_recipes.ts b/evals/tasks/agent/all_recipes.ts
@@ -14,7 +14,7 @@ export const all_recipes: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
-      maxSteps: 30,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
     });
 
     const { evaluation, reasoning } = await evaluator.ask({
diff --git a/evals/tasks/agent/apple_trade_in.ts b/evals/tasks/agent/apple_trade_in.ts
@@ -15,7 +15,7 @@ export const apple_trade_in: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
-      maxSteps: 30,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
     });
 
     const { evaluation, reasoning } = await evaluator.ask({
diff --git a/evals/tasks/agent/apple_tv.ts b/evals/tasks/agent/apple_tv.ts
@@ -14,7 +14,7 @@ export const apple_tv: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
-      maxSteps: 30,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
     });
 
     const { height, width } = await stagehand.page.extract({
diff --git a/evals/tasks/agent/arxiv_gpt_report.ts b/evals/tasks/agent/arxiv_gpt_report.ts
@@ -15,7 +15,7 @@ export const arxiv_gpt_report: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
-      maxSteps: 25,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
     });
 
     // Mon, 27 Mar 2023 17:46:54 UTC
diff --git a/evals/tasks/agent/gaia.ts b/evals/tasks/agent/gaia.ts
@@ -51,7 +51,7 @@ export const gaia: EvalFunction = async ({
 
     const result = await agent.execute({
       instruction: params.ques,
-      maxSteps: 50,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
     });
 
     const expected = (params as Record<string, unknown>).expected as
diff --git a/evals/tasks/agent/github.ts b/evals/tasks/agent/github.ts
@@ -13,7 +13,7 @@ export const github: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Find a Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
-      maxSteps: 20,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/github_react_version.ts b/evals/tasks/agent/github_react_version.ts
@@ -13,7 +13,7 @@ export const github_react_version: EvalFunction = async ({
     await agent.execute({
       instruction:
         "Check the latest release version of React and the date it was published. ",
-      maxSteps: 20,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
     });
     const { evaluation, reasoning } = await evaluator.ask({
       question:
diff --git a/evals/tasks/agent/google_flights.ts b/evals/tasks/agent/google_flights.ts
@@ -14,7 +14,7 @@ export const google_flights: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Search for flights from San Francisco to New York for next weekend",
-      maxSteps: 30,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/google_maps.ts b/evals/tasks/agent/google_maps.ts
@@ -14,7 +14,7 @@ export const google_maps: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "How long does it take to get from San Francisco to New York driving?",
-      maxSteps: 15,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 15,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/google_maps_2.ts b/evals/tasks/agent/google_maps_2.ts
@@ -15,7 +15,7 @@ export const google_maps_2: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Search for the fastest walking route from La Puerta de Alcalá to La Puerta del Sol",
-      maxSteps: 20,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/google_maps_3.ts b/evals/tasks/agent/google_maps_3.ts
@@ -13,7 +13,7 @@ export const google_maps_3: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Search for locksmiths open now but not open 24 hours in Texas City.",
-      maxSteps: 35,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 35,
     });
 
     const { evaluation, reasoning } = await evaluator.ask({
diff --git a/evals/tasks/agent/google_shopping.ts b/evals/tasks/agent/google_shopping.ts
@@ -14,7 +14,7 @@ export const google_shopping: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Find a drip coffee maker that is on sale and within $25-60 and has a black finish",
-      maxSteps: 20,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/hotel_booking.ts b/evals/tasks/agent/hotel_booking.ts
@@ -14,7 +14,7 @@ export const hotel_booking: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Find a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025.",
-      maxSteps: 20,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/hugging_face.ts b/evals/tasks/agent/hugging_face.ts
@@ -14,7 +14,7 @@ export const hugging_face: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Search for a model on Hugging Face with an Apache-2.0 license that has received the highest number of likes.",
-      maxSteps: 20,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
     });
     console.log(`agentResult: ${agentResult.message}`);
     const { evaluation, reasoning } = await evaluator.ask({
diff --git a/evals/tasks/agent/iframe_form.ts b/evals/tasks/agent/iframe_form.ts
@@ -13,7 +13,7 @@ export const iframe_form: EvalFunction = async ({
 
     const agentResult = await agent.execute({
       instruction: "Fill in the form name with 'John Smith'",
-      maxSteps: 3,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 3,
     });
     logger.log(agentResult);
 
@@ -35,7 +35,7 @@ export const iframe_form: EvalFunction = async ({
 
     const agentResult2 = await agent.execute({
       instruction: "Fill in the form email with 'john.smith@example.com'",
-      maxSteps: 3,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 3,
     });
     logger.log(agentResult2);
 
diff --git a/evals/tasks/agent/iframe_form_multiple.ts b/evals/tasks/agent/iframe_form_multiple.ts
@@ -14,7 +14,7 @@ export const iframe_form_multiple: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Fill in the form name with 'John Smith', the email with 'john.smith@example.com', and select the 'Are you the domain owner?' option as 'No'",
-      maxSteps: 10,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 10,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/kayak.ts b/evals/tasks/agent/kayak.ts
@@ -14,11 +14,11 @@ export const kayak: EvalFunction = async ({
 
     await agent.execute({
       instruction: "Find flights from San Francisco to Tokyo next week",
-      maxSteps: 25,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
     });
     await agent.execute({
       instruction: "Sort the flights by price",
-      maxSteps: 8,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 8,
     });
 
     if (stagehand.context.pages().length !== 2) {
diff --git a/evals/tasks/agent/kith.ts b/evals/tasks/agent/kith.ts
@@ -17,7 +17,7 @@ export const kith: EvalFunction = async ({
     await agent.execute({
       instruction:
         "add the shoes to cart, go to checkout, and fill the delivery information",
-      maxSteps: 25,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
     });
 
     const { evaluation, reasoning } = await evaluator.ask({
@@ -29,7 +29,7 @@ export const kith: EvalFunction = async ({
     if (success) {
       await agent.execute({
         instruction: "fill the credit card information",
-        maxSteps: 10,
+        maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 10,
       });
 
       const { evaluation: evaluation2, reasoning: reasoning2 } =
diff --git a/evals/tasks/agent/nba_trades.ts b/evals/tasks/agent/nba_trades.ts
@@ -14,7 +14,7 @@ export const nba_trades: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Find the latest Team transaction in the NBA within the past week.",
-      maxSteps: 25,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/sf_library_card.ts b/evals/tasks/agent/sf_library_card.ts
@@ -14,7 +14,7 @@ export const sf_library_card: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Fill in the 'Residential Address' field with '166 Geary St'",
-      maxSteps: 3,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 3,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/sf_library_card_multiple.ts b/evals/tasks/agent/sf_library_card_multiple.ts
@@ -14,7 +14,7 @@ export const sf_library_card_multiple: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Fill in ALL the required fields with mock data. DO NOT submit the form",
-      maxSteps: 20,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/sign_in.ts b/evals/tasks/agent/sign_in.ts
@@ -13,7 +13,7 @@ export const sign_in: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Sign in with the email address 'test@browserbaser.com' and the password 'stagehand=goated' ",
-      maxSteps: 15,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 15,
     });
     logger.log(agentResult);
     const url = await stagehand.page.url();
diff --git a/evals/tasks/agent/steam_games.ts b/evals/tasks/agent/steam_games.ts
@@ -12,8 +12,8 @@ export const steam_games: EvalFunction = async ({
 
     const agentResult = await agent.execute({
       instruction:
-        "Show most played games in Steam. And tell me the number of players in In game at this time",
-      maxSteps: 30,
+        "Show most played games in Steam. And tell me the number of players in game at this time",
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
     });
 
     //strictly used url check and no extract as the top games / players can vary
diff --git a/evals/tasks/agent/trivago.ts b/evals/tasks/agent/trivago.ts
@@ -13,7 +13,7 @@ export const trivago: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Find the cheapest room in the hotel H10 Tribeca in Madrid next weekend. Stop at the trivago page showing the results",
-      maxSteps: 13,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 13,
     });
     logger.log(agentResult);
 
diff --git a/evals/tasks/agent/ubereats.ts b/evals/tasks/agent/ubereats.ts
@@ -15,7 +15,7 @@ export const ubereats: EvalFunction = async ({
     await agent.execute({
       instruction:
         "Order a pizza from ubereats to 639 geary st in sf, call the task complete once the login page is shown after adding pizza and viewing the cart",
-      maxSteps: 35,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 35,
     });
 
     const { evaluation, reasoning } = await evaluator.ask({
diff --git a/evals/tasks/agent/webvoyager.ts b/evals/tasks/agent/webvoyager.ts
@@ -44,9 +44,9 @@ export const webvoyager: EvalFunction = async ({
 
     screenshotCollector.start();
 
-    await agent.execute({
+    const agentResult = await agent.execute({
       instruction: params.ques,
-      maxSteps: 50,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
     });
 
     // Stop collecting and get all screenshots
@@ -60,8 +60,11 @@ export const webvoyager: EvalFunction = async ({
 
     const evaluator = new Evaluator(stagehand);
     const evalResult = await evaluator.ask({
-      question: `Did the agent successfully complete this task: "${params.ques}"? Look at all the screenshots showing the progression of the task to verify if it was completed successfully.`,
+      question: `Did the agent successfully complete this task: "${params.ques}"?`,
       screenshot: screenshots,
+      agentReasoning:
+        agentResult.message ||
+        "no reasoning available, agent potentially hit step limit",
     });
 
     return {
diff --git a/evals/tasks/agent/youtube.ts b/evals/tasks/agent/youtube.ts
@@ -13,7 +13,7 @@ export const youtube: EvalFunction = async ({
     const agentResult = await agent.execute({
       instruction:
         "Search for Keinemusik's set under some very famous pointy landmarks",
-      maxSteps: 15,
+      maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 15,
     });
     logger.log(agentResult);
     const url = await stagehand.page.url();
diff --git a/types/evaluator.ts b/types/evaluator.ts
@@ -9,6 +9,8 @@ export type EvaluateOptions = {
   systemPrompt?: string;
   /** Delay in milliseconds before taking the screenshot @default 250 */
   screenshotDelayMs?: number;
+  /** The agent's reasoning/thought process for completing the task */
+  agentReasoning?: string;
 };
 
 export type BatchAskOptions = {