browserbase
diff --git a/evals/evals.config.json b/evals/evals.config.json
Lines changed: 0 additions & 4 deletions
diff --git a/evals/evaluator.ts b/evals/evaluator.ts
Lines changed: 91 additions & 74 deletions
diff --git a/evals/tasks/agent/all_recipes.ts b/evals/tasks/agent/all_recipes.ts
Lines changed: 1 addition & 1 deletion
diff --git a/evals/tasks/agent/apple_trade_in.ts b/evals/tasks/agent/apple_trade_in.ts
Lines changed: 9 additions & 10 deletions
diff --git a/evals/tasks/agent/arxiv_gpt_report.ts b/evals/tasks/agent/arxiv_gpt_report.ts
Lines changed: 11 additions & 11 deletions
diff --git a/evals/tasks/agent/github.ts b/evals/tasks/agent/github.ts
Lines changed: 1 addition & 1 deletion
diff --git a/evals/tasks/agent/github_react_version.ts b/evals/tasks/agent/github_react_version.ts
Lines changed: 1 addition & 1 deletion
diff --git a/evals/tasks/agent/google_flights.ts b/evals/tasks/agent/google_flights.ts
Lines changed: 1 addition & 1 deletion
@@ -515,10 +515,6 @@
     {
       "name": "agent/all_recipes",
       "categories": ["agent"]
-    },
-    {
-      "name": "agent/google_shopping",
-      "categories": ["agent"]
     }
   ]
 }
@@ -13,8 +13,8 @@ import {
 import dotenv from "dotenv";
 import {
   EvaluateOptions,
+  BatchAskOptions,
   EvaluationResult,
-  BatchEvaluateOptions,
 } from "@/types/evaluator";
 import { LLMParsedResponse } from "@/lib/inference";
 import { LLMResponse } from "@/lib/llm/LLMClient";
@@ -46,30 +46,30 @@ export class Evaluator {
     this.modelClientOptions = modelClientOptions || {
       apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY || "",
     };
-    // Create a silent logger function that doesn't output anything
-    this.silentLogger = () => {};
   }
 
-  /**
-   * Evaluates the current state of the page against a specific question.
-   * Uses structured response generation to ensure proper format.
-   * Returns the evaluation result with normalized response and success status.
-   *
-   * @param options - The options for evaluation
-   * @returns A promise that resolves to an EvaluationResult
-   */
-  async evaluate(options: EvaluateOptions): Promise<EvaluationResult> {
+  async ask(options: EvaluateOptions): Promise<EvaluationResult> {
     const {
       question,
-      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer.
-          Return your response as a JSON object with the following format:
-          { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
-          Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
-      screenshotDelayMs = 1000,
+      answer,
+      screenshot = true,
+      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
+          Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
+          Today's date is ${new Date().toLocaleDateString()}`,
+      screenshotDelayMs = 250,
     } = options;
+    if (!question) {
+      throw new Error("Question cannot be an empty string");
+    }
+    if (!answer && !screenshot) {
+      throw new Error("Either answer (text) or screenshot must be provided");
+    }
 
     await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
-    const imageBuffer = await this.stagehand.page.screenshot();
+    let imageBuffer: Buffer;
+    if (screenshot) {
+      imageBuffer = await this.stagehand.page.screenshot();
+    }
     const llmClient = this.stagehand.llmProvider.getClient(
       this.modelName,
       this.modelClientOptions,
@@ -86,12 +86,24 @@ export class Evaluator {
             role: "user",
             content: [
               { type: "text", text: question },
-              {
-                type: "image_url",
-                image_url: {
-                  url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
-                },
-              },
+              ...(screenshot
+                ? [
+                    {
+                      type: "image_url",
+                      image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
+                      },
+                    },
+                  ]
+                : []),
+              ...(answer
+                ? [
+                    {
+                      type: "text",
+                      text: `the answer is ${answer}`,
+                    },
+                  ]
+                : []),
             ],
           },
         ],
@@ -106,15 +118,10 @@ export class Evaluator {
       const result = response.data as unknown as z.infer<
         typeof EvaluationSchema
       >;
-
-      return {
-        evaluation: result.evaluation,
-        reasoning: result.reasoning,
-      };
+      return { evaluation: result.evaluation, reasoning: result.reasoning };
     } catch (error) {
       const errorMessage =
         error instanceof Error ? error.message : String(error);
-
       return {
         evaluation: "INVALID" as const,
         reasoning: `Failed to get structured response: ${errorMessage}`,
@@ -123,43 +130,65 @@ export class Evaluator {
   }
 
   /**
-   * Evaluates the current state of the page against multiple questions in a single screenshot.
-   * Uses structured response generation to ensure proper format.
+   * Evaluates multiple questions with optional answers and/or screenshot.
+   * Similar to ask() but processes multiple questions in a single call.
    * Returns an array of evaluation results.
    *
    * @param options - The options for batch evaluation
    * @returns A promise that resolves to an array of EvaluationResults
    */
-  async batchEvaluate(
-    options: BatchEvaluateOptions,
-  ): Promise<EvaluationResult[]> {
+  async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {
     const {
       questions,
-      systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer.
-          Return your response as a JSON array, where each object corresponds to a question and has the following format:
-          { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
-          Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
+      screenshot = true,
+      systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
+           Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
+          Today's date is ${new Date().toLocaleDateString()}`,
       screenshotDelayMs = 1000,
     } = options;
 
+    // Validate inputs
+    if (!questions || questions.length === 0) {
+      throw new Error("Questions array cannot be empty");
+    }
+
+    for (const item of questions) {
+      if (!item.question) {
+        throw new Error("Question cannot be an empty string");
+      }
+      if (!item.answer && !screenshot) {
+        throw new Error(
+          "Either answer (text) or screenshot must be provided for each question",
+        );
+      }
+    }
+
     // Wait for the specified delay before taking screenshot
     await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
 
-    // Take a screenshot of the current page state
-    const imageBuffer = await this.stagehand.page.screenshot();
-
-    // Create a numbered list of questions for the VLM
-    const formattedQuestions = questions
-      .map((q, i) => `${i + 1}. ${q}`)
-      .join("\n");
+    let imageBuffer: Buffer;
+    if (screenshot) {
+      imageBuffer = await this.stagehand.page.screenshot();
+    }
 
     // Get the LLM client with our preferred model
     const llmClient = this.stagehand.llmProvider.getClient(
       this.modelName,
       this.modelClientOptions,
     );
 
-    // Use the model-specific LLM client to evaluate the screenshot with all questions
+    // Format all questions with their optional answers
+    const formattedQuestions = questions
+      .map((item, i) => {
+        let text = `${i + 1}. ${item.question}`;
+        if (item.answer) {
+          text += `\n   Answer: ${item.answer}`;
+        }
+        return text;
+      })
+      .join("\n\n");
+
+    // Use the model-specific LLM client to evaluate
     const response = await llmClient.createChatCompletion<
       LLMParsedResponse<LLMResponse>
     >({
@@ -168,18 +197,22 @@ export class Evaluator {
         messages: [
           {
             role: "system",
-            content: `${systemPrompt}\n\nYou will be given multiple questions. Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
+            content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
           },
           {
             role: "user",
             content: [
               { type: "text", text: formattedQuestions },
-              {
-                type: "image_url",
-                image_url: {
-                  url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
-                },
-              },
+              ...(screenshot
+                ? [
+                    {
+                      type: "image_url",
+                      image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
+                      },
+                    },
+                  ]
+                : []),
             ],
           },
         ],
@@ -194,29 +227,13 @@ export class Evaluator {
       const results = response.data as unknown as z.infer<
         typeof BatchEvaluationSchema
       >;
-
-      // Pad with INVALID results if we got fewer than expected
-      const finalResults: EvaluationResult[] = [];
-      for (let i = 0; i < questions.length; i++) {
-        if (i < results.length) {
-          finalResults.push({
-            evaluation: results[i].evaluation,
-            reasoning: results[i].reasoning,
-          });
-        } else {
-          finalResults.push({
-            evaluation: "INVALID",
-            reasoning: "No response found for this question.",
-          });
-        }
-      }
-
-      return finalResults;
+      return results.map((r) => ({
+        evaluation: r.evaluation,
+        reasoning: r.reasoning,
+      }));
     } catch (error) {
       const errorMessage =
         error instanceof Error ? error.message : String(error);
-
-      // Fallback: return INVALID for all questions
       return questions.map(() => ({
         evaluation: "INVALID" as const,
         reasoning: `Failed to get structured response: ${errorMessage}`,
 
@@ -17,7 +17,7 @@ export const all_recipes: EvalFunction = async ({
       maxSteps: 30,
     });
 
-    const { evaluation, reasoning } = await evaluator.evaluate({
+    const { evaluation, reasoning } = await evaluator.ask({
       question: "Did the agent find a recipe for Beef Wellington",
     });
 
 
@@ -1,6 +1,6 @@
 //this eval is expected to fail due to issues scrolling within the trade in dialog
 import { EvalFunction } from "@/types/evals";
-import { z } from "zod";
+import { Evaluator } from "../../evaluator";
 
 export const apple_trade_in: EvalFunction = async ({
   debugUrl,
@@ -11,27 +11,26 @@ export const apple_trade_in: EvalFunction = async ({
 }) => {
   try {
     await stagehand.page.goto("https://www.apple.com/shop/trade-in");
+    const evaluator = new Evaluator(stagehand);
     const agentResult = await agent.execute({
       instruction:
         "Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
       maxSteps: 30,
     });
 
-    const { tradeInValue } = await stagehand.page.extract({
-      modelName: "google/gemini-2.5-flash",
-      instruction:
-        "Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. it will be inside this text : Get x trade-in credit toward a new iPhone', provide just the number",
-      schema: z.object({
-        tradeInValue: z.number(),
-      }),
+    const { evaluation, reasoning } = await evaluator.ask({
+      question:
+        "Did the agent find the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website?",
+      screenshot: false,
+      answer: "360",
     });
 
-    const success = agentResult.success && tradeInValue === 360;
+    const success = agentResult.success && evaluation === "YES";
 
     if (!success) {
       return {
         _success: false,
-        message: agentResult.message,
+        message: reasoning,
         debugUrl,
         sessionUrl,
         logs: logger.getLogs(),
 
@@ -1,6 +1,6 @@
 //agent often fails on this one,
 import { EvalFunction } from "@/types/evals";
-import { z } from "zod";
+import { Evaluator } from "../../evaluator";
 export const arxiv_gpt_report: EvalFunction = async ({
   debugUrl,
   sessionUrl,
@@ -9,6 +9,7 @@ export const arxiv_gpt_report: EvalFunction = async ({
   agent,
 }) => {
   try {
+    const evaluator = new Evaluator(stagehand);
     await stagehand.page.goto("https://arxiv.org/");
 
     const agentResult = await agent.execute({
@@ -18,23 +19,22 @@ export const arxiv_gpt_report: EvalFunction = async ({
     });
 
     // Mon, 27 Mar 2023 17:46:54 UTC
-    const { date } = await stagehand.page.extract({
-      modelName: "google/gemini-2.5-flash",
-      instruction:
-        "Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'",
-      schema: z.object({
-        date: z.string().describe("The date of the v3 submission history"),
-      }),
+
+    const { evaluation, reasoning } = await evaluator.ask({
+      question:
+        "Did the agent find the published paper 'GPT-4 Technical Report' and the date it was submitted?",
+      screenshot: false,
+      answer: "03-27-2023",
     });
 
-    console.log(`date: ${date}`);
+    console.log(`reasoning: ${reasoning}`);
 
-    const success = agentResult.success && date === "03-27-2023";
+    const success = agentResult.success && evaluation === "YES";
 
     if (!success) {
       return {
         _success: false,
-        message: agentResult.message,
+        message: reasoning,
         debugUrl,
         sessionUrl,
         logs: logger.getLogs(),
 
@@ -17,7 +17,7 @@ export const github: EvalFunction = async ({
     });
     logger.log(agentResult);
 
-    const { evaluation, reasoning } = await evaluator.evaluate({
+    const { evaluation, reasoning } = await evaluator.ask({
       question:
         "Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
     });
 
@@ -15,7 +15,7 @@ export const github_react_version: EvalFunction = async ({
         "Check the latest release version of React and the date it was published. ",
       maxSteps: 20,
     });
-    const { evaluation, reasoning } = await evaluator.evaluate({
+    const { evaluation, reasoning } = await evaluator.ask({
       question:
         "Does the page show the latest version of react and the date it was published",
     });
 
@@ -19,7 +19,7 @@ export const google_flights: EvalFunction = async ({
     logger.log(agentResult);
 
     const evaluator = new Evaluator(stagehand);
-    const result = await evaluator.evaluate({
+    const result = await evaluator.ask({
       question:
         "Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
     });
Original file line number	Diff line number	Diff line change
`@@ -515,10 +515,6 @@`
`515`	`515`	`{`
`516`	`516`	`"name": "agent/all_recipes",`
`517`	`517`	`"categories": ["agent"]`
`518`		`- },`
`519`		`- {`
`520`		`- "name": "agent/google_shopping",`
`521`		`- "categories": ["agent"]`
`522`	`518`	`}`
`523`	`519`	`]`
`524`	`520`	`}`