Evaluator bug fix stg 667 (#963)

tkattkat · web-flow · commit 0ead63d6526f · 2025-08-12T15:03:18.000-07:00
# why

Currently we do not properly handle passing images from evaluator to the
llmclient

# what changed

- we now provide images in the correct format to the llm from evaluator
- switched to generate object in order to consolidate complex parsing
logic in evaluator

# test plan

tested locally &amp; on browserbase
diff --git a/.changeset/pretty-jokes-own.md b/.changeset/pretty-jokes-own.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Properly handle images in evaluator + clean up response parsing logic 
diff --git a/evals/evaluator.ts b/evals/evaluator.ts
@@ -10,7 +10,6 @@ import {
   ClientOptions,
   Stagehand,
 } from "@browserbasehq/stagehand";
-import { LLMResponseError } from "@/types/stagehandErrors";
 import dotenv from "dotenv";
 import {
   EvaluateOptions,
@@ -20,17 +19,22 @@ import {
 import { LLMParsedResponse } from "@/lib/inference";
 import { LLMResponse } from "@/lib/llm/LLMClient";
 import { LogLine } from "@/types/log";
+import { z } from "zod";
 
 dotenv.config();
 
+const EvaluationSchema = z.object({
+  evaluation: z.enum(["YES", "NO"]),
+  reasoning: z.string(),
+});
+
+const BatchEvaluationSchema = z.array(EvaluationSchema);
+
 export class Evaluator {
   private stagehand: Stagehand;
   private modelName: AvailableModel;
   private modelClientOptions: ClientOptions | { apiKey: string };
   private silentLogger: (message: LogLine) => void;
-  // Define regex patterns directly in the class or as constants if preferred elsewhere
-  private yesPattern = /^(YES|Y|TRUE|CORRECT|AFFIRMATIVE)/i;
-  private noPattern = /^(NO|N|FALSE|INCORRECT|NEGATIVE)/i;
 
   constructor(
     stagehand: Stagehand,
@@ -48,12 +52,11 @@ export class Evaluator {
 
   /**
    * Evaluates the current state of the page against a specific question.
-   * Expects a JSON object response: { "evaluation": "YES" | "NO", "reasoning": "..." }
+   * Uses structured response generation to ensure proper format.
    * Returns the evaluation result with normalized response and success status.
    *
    * @param options - The options for evaluation
    * @returns A promise that resolves to an EvaluationResult
-   * @throws Error if strictResponse is true and response is not clearly YES or NO, or if JSON parsing/validation fails.
    */
   async evaluate(options: EvaluateOptions): Promise<EvaluationResult> {
     const {
@@ -63,7 +66,6 @@ export class Evaluator {
           { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
           Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
       screenshotDelayMs = 1000,
-      strictResponse = false,
     } = options;
 
     await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
@@ -80,86 +82,53 @@ export class Evaluator {
       options: {
         messages: [
           { role: "system", content: systemPrompt },
-          { role: "user", content: question },
+          {
+            role: "user",
+            content: [
+              { type: "text", text: question },
+              {
+                type: "image_url",
+                image_url: {
+                  url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
+                },
+              },
+            ],
+          },
         ],
-        image: { buffer: imageBuffer },
+        response_model: {
+          name: "EvaluationResult",
+          schema: EvaluationSchema,
+        },
       },
     });
-    const rawResponse = response.data as unknown as string;
-    let evaluationResult: "YES" | "NO" | "INVALID" = "INVALID";
-    let reasoning = `Failed to process response. Raw response: ${rawResponse}`;
 
     try {
-      // Clean potential markdown fences
-      const cleanedResponse = rawResponse
-        .replace(/^```json\s*/, "")
-        .replace(/\s*```$/, "")
-        .trim();
-
-      // Attempt to parse the JSON object
-      const parsedResult: { evaluation: unknown; reasoning: unknown } =
-        JSON.parse(cleanedResponse);
-
-      // Validate structure
-      if (
-        typeof parsedResult !== "object" ||
-        parsedResult === null ||
-        typeof parsedResult.evaluation !== "string" ||
-        typeof parsedResult.reasoning !== "string"
-      ) {
-        throw new LLMResponseError(
-          "Evaluator",
-          `Invalid JSON structure received: ${JSON.stringify(parsedResult)}`,
-        );
-      }
-
-      const evaluationString = parsedResult.evaluation.trim().toUpperCase();
-      reasoning = parsedResult.reasoning.trim(); // Update reasoning from parsed object
-
-      // Use regex patterns to validate the evaluation string
-      const isYes = this.yesPattern.test(evaluationString);
-      const isNo = this.noPattern.test(evaluationString);
-
-      if (isYes) {
-        evaluationResult = "YES";
-      } else if (isNo) {
-        evaluationResult = "NO";
-      } else {
-        // Parsed JSON but evaluation value wasn't YES/NO variant
-        if (strictResponse) {
-          throw new LLMResponseError(
-            "Evaluator",
-            `Invalid evaluation value in JSON: ${parsedResult.evaluation}`,
-          );
-        }
-        // Keep INVALID, reasoning already updated
-        reasoning = `Invalid evaluation value: ${parsedResult.evaluation}. Reasoning: ${reasoning}`;
-      }
+      const result = response.data as unknown as z.infer<
+        typeof EvaluationSchema
+      >;
+
+      return {
+        evaluation: result.evaluation,
+        reasoning: result.reasoning,
+      };
     } catch (error) {
       const errorMessage =
         error instanceof Error ? error.message : String(error);
-      // Update reasoning with error details
-      reasoning = `Processing error: ${errorMessage}. Raw response: ${rawResponse}`;
-      if (strictResponse) {
-        // Re-throw error if in strict mode
-        throw new LLMResponseError("Evaluator", reasoning);
-      }
-      // Keep evaluationResult as "INVALID"
-    }
 
-    return {
-      evaluation: evaluationResult,
-      reasoning: reasoning,
-    };
+      return {
+        evaluation: "INVALID" as const,
+        reasoning: `Failed to get structured response: ${errorMessage}`,
+      };
+    }
   }
 
   /**
    * Evaluates the current state of the page against multiple questions in a single screenshot.
+   * Uses structured response generation to ensure proper format.
    * Returns an array of evaluation results.
    *
    * @param options - The options for batch evaluation
    * @returns A promise that resolves to an array of EvaluationResults
-   * @throws Error if strictResponse is true and any response is not clearly YES or NO
    */
   async batchEvaluate(
     options: BatchEvaluateOptions,
@@ -171,7 +140,6 @@ export class Evaluator {
           { "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
           Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
       screenshotDelayMs = 1000,
-      strictResponse = false,
     } = options;
 
     // Wait for the specified delay before taking screenshot
@@ -204,125 +172,55 @@ export class Evaluator {
           },
           {
             role: "user",
-            content: formattedQuestions,
+            content: [
+              { type: "text", text: formattedQuestions },
+              {
+                type: "image_url",
+                image_url: {
+                  url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
+                },
+              },
+            ],
           },
         ],
-        image: {
-          buffer: imageBuffer,
+        response_model: {
+          name: "BatchEvaluationResult",
+          schema: BatchEvaluationSchema,
         },
       },
     });
 
-    const rawResponse = response.data as unknown as string;
-    let finalResults: EvaluationResult[] = [];
-
     try {
-      // Clean potential markdown fences
-      const cleanedResponse = rawResponse
-        .replace(/^```json\s*/, "")
-        .replace(/\s*```$/, "")
-        .trim();
-
-      // Attempt to parse the JSON array
-      const parsedResults: { evaluation: unknown; reasoning: unknown }[] =
-        JSON.parse(cleanedResponse);
-
-      if (!Array.isArray(parsedResults)) {
-        throw new LLMResponseError(
-          "Evaluator",
-          "Response is not a JSON array.",
-        );
-      }
-
-      if (parsedResults.length !== questions.length && strictResponse) {
-        throw new LLMResponseError(
-          "Evaluator",
-          `Expected ${questions.length} results, but got ${parsedResults.length}`,
-        );
-      }
+      const results = response.data as unknown as z.infer<
+        typeof BatchEvaluationSchema
+      >;
 
+      // Pad with INVALID results if we got fewer than expected
+      const finalResults: EvaluationResult[] = [];
       for (let i = 0; i < questions.length; i++) {
-        if (i < parsedResults.length) {
-          const item = parsedResults[i];
-          // Ensure item is an object and has the required properties
-          if (
-            typeof item !== "object" ||
-            item === null ||
-            typeof item.evaluation !== "string" ||
-            typeof item.reasoning !== "string"
-          ) {
-            if (strictResponse) {
-              throw new LLMResponseError(
-                "Evaluator",
-                `Invalid object structure for question ${i + 1}: ${JSON.stringify(item)}`,
-              );
-            }
-            finalResults.push({
-              evaluation: "INVALID",
-              reasoning: `Invalid object structure received: ${JSON.stringify(
-                item,
-              )}`,
-            });
-            continue; // Move to the next question
-          }
-
-          // Use regex patterns for validation
-          const evaluationString = item.evaluation.trim().toUpperCase();
-          const reasoning = item.reasoning.trim();
-          const isYes = this.yesPattern.test(evaluationString);
-          const isNo = this.noPattern.test(evaluationString);
-
-          if (isYes) {
-            finalResults.push({ evaluation: "YES", reasoning: reasoning });
-          } else if (isNo) {
-            finalResults.push({ evaluation: "NO", reasoning: reasoning });
-          } else {
-            // Invalid evaluation value
-            if (strictResponse) {
-              throw new LLMResponseError(
-                "Evaluator",
-                `Invalid evaluation value for question ${i + 1}: ${item.evaluation}`,
-              );
-            }
-            finalResults.push({
-              evaluation: "INVALID",
-              reasoning: `Invalid evaluation value: ${item.evaluation}. Reasoning: ${reasoning}`,
-            });
-          }
+        if (i < results.length) {
+          finalResults.push({
+            evaluation: results[i].evaluation,
+            reasoning: results[i].reasoning,
+          });
         } else {
-          // Missing result for this question
-          if (strictResponse) {
-            throw new LLMResponseError(
-              "Evaluator",
-              `No response found for question ${i + 1}`,
-            );
-          }
           finalResults.push({
             evaluation: "INVALID",
             reasoning: "No response found for this question.",
           });
         }
       }
+
+      return finalResults;
     } catch (error) {
       const errorMessage =
         error instanceof Error ? error.message : String(error);
-      // If JSON parsing fails or structure is wrong, handle based on strictResponse
-      if (strictResponse) {
-        throw new LLMResponseError(
-          "Evaluator",
-          `Failed to parse LLM response or invalid format: ${rawResponse}. Error: ${errorMessage}`,
-        );
-      }
+
       // Fallback: return INVALID for all questions
-      finalResults = []; // Clear any potentially partially filled results
-      for (let i = 0; i < questions.length; i++) {
-        finalResults.push({
-          evaluation: "INVALID",
-          reasoning: `Failed to parse response. Raw response: ${rawResponse}. Error: ${errorMessage}`,
-        });
-      }
+      return questions.map(() => ({
+        evaluation: "INVALID" as const,
+        reasoning: `Failed to get structured response: ${errorMessage}`,
+      }));
     }
-
-    return finalResults;
   }
 }
diff --git a/evals/tasks/agent/google_flights.ts b/evals/tasks/agent/google_flights.ts
@@ -28,7 +28,6 @@ export const google_flights: EvalFunction = async ({
     const result = await evaluator.evaluate({
       question:
         "Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
-      strictResponse: true,
     });
 
     if (result.evaluation !== "YES" && result.evaluation !== "NO") {
diff --git a/evals/tasks/agent/google_maps.ts b/evals/tasks/agent/google_maps.ts
@@ -28,7 +28,6 @@ export const google_maps: EvalFunction = async ({
     const result = await evaluator.evaluate({
       question:
         "Does the page show the time it takes to drive from San Francisco to New York at all?",
-      strictResponse: true,
     });
 
     if (result.evaluation !== "YES" && result.evaluation !== "NO") {
diff --git a/evals/tasks/agent/google_maps_2.ts b/evals/tasks/agent/google_maps_2.ts
@@ -29,7 +29,6 @@ export const google_maps_2: EvalFunction = async ({
     const result = await evaluator.evaluate({
       question:
         "Does the page show the fastest walking route from La Puerta de Alcalá to La Puerta del Sol? Does the distance between the two points show as 1.5 km?",
-      strictResponse: true,
     });
     const { distance } = await stagehand.page.extract({
       modelName: "google/gemini-2.5-flash",
diff --git a/evals/tasks/agent/iframe_form.ts b/evals/tasks/agent/iframe_form.ts
@@ -26,7 +26,6 @@ export const iframe_form: EvalFunction = async ({
     const evaluator = new Evaluator(stagehand);
     const result = await evaluator.evaluate({
       question: "Is the form name input filled with 'John Smith'?",
-      strictResponse: true,
     });
 
     if (result.evaluation !== "YES" && result.evaluation !== "NO") {
@@ -48,7 +47,6 @@ export const iframe_form: EvalFunction = async ({
     await stagehand.page.mouse.wheel(0, -1000);
     const result2 = await evaluator.evaluate({
       question: "Is the form email input filled with 'john.smith@example.com'?",
-      strictResponse: true,
     });
 
     if (result2.evaluation !== "YES" && result2.evaluation !== "NO") {
diff --git a/evals/tasks/agent/iframe_form_multiple.ts b/evals/tasks/agent/iframe_form_multiple.ts
@@ -31,7 +31,6 @@ export const iframe_form_multiple: EvalFunction = async ({
         "Is the form email input filled with 'john.smith@example.com'?",
         "Is the 'Are you the domain owner?' option selected as 'No'?",
       ],
-      strictResponse: true,
     });
 
     for (const r of results) {
diff --git a/evals/tasks/agent/sf_library_card.ts b/evals/tasks/agent/sf_library_card.ts
@@ -29,7 +29,6 @@ export const sf_library_card: EvalFunction = async ({
     const result = await evaluator.evaluate({
       question:
         "Does the page show the 'Residential Address' field filled with '166 Geary St'?",
-      strictResponse: true,
     });
 
     if (result.evaluation !== "YES" && result.evaluation !== "NO") {
diff --git a/evals/tasks/agent/sf_library_card_multiple.ts b/evals/tasks/agent/sf_library_card_multiple.ts
diff --git a/types/evaluator.ts b/types/evaluator.ts

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@browserbasehq/stagehand": patch
 +---
++
 +Properly handle images in evaluator + clean up response parsing logic