add screenshots to evals (#1039)

filip-michalsky · web-flow · commit 430db8c20d5c · 2025-08-29T15:52:21.000-04:00
# why

We need screenshots to better evaluate agent performance.

# what changed
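Added a screenshot collector that runs alongside the agent, capturing screenshots periodically and on page navigation, and passes them to the evaluator.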

# test plan
Ran the evals locally.
diff --git a/evals/evaluator.ts b/evals/evaluator.ts
@@ -53,9 +53,7 @@ export class Evaluator {
       question,
       answer,
       screenshot = true,
-      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
-          Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
-          Today's date is ${new Date().toLocaleDateString()}`,
+      systemPrompt,
       screenshotDelayMs = 250,
     } = options;
     if (!question) {
@@ -65,6 +63,20 @@ export class Evaluator {
       throw new Error("Either answer (text) or screenshot must be provided");
     }
 
+    // Handle multiple screenshots case
+    if (Array.isArray(screenshot)) {
+      return this._evaluateWithMultipleScreenshots({
+        question,
+        screenshots: screenshot,
+        systemPrompt,
+      });
+    }
+
+    // Single screenshot case (existing logic)
+    const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
+          Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
+          Today's date is ${new Date().toLocaleDateString()}`;
+
     await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
     let imageBuffer: Buffer;
     if (screenshot) {
@@ -81,7 +93,7 @@ export class Evaluator {
       logger: this.silentLogger,
       options: {
         messages: [
-          { role: "system", content: systemPrompt },
+          { role: "system", content: systemPrompt || defaultSystemPrompt },
           {
             role: "user",
             content: [
@@ -240,4 +252,82 @@ export class Evaluator {
       }));
     }
   }
+
+  /** Sends `question` and every buffer in `screenshots` to the LLM as one chat completion and returns its `{ evaluation, reasoning }`.
+   * Throws when `question` is empty or `screenshots` is missing/empty. `systemPrompt` overrides the built-in multi-screenshot prompt.
+   */
+  private async _evaluateWithMultipleScreenshots(options: {
+    question: string;
+    screenshots: Buffer[];
+    systemPrompt?: string;
+  }): Promise<EvaluationResult> {
+    const {
+      question,
+      screenshots,
+      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
+        Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
+        Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
+        Be critical about the question but consider the ENTIRE sequence when making your determination.
+        Today's date is ${new Date().toLocaleDateString()}`, // default applies only when options.systemPrompt is undefined
+    } = options;
+
+    if (!question) {
+      throw new Error("Question cannot be an empty string");
+    }
+
+    if (!screenshots || screenshots.length === 0) {
+      throw new Error("At least one screenshot must be provided");
+    }
+
+    const llmClient = this.stagehand.llmProvider.getClient(
+      this.modelName,
+      this.modelClientOptions,
+    );
+
+    const imageContents = screenshots.map((screenshot) => ({
+      type: "image_url" as const,
+      image_url: {
+        url: `data:image/jpeg;base64,${screenshot.toString("base64")}`, // NOTE(review): every buffer is labelled JPEG regardless of its real encoding — confirm the capture format matches
+      },
+    }));
+
+    const response = await llmClient.createChatCompletion< // awaited before the try below, so a rejection here propagates to the caller
+      LLMParsedResponse<LLMResponse>
+    >({
+      logger: this.silentLogger,
+      options: {
+        messages: [
+          { role: "system", content: systemPrompt },
+          {
+            role: "user",
+            content: [
+              {
+                type: "text",
+                text: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
+              },
+              ...imageContents, // images follow the text part, in the order they were passed in
+            ],
+          },
+        ],
+        response_model: {
+          name: "EvaluationResult",
+          schema: EvaluationSchema,
+        },
+      },
+    });
+
+    try { // NOTE(review): the cast below does no runtime validation; only a failed property access (e.g. response.data being null/undefined) reaches the catch
+      const result = response.data as unknown as z.infer<
+        typeof EvaluationSchema
+      >;
+      return { evaluation: result.evaluation, reasoning: result.reasoning };
+    } catch (error) {
+      const errorMessage =
+        error instanceof Error ? error.message : String(error);
+      return {
+        evaluation: "INVALID" as const,
+        reasoning: `Failed to get structured response: ${errorMessage}`,
+      };
+    }
+  }
 }
diff --git a/evals/tasks/agent/webvoyager.ts b/evals/tasks/agent/webvoyager.ts
@@ -1,5 +1,6 @@
 import { EvalFunction } from "@/types/evals";
 import { Evaluator } from "../../evaluator";
+import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
 
 export const webvoyager: EvalFunction = async ({
   stagehand,
@@ -35,20 +36,39 @@ export const webvoyager: EvalFunction = async ({
       instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}`,
     });
 
+    // Start collecting screenshots in parallel
+    const screenshotCollector = new ScreenshotCollector(stagehand.page, {
+      interval: 2000, // Capture every 2 seconds
+      maxScreenshots: 10, // Keep last 10 screenshots
+      captureOnNavigation: true, // Also capture on page navigation
+    });
+
+    screenshotCollector.start();
+
     await agent.execute({
       instruction: params.ques,
       maxSteps: 50,
     });
 
+    // Stop collecting and get all screenshots
+    const screenshots = screenshotCollector.stop();
+
+    logger.log({
+      category: "evaluation",
+      message: `Collected ${screenshots.length} screenshots for evaluation`,
+      level: 1,
+    });
+
     const evaluator = new Evaluator(stagehand);
     const evalResult = await evaluator.ask({
-      question: `Did the agent successfully complete this task: "${params.ques}"? Look at the current state of the page to verify if the task was completed successfully.`,
-      screenshot: true,
+      question: `Did the agent successfully complete this task: "${params.ques}"? Look at all the screenshots showing the progression of the task to verify if it was completed successfully.`,
+      screenshot: screenshots,
     });
 
     return {
       _success: evalResult.evaluation === "YES",
       reasoning: evalResult.reasoning,
+      screenshotCount: screenshots.length,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
diff --git a/evals/utils/ScreenshotCollector.ts b/evals/utils/ScreenshotCollector.ts
@@ -0,0 +1,102 @@
+import { Page } from "@playwright/test";
+
+export interface ScreenshotCollectorOptions { // Optional settings for ScreenshotCollector; every field has a constructor fallback.
+  interval?: number; // delay passed to setInterval between periodic captures; falls back to 2000 when omitted or 0
+  maxScreenshots?: number; // cap on retained screenshots; falls back to 10 when omitted or 0
+  captureOnNavigation?: boolean; // also capture on the page's "load" and "domcontentloaded" events; defaults to true
+}
+
+export class ScreenshotCollector { // Captures page screenshots on a timer (and optionally on navigation events), keeping only the most recent maxScreenshots.
+  private screenshots: Buffer[] = []; // oldest first; trimmed from the front in captureScreenshot
+  private page: Page;
+  private interval: number;
+  private maxScreenshots: number;
+  private captureOnNavigation: boolean;
+  private intervalId?: NodeJS.Timeout; // defined only while the periodic timer is running
+  private navigationListeners: Array<() => void> = []; // teardown callbacks that detach the page event handlers
+  private isCapturing: boolean = false; // true while a capture is in flight; overlapping triggers are dropped
+
+  constructor(page: Page, options: ScreenshotCollectorOptions = {}) {
+    this.page = page;
+    this.interval = options.interval || 2000; // `||` rather than `??`: an explicit 0 also falls back to the default
+    this.maxScreenshots = options.maxScreenshots || 10; // same fallback rule: 0 becomes 10
+    this.captureOnNavigation = options.captureOnNavigation ?? true;
+  }
+
+  start(): void { // Starts the timer and navigation listeners and kicks off an initial capture; no-op if already started.
+    if (this.intervalId) {
+      return;
+    }
+
+    this.intervalId = setInterval(async () => { // ticks that fire while a capture is still running are skipped by the isCapturing guard
+      await this.captureScreenshot("interval");
+    }, this.interval);
+
+    if (this.captureOnNavigation) {
+      const loadListener = () => this.captureScreenshot("load");
+      const domContentListener = () =>
+        this.captureScreenshot("domcontentloaded");
+
+      this.page.on("load", loadListener);
+      this.page.on("domcontentloaded", domContentListener);
+
+      this.navigationListeners = [ // keep the matching off() calls so stop() can detach exactly these handlers
+        () => this.page.off("load", loadListener),
+        () => this.page.off("domcontentloaded", domContentListener),
+      ];
+    }
+
+    this.captureScreenshot("initial"); // fire-and-forget: start() returns before this capture completes
+  }
+
+  stop(): Buffer[] { // Clears the timer and page listeners, then returns a copy of what has been captured so far.
+    if (this.intervalId) {
+      clearInterval(this.intervalId);
+      this.intervalId = undefined;
+    }
+
+    this.navigationListeners.forEach((removeListener) => removeListener());
+    this.navigationListeners = [];
+
+    this.captureScreenshot("final"); // NOTE(review): not awaited — it resolves after the return below, so the returned array never includes the "final" shot
+
+    return this.getScreenshots();
+  }
+
+  private async captureScreenshot(trigger: string): Promise<void> { // Takes one screenshot and appends it; `trigger` is used only in log messages.
+    if (this.isCapturing) { // skip rather than queue when another capture is running
+      return;
+    }
+
+    this.isCapturing = true;
+
+    try {
+      const screenshot = await this.page.screenshot();
+      this.screenshots.push(screenshot);
+
+      if (this.screenshots.length > this.maxScreenshots) { // drop the oldest entry once over the cap
+        this.screenshots.shift();
+      }
+
+      console.log(
+        `Screenshot captured (trigger: ${trigger}), total: ${this.screenshots.length}`,
+      );
+    } catch (error) { // capture failures are logged and not rethrown
+      console.error(`Failed to capture screenshot (${trigger}):`, error);
+    } finally {
+      this.isCapturing = false;
+    }
+  }
+
+  getScreenshots(): Buffer[] { // Returns a shallow copy of the array; the Buffer objects themselves are shared.
+    return [...this.screenshots];
+  }
+
+  getScreenshotCount(): number {
+    return this.screenshots.length;
+  }
+
+  clear(): void { // Discards stored screenshots; does not stop the timer or detach listeners.
+    this.screenshots = [];
+  }
+}
diff --git a/types/evaluator.ts b/types/evaluator.ts
@@ -3,8 +3,8 @@ export type EvaluateOptions = {
   question: string;
   /** The answer to the question */
   answer?: string;
-  /** Whether to take a screenshot of the task state */
-  screenshot?: boolean;
+  /** Whether to take a screenshot of the task state, or array of screenshots to evaluate */
+  screenshot?: boolean | Buffer[];
   /** Custom system prompt for the evaluator */
   systemPrompt?: string;
   /** Delay in milliseconds before taking the screenshot @default 250 */