Skip to content

Commit 430db8c

Browse files
add screenshots to evals (#1039)
# why: We need screenshots to better evaluate agent performance. # what changed: Added a screenshot collection service that runs alongside the agent during an eval. # test plan: Evals were run locally.
1 parent 88d1565 commit 430db8c

File tree

4 files changed

+220
-8
lines changed

4 files changed

+220
-8
lines changed

evals/evaluator.ts

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,7 @@ export class Evaluator {
5353
question,
5454
answer,
5555
screenshot = true,
56-
systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
57-
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
58-
Today's date is ${new Date().toLocaleDateString()}`,
56+
systemPrompt,
5957
screenshotDelayMs = 250,
6058
} = options;
6159
if (!question) {
@@ -65,6 +63,20 @@ export class Evaluator {
6563
throw new Error("Either answer (text) or screenshot must be provided");
6664
}
6765

66+
// Handle multiple screenshots case
67+
if (Array.isArray(screenshot)) {
68+
return this._evaluateWithMultipleScreenshots({
69+
question,
70+
screenshots: screenshot,
71+
systemPrompt,
72+
});
73+
}
74+
75+
// Single screenshot case (existing logic)
76+
const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
77+
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
78+
Today's date is ${new Date().toLocaleDateString()}`;
79+
6880
await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
6981
let imageBuffer: Buffer;
7082
if (screenshot) {
@@ -81,7 +93,7 @@ export class Evaluator {
8193
logger: this.silentLogger,
8294
options: {
8395
messages: [
84-
{ role: "system", content: systemPrompt },
96+
{ role: "system", content: systemPrompt || defaultSystemPrompt },
8597
{
8698
role: "user",
8799
content: [
@@ -240,4 +252,82 @@ export class Evaluator {
240252
}));
241253
}
242254
}
255+
256+
/**
 * Asks the evaluator model a YES/NO question about a sequence of screenshots.
 *
 * All screenshots are sent in a single user message, after a text part that
 * contains the question and states how many screenshots follow.
 *
 * @param options.question - The question to evaluate; must be non-empty.
 * @param options.screenshots - Encoded images, in the order they should be
 *   presented to the model; must contain at least one entry.
 * @param options.systemPrompt - Optional system prompt. When `undefined`, a
 *   built-in prompt describing the multi-screenshot task is used.
 * @returns The model's evaluation and reasoning, or an `"INVALID"` evaluation
 *   when the structured response cannot be read.
 * @throws Error when `question` is empty or `screenshots` is missing/empty.
 */
private async _evaluateWithMultipleScreenshots(options: {
  question: string;
  screenshots: Buffer[];
  systemPrompt?: string;
}): Promise<EvaluationResult> {
  const {
    question,
    screenshots,
    systemPrompt = [
      "You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.",
      "Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.",
      "Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).",
      "Be critical about the question but consider the ENTIRE sequence when making your determination.",
      `Today's date is ${new Date().toLocaleDateString()}`,
    ].join("\n"),
  } = options;

  if (!question) {
    throw new Error("Question cannot be an empty string");
  }

  if (!screenshots || screenshots.length === 0) {
    throw new Error("At least one screenshot must be provided");
  }

  const llmClient = this.stagehand.llmProvider.getClient(
    this.modelName,
    this.modelClientOptions,
  );

  // Label each data URL with the format the bytes actually carry. Playwright
  // screenshots are PNG unless another type is requested, so hard-coding
  // "image/jpeg" mislabels them. Anything that is not recognisably PNG keeps
  // the previous "image/jpeg" label.
  const pngSignature = Buffer.from([
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
  ]);
  const mimeTypeOf = (image: Buffer): string =>
    image.subarray(0, pngSignature.length).equals(pngSignature)
      ? "image/png"
      : "image/jpeg";

  const imageContents = screenshots.map((screenshot) => ({
    type: "image_url" as const,
    image_url: {
      url: `data:${mimeTypeOf(screenshot)};base64,${screenshot.toString("base64")}`,
    },
  }));

  const response = await llmClient.createChatCompletion<
    LLMParsedResponse<LLMResponse>
  >({
    logger: this.silentLogger,
    options: {
      messages: [
        { role: "system", content: systemPrompt },
        {
          role: "user",
          content: [
            {
              type: "text",
              text: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
            },
            ...imageContents,
          ],
        },
      ],
      response_model: {
        name: "EvaluationResult",
        schema: EvaluationSchema,
      },
    },
  });

  try {
    // Reading fields off a missing `data` payload throws; that case is
    // reported as an INVALID evaluation rather than propagated.
    const result = response.data as unknown as z.infer<
      typeof EvaluationSchema
    >;
    return { evaluation: result.evaluation, reasoning: result.reasoning };
  } catch (error) {
    const errorMessage =
      error instanceof Error ? error.message : String(error);
    return {
      evaluation: "INVALID" as const,
      reasoning: `Failed to get structured response: ${errorMessage}`,
    };
  }
}
243333
}

evals/tasks/agent/webvoyager.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { EvalFunction } from "@/types/evals";
22
import { Evaluator } from "../../evaluator";
3+
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
34

45
export const webvoyager: EvalFunction = async ({
56
stagehand,
@@ -35,20 +36,39 @@ export const webvoyager: EvalFunction = async ({
3536
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}`,
3637
});
3738

39+
// Start collecting screenshots in parallel
40+
const screenshotCollector = new ScreenshotCollector(stagehand.page, {
41+
interval: 2000, // Capture every 2 seconds
42+
maxScreenshots: 10, // Keep last 10 screenshots
43+
captureOnNavigation: true, // Also capture on page navigation
44+
});
45+
46+
screenshotCollector.start();
47+
3848
await agent.execute({
3949
instruction: params.ques,
4050
maxSteps: 50,
4151
});
4252

53+
// Stop collecting and get all screenshots
54+
const screenshots = screenshotCollector.stop();
55+
56+
logger.log({
57+
category: "evaluation",
58+
message: `Collected ${screenshots.length} screenshots for evaluation`,
59+
level: 1,
60+
});
61+
4362
const evaluator = new Evaluator(stagehand);
4463
const evalResult = await evaluator.ask({
45-
question: `Did the agent successfully complete this task: "${params.ques}"? Look at the current state of the page to verify if the task was completed successfully.`,
46-
screenshot: true,
64+
question: `Did the agent successfully complete this task: "${params.ques}"? Look at all the screenshots showing the progression of the task to verify if it was completed successfully.`,
65+
screenshot: screenshots,
4766
});
4867

4968
return {
5069
_success: evalResult.evaluation === "YES",
5170
reasoning: evalResult.reasoning,
71+
screenshotCount: screenshots.length,
5272
debugUrl,
5373
sessionUrl,
5474
logs: logger.getLogs(),

evals/utils/ScreenshotCollector.ts

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import { Page } from "@playwright/test";
2+
3+
/** Configuration for {@link ScreenshotCollector}. */
export interface ScreenshotCollectorOptions {
  /**
   * Milliseconds between periodic captures. Falls back to 2000 when omitted
   * or not a positive finite number.
   */
  interval?: number;
  /**
   * Maximum number of screenshots retained; once exceeded, the oldest one is
   * dropped. Falls back to 10 when omitted or not a positive number.
   */
  maxScreenshots?: number;
  /**
   * Whether to also capture when the page emits "load" and
   * "domcontentloaded". Defaults to true.
   */
  captureOnNavigation?: boolean;
}

/**
 * Collects screenshots of a page in the background: one immediately on
 * `start()`, one per timer tick, and optionally one per navigation event.
 * Only the most recent `maxScreenshots` captures are kept.
 *
 * Captures never overlap: a capture requested while another is still running
 * is skipped.
 */
export class ScreenshotCollector {
  private screenshots: Buffer[] = [];
  private readonly page: Page;
  private readonly interval: number;
  private readonly maxScreenshots: number;
  private readonly captureOnNavigation: boolean;
  private intervalId?: ReturnType<typeof setInterval>;
  private navigationListeners: Array<() => void> = [];
  private isCapturing = false;
  /** Promise of the most recently started capture (already settled when idle). */
  private lastCapture: Promise<void> = Promise.resolve();

  /**
   * @param page - Page to screenshot and to listen on for navigation events.
   * @param options - Optional tuning; see {@link ScreenshotCollectorOptions}.
   */
  constructor(page: Page, options: ScreenshotCollectorOptions = {}) {
    const { interval, maxScreenshots, captureOnNavigation } = options;
    this.page = page;
    // Reject 0, negatives, NaN and Infinity: none of them is a usable timer delay.
    this.interval =
      typeof interval === "number" && Number.isFinite(interval) && interval > 0
        ? interval
        : 2000;
    // Infinity stays allowed here: it simply means "never trim".
    this.maxScreenshots =
      typeof maxScreenshots === "number" && maxScreenshots > 0
        ? maxScreenshots
        : 10;
    this.captureOnNavigation = captureOnNavigation ?? true;
  }

  /**
   * Starts the periodic timer, attaches navigation listeners (when enabled)
   * and kicks off an initial capture. Calling it while already started is a
   * no-op.
   */
  start(): void {
    if (this.intervalId) {
      return;
    }

    this.intervalId = setInterval(() => {
      void this.captureScreenshot("interval");
    }, this.interval);

    if (this.captureOnNavigation) {
      const loadListener = (): void => {
        void this.captureScreenshot("load");
      };
      const domContentListener = (): void => {
        void this.captureScreenshot("domcontentloaded");
      };

      this.page.on("load", loadListener);
      this.page.on("domcontentloaded", domContentListener);

      // Keep matching teardown callbacks so the exact listeners can be removed.
      this.navigationListeners = [
        () => this.page.off("load", loadListener),
        () => this.page.off("domcontentloaded", domContentListener),
      ];
    }

    void this.captureScreenshot("initial");
  }

  /**
   * Stops the timer and listeners and synchronously returns a copy of the
   * screenshots collected so far.
   *
   * A "final" capture is started but NOT awaited, so it is never part of the
   * returned array (and is skipped entirely if another capture is running).
   * Use {@link stopAndCapture} when the final page state must be included.
   */
  stop(): Buffer[] {
    this.detach();
    void this.captureScreenshot("final");
    return this.getScreenshots();
  }

  /**
   * Stops the timer and listeners, waits for any capture still in flight,
   * takes one last screenshot and returns a copy of everything retained.
   * Unlike {@link stop}, the returned array includes the final capture
   * (unless that capture fails, in which case the failure is logged).
   */
  async stopAndCapture(): Promise<Buffer[]> {
    this.detach();
    // Let a running capture finish first; otherwise the overlap guard would
    // silently drop the final one.
    await this.lastCapture;
    await this.captureScreenshot("final");
    return this.getScreenshots();
  }

  /** Clears the periodic timer and removes the navigation listeners. */
  private detach(): void {
    if (this.intervalId) {
      clearInterval(this.intervalId);
      this.intervalId = undefined;
    }

    this.navigationListeners.forEach((removeListener) => removeListener());
    this.navigationListeners = [];
  }

  /**
   * Starts a capture unless one is already running.
   *
   * @param trigger - Label used only in log output.
   * @returns A promise that settles when the capture ends; it never rejects.
   */
  private captureScreenshot(trigger: string): Promise<void> {
    if (this.isCapturing) {
      return Promise.resolve();
    }

    this.lastCapture = this.runCapture(trigger);
    return this.lastCapture;
  }

  /** Takes one screenshot, stores it, and trims the buffer to `maxScreenshots`. */
  private async runCapture(trigger: string): Promise<void> {
    this.isCapturing = true;

    try {
      const screenshot = await this.page.screenshot();
      this.screenshots.push(screenshot);

      if (this.screenshots.length > this.maxScreenshots) {
        this.screenshots.shift();
      }

      console.log(
        `Screenshot captured (trigger: ${trigger}), total: ${this.screenshots.length}`,
      );
    } catch (error) {
      // Best effort: a failed capture is logged and collection continues.
      console.error(`Failed to capture screenshot (${trigger}):`, error);
    } finally {
      this.isCapturing = false;
    }
  }

  /** Returns a shallow copy of the retained screenshots, oldest first. */
  getScreenshots(): Buffer[] {
    return [...this.screenshots];
  }

  /** Returns how many screenshots are currently retained. */
  getScreenshotCount(): number {
    return this.screenshots.length;
  }

  /** Discards all retained screenshots. */
  clear(): void {
    this.screenshots = [];
  }
}

types/evaluator.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ export type EvaluateOptions = {
33
question: string;
44
/** The answer to the question */
55
answer?: string;
6-
/** Whether to take a screenshot of the task state */
7-
screenshot?: boolean;
6+
/** Whether to take a screenshot of the task state, or array of screenshots to evaluate */
7+
screenshot?: boolean | Buffer[];
88
/** Custom system prompt for the evaluator */
99
systemPrompt?: string;
1010
/** Delay in milliseconds before taking the screenshot @default 250 */

0 commit comments

Comments
 (0)