Skip to content

Commit dc2d420

Browse files
img diff algo for screenshots (#1072)
# why Our existing screenshot service is a dummy, time-based triggered service. It also does not trigger based on any actions of the agent. # what changed Added an image-hash diff algorithm (quick check with MSE, verified with the SSIM algorithm) to detect whether there was an actual UI change, and only store a screenshot in the buffer if so. Added a screenshot interceptor which copies each screenshot the agent takes to a buffer (if it is different enough from the previous screenshot) to be used later for evals. - There's also a small refactor of the agent initialization config to enable the screenshot collector service to be attached # test plan Tests pass locally --------- Co-authored-by: Miguel <[email protected]> Co-authored-by: miguel <[email protected]>
1 parent 8c0fd01 commit dc2d420

24 files changed

+2821
-1815
lines changed

.changeset/curly-boats-push.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand-evals": patch
3+
---
4+
5+
improve the evals screenshot service - add image-hash diffing before storing screenshots, and switch to intercepting screenshots from the agent

evals/cli.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,6 @@ function handleRun(args: string[]): void {
381381
webbench: "agent/webbench",
382382
gaia: "agent/gaia",
383383
webvoyager: "agent/webvoyager",
384-
osworld: "agent/osworld",
385384
onlineMind2Web: "agent/onlineMind2Web",
386385
};
387386

evals/evaluator.ts

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ import {
1919
import { LLMParsedResponse } from "@/lib/inference";
2020
import { LLMResponse } from "@/lib/llm/LLMClient";
2121
import { LogLine } from "@/types/log";
22-
import { z } from "zod";
22+
import { z } from "zod/v3";
23+
import { imageResize } from "./utils/imageUtils";
2324

2425
dotenv.config();
2526

@@ -292,17 +293,36 @@ export class Evaluator {
292293
this.modelClientOptions,
293294
);
294295

295-
const imageContents = screenshots.map((screenshot) => ({
296+
//Downsize screenshots:
297+
const downsizedScreenshots = await Promise.all(
298+
screenshots.map(async (screenshot) => {
299+
return await imageResize(screenshot, 0.7);
300+
}),
301+
);
302+
303+
const imageContents = downsizedScreenshots.map((screenshot) => ({
296304
type: "image_url" as const,
297305
image_url: {
298-
url: `data:image/jpeg;base64,${screenshot.toString("base64")}`,
306+
url: `data:image/png;base64,${screenshot.toString("base64")}`,
299307
},
300308
}));
301309

310+
this.stagehand.logger?.({
311+
category: "evaluator",
312+
message: `Evaluating question: ${question} with ${screenshots.length} screenshots`,
313+
level: 2,
314+
auxiliary: {
315+
images: {
316+
value: JSON.stringify(imageContents),
317+
type: "object",
318+
},
319+
},
320+
});
321+
302322
const response = await llmClient.createChatCompletion<
303323
LLMParsedResponse<LLMResponse>
304324
>({
305-
logger: this.silentLogger,
325+
logger: this.stagehand.logger,
306326
options: {
307327
messages: [
308328
{ role: "system", content: systemPrompt },

evals/index.eval.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,8 @@ const generateFilteredTestcases = (): Testcase[] => {
300300
const braintrustProjectName =
301301
process.env.CI === "true" ? "stagehand" : "stagehand-dev";
302302

303+
const startTime = Date.now();
304+
303305
try {
304306
// Run the evaluations with the braintrust Eval function
305307
const evalResult = await Eval(braintrustProjectName, {
@@ -483,6 +485,9 @@ const generateFilteredTestcases = (): Testcase[] => {
483485

484486
// Generate and write the summary
485487
await generateSummary(summaryResults, experimentName);
488+
console.log(
489+
`\n⌛️Evaluation completed in ${(Date.now() - startTime) / 1000}s\n`,
490+
);
486491
} catch (error) {
487492
console.error("Error during evaluation run:", error);
488493
process.exit(1);

evals/initStagehand.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ export const initStagehand = async ({
117117
} else {
118118
agentConfig = {
119119
model: modelName,
120-
executionModel: "google/gemini-2.5-flash",
121120
} as AgentConfig;
122121
}
123122

evals/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
"e2e:local": "pnpm run build && playwright test --config deterministic/local.playwright.config.ts"
1313
},
1414
"dependencies": {
15-
"@browserbasehq/stagehand": "workspace:*"
15+
"@browserbasehq/stagehand": "workspace:*",
16+
"sharp": "^0.33.5"
1617
},
1718
"devDependencies": {
1819
"@types/papaparse": "^5.3.16",

evals/tasks/agent/gaia.ts

Lines changed: 60 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import { EvalFunction } from "@/types/evals";
22
import { Evaluator } from "../../evaluator";
3+
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
4+
import { loadApiKeyFromEnv } from "@/lib/utils";
5+
import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
6+
import dotenv from "dotenv";
37

8+
dotenv.config();
49
/**
510
* Data-driven GAIA agent eval
611
* - Expects per-test params injected via eval runner: { id, level, web, ques }
@@ -14,25 +19,20 @@ export const gaia: EvalFunction = async ({
1419
debugUrl,
1520
sessionUrl,
1621
input,
17-
agent,
22+
modelName,
1823
}) => {
24+
const startTime = Date.now();
25+
1926
try {
2027
const params = ((input && input.params) || {}) as {
2128
id?: string;
2229
level?: number;
2330
web?: string;
2431
ques?: string;
32+
expected?: string;
2533
};
2634

2735
if (!params.web || !params.ques) {
28-
logger.error({
29-
category: "gaia",
30-
level: 0,
31-
message: `Missing GAIA params (web, ques).`,
32-
auxiliary: {
33-
params: { value: JSON.stringify(params), type: "object" },
34-
},
35-
});
3636
return {
3737
_success: false,
3838
error: `Missing GAIA params (web, ques). Got: ${JSON.stringify(params)}`,
@@ -41,53 +41,74 @@ export const gaia: EvalFunction = async ({
4141
logs: logger.getLogs(),
4242
};
4343
}
44-
await stagehand.page.goto(params.web);
4544

46-
const result = await agent.execute({
45+
await stagehand.page.goto(params.web, {
46+
timeout: 75_000,
47+
});
48+
49+
const provider =
50+
modelName in modelToAgentProviderMap
51+
? modelToAgentProviderMap[modelName]
52+
: undefined;
53+
54+
const agent = stagehand.agent({
55+
model: modelName,
56+
provider,
57+
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
58+
options: {
59+
apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
60+
},
61+
});
62+
63+
// Start collecting screenshots with hybrid approach
64+
const screenshotCollector = new ScreenshotCollector(stagehand.page, {
65+
maxScreenshots: 8, // Keep last 8 screenshots
66+
});
67+
68+
// Set the collector on the agent so it captures screenshots
69+
if (agent.setScreenshotCollector) {
70+
agent.setScreenshotCollector(screenshotCollector);
71+
}
72+
73+
screenshotCollector.start();
74+
75+
const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
76+
const agentResult = await agent.execute({
4777
instruction: params.ques,
48-
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
78+
maxSteps,
79+
});
80+
// Stop collecting and get all screenshots
81+
const screenshots = screenshotCollector.stop();
82+
83+
logger.log({
84+
category: "evaluation",
85+
message: `Collected ${screenshots.length} screenshots for evaluation`,
86+
level: 1,
4987
});
5088

51-
const expected = (params as Record<string, unknown>).expected as
52-
| string
53-
| undefined;
89+
const expected = params.expected;
5490
const evaluator = new Evaluator(stagehand);
5591
const evalResult = await evaluator.ask({
5692
question: `Did the agent provide the expected answer: "${expected}"?`,
57-
answer: result?.message || "",
58-
screenshot: false,
93+
answer: agentResult.message || "",
94+
screenshot: screenshots,
5995
});
6096

6197
return {
6298
_success: evalResult.evaluation === "YES",
6399
reasoning: evalResult.reasoning,
64100
expectedAnswer: expected,
101+
final_answer: agentResult?.message,
102+
screenshotCount: screenshots.length,
103+
task_level: params.level,
104+
execution_time: Date.now() - startTime,
65105
debugUrl,
66106
sessionUrl,
67107
logs: logger.getLogs(),
68108
};
69109
} catch (error) {
70-
logger.error({
71-
category: "gaia",
72-
level: 0,
73-
message: `Unhandled error in GAIA task`,
74-
auxiliary: {
75-
error: {
76-
value: error instanceof Error ? error.message : String(error),
77-
type: "string",
78-
},
79-
trace: {
80-
value: error instanceof Error && error.stack ? error.stack : "",
81-
type: "string",
82-
},
83-
},
84-
});
85-
return {
86-
_success: false,
87-
error,
88-
debugUrl,
89-
sessionUrl,
90-
logs: logger.getLogs(),
91-
};
110+
// Let the error propagate - the parent runner will handle cleanup
111+
console.error(error);
112+
throw error;
92113
}
93114
};

evals/tasks/agent/google_maps_2.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { EvalFunction } from "@/types/evals";
22
import { Evaluator } from "../../evaluator";
3-
import { z } from "zod";
3+
import { z } from "zod/v3";
44

55
export const google_maps_2: EvalFunction = async ({
66
debugUrl,

evals/tasks/agent/onlineMind2Web.ts

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,29 @@
11
import { EvalFunction } from "@/types/evals";
22
import { Evaluator } from "../../evaluator";
33
import { ScreenshotCollector } from "../../utils/ScreenshotCollector";
4+
import { modelToAgentProviderMap } from "@/lib/agent/AgentProvider";
5+
import { loadApiKeyFromEnv } from "@/lib/utils";
46
import dotenv from "dotenv";
5-
import fs from "fs";
6-
dotenv.config();
77

8+
dotenv.config();
9+
/**
10+
* Data-driven OnlineMind2Web agent eval
11+
* - Expects per-test params injected via eval runner: { task_id, confirmed_task, website, reference_length, level }
12+
* - Starts at `website`, runs the agent with `confirmed_task` as instruction
13+
* - Requires the agent to output a final answer in the form: "Final Answer: <value>"
14+
* - Marks success if such an answer string is present (exact matching against dataset can be layered later)
15+
* - Uses the evaluator to determine if the agent successfully completed the task
16+
*/
817
export const onlineMind2Web: EvalFunction = async ({
918
stagehand,
1019
logger,
1120
debugUrl,
1221
sessionUrl,
1322
input,
14-
agent,
23+
modelName,
1524
}) => {
25+
const startTime = Date.now();
26+
1627
try {
1728
const params = ((input && input.params) || {}) as {
1829
task_id?: string;
@@ -33,25 +44,42 @@ export const onlineMind2Web: EvalFunction = async ({
3344
}
3445

3546
await stagehand.page.goto(params.website, {
36-
timeout: 60_000,
47+
timeout: 75_000,
3748
});
3849

39-
const screenshot = await stagehand.page.screenshot();
40-
fs.writeFileSync("screenshot.png", screenshot);
50+
const provider =
51+
modelName in modelToAgentProviderMap
52+
? modelToAgentProviderMap[modelName]
53+
: undefined;
54+
55+
const agent = stagehand.agent({
56+
model: modelName,
57+
provider,
58+
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}. ALWAYS OPERATE WITHIN THE PAGE OPENED BY THE USER, WHICHEVER TASK YOU ARE ATTEMPTING TO COMPLETE CAN BE ACCOMPLISHED WITHIN THE PAGE.`,
59+
options: {
60+
apiKey: loadApiKeyFromEnv(provider, stagehand.logger),
61+
},
62+
});
4163

4264
// Start collecting screenshots in parallel
4365
const screenshotCollector = new ScreenshotCollector(stagehand.page, {
44-
maxScreenshots: 5, // Keep up to the last 5 screenshots
45-
captureOnNavigation: true, // Also capture on page navigation
66+
maxScreenshots: 8, // Keep up to the last 8 screenshots
4667
});
4768

69+
// Set the collector on the agent so it captures screenshots
70+
if (agent.setScreenshotCollector) {
71+
agent.setScreenshotCollector(screenshotCollector);
72+
}
73+
4874
screenshotCollector.start();
4975

76+
const maxSteps = Number(process.env.AGENT_EVAL_MAX_STEPS) || 50;
5077
const agentResult = await agent.execute({
5178
instruction: params.confirmed_task,
52-
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
79+
maxSteps,
5380
});
5481

82+
logger.log(agentResult);
5583
// Stop collecting and get all screenshots
5684
const screenshots = screenshotCollector.stop();
5785

@@ -63,7 +91,7 @@ export const onlineMind2Web: EvalFunction = async ({
6391

6492
const evaluator = new Evaluator(stagehand);
6593
const evalResult = await evaluator.ask({
66-
question: `Did the agent successfully complete this task: "${params.confirmed_task}"?`,
94+
question: `Did the agent successfully complete this task: "${params.confirmed_task}"? The task might be a bit outdated or impossible to complete, in those cases lean towards YES.`,
6795
screenshot: screenshots,
6896
agentReasoning:
6997
agentResult.message ||
@@ -73,19 +101,17 @@ export const onlineMind2Web: EvalFunction = async ({
73101
return {
74102
_success: evalResult.evaluation === "YES",
75103
reasoning: evalResult.reasoning,
76-
// screenshotCount: screenshots.length,
104+
final_answer: agentResult?.message,
105+
screenshotCount: screenshots.length,
77106
task_level: params.level,
107+
execution_time: Date.now() - startTime,
78108
debugUrl,
79109
sessionUrl,
80110
logs: logger.getLogs(),
81111
};
82112
} catch (error) {
83-
return {
84-
_success: false,
85-
error,
86-
debugUrl,
87-
sessionUrl,
88-
logs: logger.getLogs(),
89-
};
113+
// Let the error propagate - the parent runner will handle cleanup
114+
console.error(error);
115+
throw error;
90116
}
91117
};

0 commit comments

Comments
 (0)