Skip to content

Commit 0ead63d

Browse files
authored
Evaluator bug fix stg 667 (#963)
# why

Currently, we do not properly handle passing images from the evaluator to the LLM client.

# what changed

- We now provide images from the evaluator to the LLM in the correct format.
- Switched to generating a structured object (via `response_model` with a zod schema) in order to consolidate the complex response-parsing logic in the evaluator.

# test plan

Tested locally and on Browserbase.
1 parent 9f54bcf commit 0ead63d

File tree

10 files changed

+73
-188
lines changed

10 files changed

+73
-188
lines changed

.changeset/pretty-jokes-own.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
Properly handle images in evaluator + clean up response parsing logic

evals/evaluator.ts

Lines changed: 68 additions & 170 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import {
1010
ClientOptions,
1111
Stagehand,
1212
} from "@browserbasehq/stagehand";
13-
import { LLMResponseError } from "@/types/stagehandErrors";
1413
import dotenv from "dotenv";
1514
import {
1615
EvaluateOptions,
@@ -20,17 +19,22 @@ import {
2019
import { LLMParsedResponse } from "@/lib/inference";
2120
import { LLMResponse } from "@/lib/llm/LLMClient";
2221
import { LogLine } from "@/types/log";
22+
import { z } from "zod";
2323

2424
dotenv.config();
2525

26+
const EvaluationSchema = z.object({
27+
evaluation: z.enum(["YES", "NO"]),
28+
reasoning: z.string(),
29+
});
30+
31+
const BatchEvaluationSchema = z.array(EvaluationSchema);
32+
2633
export class Evaluator {
2734
private stagehand: Stagehand;
2835
private modelName: AvailableModel;
2936
private modelClientOptions: ClientOptions | { apiKey: string };
3037
private silentLogger: (message: LogLine) => void;
31-
// Define regex patterns directly in the class or as constants if preferred elsewhere
32-
private yesPattern = /^(YES|Y|TRUE|CORRECT|AFFIRMATIVE)/i;
33-
private noPattern = /^(NO|N|FALSE|INCORRECT|NEGATIVE)/i;
3438

3539
constructor(
3640
stagehand: Stagehand,
@@ -48,12 +52,11 @@ export class Evaluator {
4852

4953
/**
5054
* Evaluates the current state of the page against a specific question.
51-
* Expects a JSON object response: { "evaluation": "YES" | "NO", "reasoning": "..." }
55+
* Uses structured response generation to ensure proper format.
5256
* Returns the evaluation result with normalized response and success status.
5357
*
5458
* @param options - The options for evaluation
5559
* @returns A promise that resolves to an EvaluationResult
56-
* @throws Error if strictResponse is true and response is not clearly YES or NO, or if JSON parsing/validation fails.
5760
*/
5861
async evaluate(options: EvaluateOptions): Promise<EvaluationResult> {
5962
const {
@@ -63,7 +66,6 @@ export class Evaluator {
6366
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
6467
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
6568
screenshotDelayMs = 1000,
66-
strictResponse = false,
6769
} = options;
6870

6971
await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
@@ -80,86 +82,53 @@ export class Evaluator {
8082
options: {
8183
messages: [
8284
{ role: "system", content: systemPrompt },
83-
{ role: "user", content: question },
85+
{
86+
role: "user",
87+
content: [
88+
{ type: "text", text: question },
89+
{
90+
type: "image_url",
91+
image_url: {
92+
url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
93+
},
94+
},
95+
],
96+
},
8497
],
85-
image: { buffer: imageBuffer },
98+
response_model: {
99+
name: "EvaluationResult",
100+
schema: EvaluationSchema,
101+
},
86102
},
87103
});
88-
const rawResponse = response.data as unknown as string;
89-
let evaluationResult: "YES" | "NO" | "INVALID" = "INVALID";
90-
let reasoning = `Failed to process response. Raw response: ${rawResponse}`;
91104

92105
try {
93-
// Clean potential markdown fences
94-
const cleanedResponse = rawResponse
95-
.replace(/^```json\s*/, "")
96-
.replace(/\s*```$/, "")
97-
.trim();
98-
99-
// Attempt to parse the JSON object
100-
const parsedResult: { evaluation: unknown; reasoning: unknown } =
101-
JSON.parse(cleanedResponse);
102-
103-
// Validate structure
104-
if (
105-
typeof parsedResult !== "object" ||
106-
parsedResult === null ||
107-
typeof parsedResult.evaluation !== "string" ||
108-
typeof parsedResult.reasoning !== "string"
109-
) {
110-
throw new LLMResponseError(
111-
"Evaluator",
112-
`Invalid JSON structure received: ${JSON.stringify(parsedResult)}`,
113-
);
114-
}
115-
116-
const evaluationString = parsedResult.evaluation.trim().toUpperCase();
117-
reasoning = parsedResult.reasoning.trim(); // Update reasoning from parsed object
118-
119-
// Use regex patterns to validate the evaluation string
120-
const isYes = this.yesPattern.test(evaluationString);
121-
const isNo = this.noPattern.test(evaluationString);
122-
123-
if (isYes) {
124-
evaluationResult = "YES";
125-
} else if (isNo) {
126-
evaluationResult = "NO";
127-
} else {
128-
// Parsed JSON but evaluation value wasn't YES/NO variant
129-
if (strictResponse) {
130-
throw new LLMResponseError(
131-
"Evaluator",
132-
`Invalid evaluation value in JSON: ${parsedResult.evaluation}`,
133-
);
134-
}
135-
// Keep INVALID, reasoning already updated
136-
reasoning = `Invalid evaluation value: ${parsedResult.evaluation}. Reasoning: ${reasoning}`;
137-
}
106+
const result = response.data as unknown as z.infer<
107+
typeof EvaluationSchema
108+
>;
109+
110+
return {
111+
evaluation: result.evaluation,
112+
reasoning: result.reasoning,
113+
};
138114
} catch (error) {
139115
const errorMessage =
140116
error instanceof Error ? error.message : String(error);
141-
// Update reasoning with error details
142-
reasoning = `Processing error: ${errorMessage}. Raw response: ${rawResponse}`;
143-
if (strictResponse) {
144-
// Re-throw error if in strict mode
145-
throw new LLMResponseError("Evaluator", reasoning);
146-
}
147-
// Keep evaluationResult as "INVALID"
148-
}
149117

150-
return {
151-
evaluation: evaluationResult,
152-
reasoning: reasoning,
153-
};
118+
return {
119+
evaluation: "INVALID" as const,
120+
reasoning: `Failed to get structured response: ${errorMessage}`,
121+
};
122+
}
154123
}
155124

156125
/**
157126
* Evaluates the current state of the page against multiple questions in a single screenshot.
127+
* Uses structured response generation to ensure proper format.
158128
* Returns an array of evaluation results.
159129
*
160130
* @param options - The options for batch evaluation
161131
* @returns A promise that resolves to an array of EvaluationResults
162-
* @throws Error if strictResponse is true and any response is not clearly YES or NO
163132
*/
164133
async batchEvaluate(
165134
options: BatchEvaluateOptions,
@@ -171,7 +140,6 @@ export class Evaluator {
171140
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
172141
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
173142
screenshotDelayMs = 1000,
174-
strictResponse = false,
175143
} = options;
176144

177145
// Wait for the specified delay before taking screenshot
@@ -204,125 +172,55 @@ export class Evaluator {
204172
},
205173
{
206174
role: "user",
207-
content: formattedQuestions,
175+
content: [
176+
{ type: "text", text: formattedQuestions },
177+
{
178+
type: "image_url",
179+
image_url: {
180+
url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
181+
},
182+
},
183+
],
208184
},
209185
],
210-
image: {
211-
buffer: imageBuffer,
186+
response_model: {
187+
name: "BatchEvaluationResult",
188+
schema: BatchEvaluationSchema,
212189
},
213190
},
214191
});
215192

216-
const rawResponse = response.data as unknown as string;
217-
let finalResults: EvaluationResult[] = [];
218-
219193
try {
220-
// Clean potential markdown fences
221-
const cleanedResponse = rawResponse
222-
.replace(/^```json\s*/, "")
223-
.replace(/\s*```$/, "")
224-
.trim();
225-
226-
// Attempt to parse the JSON array
227-
const parsedResults: { evaluation: unknown; reasoning: unknown }[] =
228-
JSON.parse(cleanedResponse);
229-
230-
if (!Array.isArray(parsedResults)) {
231-
throw new LLMResponseError(
232-
"Evaluator",
233-
"Response is not a JSON array.",
234-
);
235-
}
236-
237-
if (parsedResults.length !== questions.length && strictResponse) {
238-
throw new LLMResponseError(
239-
"Evaluator",
240-
`Expected ${questions.length} results, but got ${parsedResults.length}`,
241-
);
242-
}
194+
const results = response.data as unknown as z.infer<
195+
typeof BatchEvaluationSchema
196+
>;
243197

198+
// Pad with INVALID results if we got fewer than expected
199+
const finalResults: EvaluationResult[] = [];
244200
for (let i = 0; i < questions.length; i++) {
245-
if (i < parsedResults.length) {
246-
const item = parsedResults[i];
247-
// Ensure item is an object and has the required properties
248-
if (
249-
typeof item !== "object" ||
250-
item === null ||
251-
typeof item.evaluation !== "string" ||
252-
typeof item.reasoning !== "string"
253-
) {
254-
if (strictResponse) {
255-
throw new LLMResponseError(
256-
"Evaluator",
257-
`Invalid object structure for question ${i + 1}: ${JSON.stringify(item)}`,
258-
);
259-
}
260-
finalResults.push({
261-
evaluation: "INVALID",
262-
reasoning: `Invalid object structure received: ${JSON.stringify(
263-
item,
264-
)}`,
265-
});
266-
continue; // Move to the next question
267-
}
268-
269-
// Use regex patterns for validation
270-
const evaluationString = item.evaluation.trim().toUpperCase();
271-
const reasoning = item.reasoning.trim();
272-
const isYes = this.yesPattern.test(evaluationString);
273-
const isNo = this.noPattern.test(evaluationString);
274-
275-
if (isYes) {
276-
finalResults.push({ evaluation: "YES", reasoning: reasoning });
277-
} else if (isNo) {
278-
finalResults.push({ evaluation: "NO", reasoning: reasoning });
279-
} else {
280-
// Invalid evaluation value
281-
if (strictResponse) {
282-
throw new LLMResponseError(
283-
"Evaluator",
284-
`Invalid evaluation value for question ${i + 1}: ${item.evaluation}`,
285-
);
286-
}
287-
finalResults.push({
288-
evaluation: "INVALID",
289-
reasoning: `Invalid evaluation value: ${item.evaluation}. Reasoning: ${reasoning}`,
290-
});
291-
}
201+
if (i < results.length) {
202+
finalResults.push({
203+
evaluation: results[i].evaluation,
204+
reasoning: results[i].reasoning,
205+
});
292206
} else {
293-
// Missing result for this question
294-
if (strictResponse) {
295-
throw new LLMResponseError(
296-
"Evaluator",
297-
`No response found for question ${i + 1}`,
298-
);
299-
}
300207
finalResults.push({
301208
evaluation: "INVALID",
302209
reasoning: "No response found for this question.",
303210
});
304211
}
305212
}
213+
214+
return finalResults;
306215
} catch (error) {
307216
const errorMessage =
308217
error instanceof Error ? error.message : String(error);
309-
// If JSON parsing fails or structure is wrong, handle based on strictResponse
310-
if (strictResponse) {
311-
throw new LLMResponseError(
312-
"Evaluator",
313-
`Failed to parse LLM response or invalid format: ${rawResponse}. Error: ${errorMessage}`,
314-
);
315-
}
218+
316219
// Fallback: return INVALID for all questions
317-
finalResults = []; // Clear any potentially partially filled results
318-
for (let i = 0; i < questions.length; i++) {
319-
finalResults.push({
320-
evaluation: "INVALID",
321-
reasoning: `Failed to parse response. Raw response: ${rawResponse}. Error: ${errorMessage}`,
322-
});
323-
}
220+
return questions.map(() => ({
221+
evaluation: "INVALID" as const,
222+
reasoning: `Failed to get structured response: ${errorMessage}`,
223+
}));
324224
}
325-
326-
return finalResults;
327225
}
328226
}

evals/tasks/agent/google_flights.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ export const google_flights: EvalFunction = async ({
2828
const result = await evaluator.evaluate({
2929
question:
3030
"Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
31-
strictResponse: true,
3231
});
3332

3433
if (result.evaluation !== "YES" && result.evaluation !== "NO") {

evals/tasks/agent/google_maps.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ export const google_maps: EvalFunction = async ({
2828
const result = await evaluator.evaluate({
2929
question:
3030
"Does the page show the time it takes to drive from San Francisco to New York at all?",
31-
strictResponse: true,
3231
});
3332

3433
if (result.evaluation !== "YES" && result.evaluation !== "NO") {

evals/tasks/agent/google_maps_2.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ export const google_maps_2: EvalFunction = async ({
2929
const result = await evaluator.evaluate({
3030
question:
3131
"Does the page show the fastest walking route from La Puerta de Alcalá to La Puerta del Sol? Does the distance between the two points show as 1.5 km?",
32-
strictResponse: true,
3332
});
3433
const { distance } = await stagehand.page.extract({
3534
modelName: "google/gemini-2.5-flash",

evals/tasks/agent/iframe_form.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ export const iframe_form: EvalFunction = async ({
2626
const evaluator = new Evaluator(stagehand);
2727
const result = await evaluator.evaluate({
2828
question: "Is the form name input filled with 'John Smith'?",
29-
strictResponse: true,
3029
});
3130

3231
if (result.evaluation !== "YES" && result.evaluation !== "NO") {
@@ -48,7 +47,6 @@ export const iframe_form: EvalFunction = async ({
4847
await stagehand.page.mouse.wheel(0, -1000);
4948
const result2 = await evaluator.evaluate({
5049
question: "Is the form email input filled with '[email protected]'?",
51-
strictResponse: true,
5250
});
5351

5452
if (result2.evaluation !== "YES" && result2.evaluation !== "NO") {

evals/tasks/agent/iframe_form_multiple.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ export const iframe_form_multiple: EvalFunction = async ({
3131
"Is the form email input filled with '[email protected]'?",
3232
"Is the 'Are you the domain owner?' option selected as 'No'?",
3333
],
34-
strictResponse: true,
3534
});
3635

3736
for (const r of results) {

evals/tasks/agent/sf_library_card.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ export const sf_library_card: EvalFunction = async ({
2929
const result = await evaluator.evaluate({
3030
question:
3131
"Does the page show the 'Residential Address' field filled with '166 Geary St'?",
32-
strictResponse: true,
3332
});
3433

3534
if (result.evaluation !== "YES" && result.evaluation !== "NO") {

0 commit comments

Comments
 (0)