@@ -10,7 +10,6 @@ import {
10
10
ClientOptions ,
11
11
Stagehand ,
12
12
} from "@browserbasehq/stagehand" ;
13
- import { LLMResponseError } from "@/types/stagehandErrors" ;
14
13
import dotenv from "dotenv" ;
15
14
import {
16
15
EvaluateOptions ,
@@ -20,17 +19,22 @@ import {
20
19
import { LLMParsedResponse } from "@/lib/inference" ;
21
20
import { LLMResponse } from "@/lib/llm/LLMClient" ;
22
21
import { LogLine } from "@/types/log" ;
22
+ import { z } from "zod" ;
23
23
24
24
dotenv . config ( ) ;
25
25
26
+ const EvaluationSchema = z . object ( {
27
+ evaluation : z . enum ( [ "YES" , "NO" ] ) ,
28
+ reasoning : z . string ( ) ,
29
+ } ) ;
30
+
31
+ const BatchEvaluationSchema = z . array ( EvaluationSchema ) ;
32
+
26
33
export class Evaluator {
27
34
private stagehand : Stagehand ;
28
35
private modelName : AvailableModel ;
29
36
private modelClientOptions : ClientOptions | { apiKey : string } ;
30
37
private silentLogger : ( message : LogLine ) => void ;
31
- // Define regex patterns directly in the class or as constants if preferred elsewhere
32
- private yesPattern = / ^ ( Y E S | Y | T R U E | C O R R E C T | A F F I R M A T I V E ) / i;
33
- private noPattern = / ^ ( N O | N | F A L S E | I N C O R R E C T | N E G A T I V E ) / i;
34
38
35
39
constructor (
36
40
stagehand : Stagehand ,
@@ -48,12 +52,11 @@ export class Evaluator {
48
52
49
53
/**
50
54
* Evaluates the current state of the page against a specific question.
51
- * Expects a JSON object response: { "evaluation": "YES" | "NO", "reasoning": "..." }
55
+ * Uses structured response generation to ensure proper format.
52
56
* Returns the evaluation result with normalized response and success status.
53
57
*
54
58
* @param options - The options for evaluation
55
59
* @returns A promise that resolves to an EvaluationResult
56
- * @throws Error if strictResponse is true and response is not clearly YES or NO, or if JSON parsing/validation fails.
57
60
*/
58
61
async evaluate ( options : EvaluateOptions ) : Promise < EvaluationResult > {
59
62
const {
@@ -63,7 +66,6 @@ export class Evaluator {
63
66
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
64
67
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.` ,
65
68
screenshotDelayMs = 1000 ,
66
- strictResponse = false ,
67
69
} = options ;
68
70
69
71
await new Promise ( ( resolve ) => setTimeout ( resolve , screenshotDelayMs ) ) ;
@@ -80,86 +82,53 @@ export class Evaluator {
80
82
options : {
81
83
messages : [
82
84
{ role : "system" , content : systemPrompt } ,
83
- { role : "user" , content : question } ,
85
+ {
86
+ role : "user" ,
87
+ content : [
88
+ { type : "text" , text : question } ,
89
+ {
90
+ type : "image_url" ,
91
+ image_url : {
92
+ url : `data:image/jpeg;base64,${ imageBuffer . toString ( "base64" ) } ` ,
93
+ } ,
94
+ } ,
95
+ ] ,
96
+ } ,
84
97
] ,
85
- image : { buffer : imageBuffer } ,
98
+ response_model : {
99
+ name : "EvaluationResult" ,
100
+ schema : EvaluationSchema ,
101
+ } ,
86
102
} ,
87
103
} ) ;
88
- const rawResponse = response . data as unknown as string ;
89
- let evaluationResult : "YES" | "NO" | "INVALID" = "INVALID" ;
90
- let reasoning = `Failed to process response. Raw response: ${ rawResponse } ` ;
91
104
92
105
try {
93
- // Clean potential markdown fences
94
- const cleanedResponse = rawResponse
95
- . replace ( / ^ ` ` ` j s o n \s * / , "" )
96
- . replace ( / \s * ` ` ` $ / , "" )
97
- . trim ( ) ;
98
-
99
- // Attempt to parse the JSON object
100
- const parsedResult : { evaluation : unknown ; reasoning : unknown } =
101
- JSON . parse ( cleanedResponse ) ;
102
-
103
- // Validate structure
104
- if (
105
- typeof parsedResult !== "object" ||
106
- parsedResult === null ||
107
- typeof parsedResult . evaluation !== "string" ||
108
- typeof parsedResult . reasoning !== "string"
109
- ) {
110
- throw new LLMResponseError (
111
- "Evaluator" ,
112
- `Invalid JSON structure received: ${ JSON . stringify ( parsedResult ) } ` ,
113
- ) ;
114
- }
115
-
116
- const evaluationString = parsedResult . evaluation . trim ( ) . toUpperCase ( ) ;
117
- reasoning = parsedResult . reasoning . trim ( ) ; // Update reasoning from parsed object
118
-
119
- // Use regex patterns to validate the evaluation string
120
- const isYes = this . yesPattern . test ( evaluationString ) ;
121
- const isNo = this . noPattern . test ( evaluationString ) ;
122
-
123
- if ( isYes ) {
124
- evaluationResult = "YES" ;
125
- } else if ( isNo ) {
126
- evaluationResult = "NO" ;
127
- } else {
128
- // Parsed JSON but evaluation value wasn't YES/NO variant
129
- if ( strictResponse ) {
130
- throw new LLMResponseError (
131
- "Evaluator" ,
132
- `Invalid evaluation value in JSON: ${ parsedResult . evaluation } ` ,
133
- ) ;
134
- }
135
- // Keep INVALID, reasoning already updated
136
- reasoning = `Invalid evaluation value: ${ parsedResult . evaluation } . Reasoning: ${ reasoning } ` ;
137
- }
106
+ const result = response . data as unknown as z . infer <
107
+ typeof EvaluationSchema
108
+ > ;
109
+
110
+ return {
111
+ evaluation : result . evaluation ,
112
+ reasoning : result . reasoning ,
113
+ } ;
138
114
} catch ( error ) {
139
115
const errorMessage =
140
116
error instanceof Error ? error . message : String ( error ) ;
141
- // Update reasoning with error details
142
- reasoning = `Processing error: ${ errorMessage } . Raw response: ${ rawResponse } ` ;
143
- if ( strictResponse ) {
144
- // Re-throw error if in strict mode
145
- throw new LLMResponseError ( "Evaluator" , reasoning ) ;
146
- }
147
- // Keep evaluationResult as "INVALID"
148
- }
149
117
150
- return {
151
- evaluation : evaluationResult ,
152
- reasoning : reasoning ,
153
- } ;
118
+ return {
119
+ evaluation : "INVALID" as const ,
120
+ reasoning : `Failed to get structured response: ${ errorMessage } ` ,
121
+ } ;
122
+ }
154
123
}
155
124
156
125
/**
157
126
* Evaluates the current state of the page against multiple questions in a single screenshot.
127
+ * Uses structured response generation to ensure proper format.
158
128
* Returns an array of evaluation results.
159
129
*
160
130
* @param options - The options for batch evaluation
161
131
* @returns A promise that resolves to an array of EvaluationResults
162
- * @throws Error if strictResponse is true and any response is not clearly YES or NO
163
132
*/
164
133
async batchEvaluate (
165
134
options : BatchEvaluateOptions ,
@@ -171,7 +140,6 @@ export class Evaluator {
171
140
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
172
141
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.` ,
173
142
screenshotDelayMs = 1000 ,
174
- strictResponse = false ,
175
143
} = options ;
176
144
177
145
// Wait for the specified delay before taking screenshot
@@ -204,125 +172,55 @@ export class Evaluator {
204
172
} ,
205
173
{
206
174
role : "user" ,
207
- content : formattedQuestions ,
175
+ content : [
176
+ { type : "text" , text : formattedQuestions } ,
177
+ {
178
+ type : "image_url" ,
179
+ image_url : {
180
+ url : `data:image/jpeg;base64,${ imageBuffer . toString ( "base64" ) } ` ,
181
+ } ,
182
+ } ,
183
+ ] ,
208
184
} ,
209
185
] ,
210
- image : {
211
- buffer : imageBuffer ,
186
+ response_model : {
187
+ name : "BatchEvaluationResult" ,
188
+ schema : BatchEvaluationSchema ,
212
189
} ,
213
190
} ,
214
191
} ) ;
215
192
216
- const rawResponse = response . data as unknown as string ;
217
- let finalResults : EvaluationResult [ ] = [ ] ;
218
-
219
193
try {
220
- // Clean potential markdown fences
221
- const cleanedResponse = rawResponse
222
- . replace ( / ^ ` ` ` j s o n \s * / , "" )
223
- . replace ( / \s * ` ` ` $ / , "" )
224
- . trim ( ) ;
225
-
226
- // Attempt to parse the JSON array
227
- const parsedResults : { evaluation : unknown ; reasoning : unknown } [ ] =
228
- JSON . parse ( cleanedResponse ) ;
229
-
230
- if ( ! Array . isArray ( parsedResults ) ) {
231
- throw new LLMResponseError (
232
- "Evaluator" ,
233
- "Response is not a JSON array." ,
234
- ) ;
235
- }
236
-
237
- if ( parsedResults . length !== questions . length && strictResponse ) {
238
- throw new LLMResponseError (
239
- "Evaluator" ,
240
- `Expected ${ questions . length } results, but got ${ parsedResults . length } ` ,
241
- ) ;
242
- }
194
+ const results = response . data as unknown as z . infer <
195
+ typeof BatchEvaluationSchema
196
+ > ;
243
197
198
+ // Pad with INVALID results if we got fewer than expected
199
+ const finalResults : EvaluationResult [ ] = [ ] ;
244
200
for ( let i = 0 ; i < questions . length ; i ++ ) {
245
- if ( i < parsedResults . length ) {
246
- const item = parsedResults [ i ] ;
247
- // Ensure item is an object and has the required properties
248
- if (
249
- typeof item !== "object" ||
250
- item === null ||
251
- typeof item . evaluation !== "string" ||
252
- typeof item . reasoning !== "string"
253
- ) {
254
- if ( strictResponse ) {
255
- throw new LLMResponseError (
256
- "Evaluator" ,
257
- `Invalid object structure for question ${ i + 1 } : ${ JSON . stringify ( item ) } ` ,
258
- ) ;
259
- }
260
- finalResults . push ( {
261
- evaluation : "INVALID" ,
262
- reasoning : `Invalid object structure received: ${ JSON . stringify (
263
- item ,
264
- ) } `,
265
- } ) ;
266
- continue ; // Move to the next question
267
- }
268
-
269
- // Use regex patterns for validation
270
- const evaluationString = item . evaluation . trim ( ) . toUpperCase ( ) ;
271
- const reasoning = item . reasoning . trim ( ) ;
272
- const isYes = this . yesPattern . test ( evaluationString ) ;
273
- const isNo = this . noPattern . test ( evaluationString ) ;
274
-
275
- if ( isYes ) {
276
- finalResults . push ( { evaluation : "YES" , reasoning : reasoning } ) ;
277
- } else if ( isNo ) {
278
- finalResults . push ( { evaluation : "NO" , reasoning : reasoning } ) ;
279
- } else {
280
- // Invalid evaluation value
281
- if ( strictResponse ) {
282
- throw new LLMResponseError (
283
- "Evaluator" ,
284
- `Invalid evaluation value for question ${ i + 1 } : ${ item . evaluation } ` ,
285
- ) ;
286
- }
287
- finalResults . push ( {
288
- evaluation : "INVALID" ,
289
- reasoning : `Invalid evaluation value: ${ item . evaluation } . Reasoning: ${ reasoning } ` ,
290
- } ) ;
291
- }
201
+ if ( i < results . length ) {
202
+ finalResults . push ( {
203
+ evaluation : results [ i ] . evaluation ,
204
+ reasoning : results [ i ] . reasoning ,
205
+ } ) ;
292
206
} else {
293
- // Missing result for this question
294
- if ( strictResponse ) {
295
- throw new LLMResponseError (
296
- "Evaluator" ,
297
- `No response found for question ${ i + 1 } ` ,
298
- ) ;
299
- }
300
207
finalResults . push ( {
301
208
evaluation : "INVALID" ,
302
209
reasoning : "No response found for this question." ,
303
210
} ) ;
304
211
}
305
212
}
213
+
214
+ return finalResults ;
306
215
} catch ( error ) {
307
216
const errorMessage =
308
217
error instanceof Error ? error . message : String ( error ) ;
309
- // If JSON parsing fails or structure is wrong, handle based on strictResponse
310
- if ( strictResponse ) {
311
- throw new LLMResponseError (
312
- "Evaluator" ,
313
- `Failed to parse LLM response or invalid format: ${ rawResponse } . Error: ${ errorMessage } ` ,
314
- ) ;
315
- }
218
+
316
219
// Fallback: return INVALID for all questions
317
- finalResults = [ ] ; // Clear any potentially partially filled results
318
- for ( let i = 0 ; i < questions . length ; i ++ ) {
319
- finalResults . push ( {
320
- evaluation : "INVALID" ,
321
- reasoning : `Failed to parse response. Raw response: ${ rawResponse } . Error: ${ errorMessage } ` ,
322
- } ) ;
323
- }
220
+ return questions . map ( ( ) => ( {
221
+ evaluation : "INVALID" as const ,
222
+ reasoning : `Failed to get structured response: ${ errorMessage } ` ,
223
+ } ) ) ;
324
224
}
325
-
326
- return finalResults ;
327
225
}
328
226
}
0 commit comments