@@ -23,7 +23,13 @@ import { generateExperimentName } from "./utils";
 import { exactMatch, errorMatch } from "./scoring";
 import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
 import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
-import { SummaryResult, Testcase, EvalInput } from "@/types/evals";
+import {
+  SummaryResult,
+  Testcase,
+  EvalInput,
+  ErrorType,
+  EvalOutput,
+} from "@/types/evals";
 import { EvalLogger } from "./logger";
 import { AvailableModel, LLMClient } from "@browserbasehq/stagehand";
 import { env } from "./env";
@@ -46,6 +52,14 @@ import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web";
 
 dotenv.config();
 
+process.on("uncaughtException", (err) => {
+  console.error("[eval-runner] Uncaught exception:", err);
+});
+
+process.on("unhandledRejection", (reason) => {
+  console.error("[eval-runner] Unhandled rejection:", reason);
+});
+
 /**
  * Read max concurrency and trial count from environment variables set in args.ts.
  * Fallback to defaults (20 and 5) if they're not provided.
@@ -107,20 +121,6 @@ const generateFilteredTestcases = (): Testcase[] => {
     );
   }
 
-  // Check for dataset filter from environment
-  const datasetFilter = process.env.EVAL_DATASET;
-
-  // If using external benchmarks via dataset filter, override category to use agent models
-  if (
-    datasetFilter &&
-    ["gaia", "webvoyager", "webbench", "osworld"].includes(datasetFilter)
-  ) {
-    effectiveCategory = "external_agent_benchmarks";
-    console.log(
-      `Using dataset filter "${datasetFilter}", switching to external_agent_benchmarks category.`,
-    );
-  }
-
   // Dynamically determine the MODELS based on the effective category
   const currentModels = getModelList(effectiveCategory);
 
@@ -130,18 +130,15 @@ const generateFilteredTestcases = (): Testcase[] => {
   );
 
   // Special handling: fan out GAIA dataset for agent/gaia
-  const isGAIATaskIncluded =
-    taskNamesToRun.includes("agent/gaia") || datasetFilter === "gaia";
+  const isGAIATaskIncluded = taskNamesToRun.includes("agent/gaia");
   // Special handling: fan out WebVoyager dataset for agent/webvoyager
-  const isWebVoyagerTaskIncluded =
-    taskNamesToRun.includes("agent/webvoyager") ||
-    datasetFilter === "webvoyager";
+  const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager");
   // Special handling: fan out WebBench dataset for agent/webbench
-  const isWebBenchTaskIncluded =
-    taskNamesToRun.includes("agent/webbench") || datasetFilter === "webbench";
+  const isWebBenchTaskIncluded = taskNamesToRun.includes("agent/webbench");
+
   // Special handling: fan out OSWorld dataset for agent/osworld
-  const isOSWorldTaskIncluded =
-    taskNamesToRun.includes("agent/osworld") || datasetFilter === "osworld";
+  const isOSWorldTaskIncluded = taskNamesToRun.includes("agent/osworld");
+
   // Special handling: fan out Mind2Web dataset for agent/onlineMind2Web
   const isMind2WebTaskIncluded = taskNamesToRun.includes(
     "agent/onlineMind2Web",
@@ -150,100 +147,57 @@ const generateFilteredTestcases = (): Testcase[] => {
   let allTestcases: Testcase[] = [];
 
   // Only include GAIA if no dataset filter or if gaia is selected
-  if (isGAIATaskIncluded && (!datasetFilter || datasetFilter === "gaia")) {
+  if (isGAIATaskIncluded) {
     taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia");
     allTestcases.push(...buildGAIATestcases(currentModels));
-  } else if (
-    taskNamesToRun.includes("agent/gaia") &&
-    datasetFilter &&
-    datasetFilter !== "gaia"
-  ) {
-    // Remove GAIA from tasks to run if dataset filter excludes it
-    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia");
   }
 
   // Only include WebVoyager if no dataset filter or if webvoyager is selected
-  if (
-    isWebVoyagerTaskIncluded &&
-    (!datasetFilter || datasetFilter === "webvoyager")
-  ) {
+  if (isWebVoyagerTaskIncluded) {
     taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
     allTestcases.push(...buildWebVoyagerTestcases(currentModels));
-  } else if (
-    taskNamesToRun.includes("agent/webvoyager") &&
-    datasetFilter &&
-    datasetFilter !== "webvoyager"
-  ) {
-    // Remove WebVoyager from tasks to run if dataset filter excludes it
-    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
   }
 
   // Only include WebBench if no dataset filter or if webbench is selected
-  if (
-    isWebBenchTaskIncluded &&
-    (!datasetFilter || datasetFilter === "webbench")
-  ) {
+  if (isWebBenchTaskIncluded) {
     taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench");
     allTestcases.push(...buildWebBenchTestcases(currentModels));
-  } else if (
-    taskNamesToRun.includes("agent/webbench") &&
-    datasetFilter &&
-    datasetFilter !== "webbench"
-  ) {
-    // Remove WebBench from tasks to run if dataset filter excludes it
-    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench");
   }
 
   // Only include OSWorld if no dataset filter or if osworld is selected
-  if (
-    isOSWorldTaskIncluded &&
-    (!datasetFilter || datasetFilter === "osworld")
-  ) {
+  if (isOSWorldTaskIncluded) {
     taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld");
     allTestcases.push(...buildOSWorldTestcases(currentModels));
-  } else if (
-    taskNamesToRun.includes("agent/osworld") &&
-    datasetFilter &&
-    datasetFilter !== "osworld"
-  ) {
-    // Remove OSWorld from tasks to run if dataset filter excludes it
-    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld");
   }
 
   // Only include Mind2Web if no dataset filter or if onlineMind2Web is selected
-  if (
-    isMind2WebTaskIncluded &&
-    (!datasetFilter || datasetFilter === "onlineMind2Web")
-  ) {
+  if (isMind2WebTaskIncluded) {
     taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web");
     allTestcases.push(...buildOnlineMind2WebTestcases(currentModels));
-  } else if (
-    isMind2WebTaskIncluded &&
-    datasetFilter &&
-    datasetFilter !== "onlineMind2Web"
-  ) {
-    // Remove Mind2Web from tasks to run if dataset filter excludes it
-    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web");
   }
 
   // Create a list of all remaining testcases using the determined task names and models
   const regularTestcases = currentModels.flatMap((model) =>
-    taskNamesToRun.map((testName) => ({
-      input: { name: testName, modelName: model as AvailableModel },
-      name: testName,
-      tags: [
-        model,
-        testName,
-        ...(tasksConfig.find((t) => t.name === testName)?.categories || []).map(
-          (x) => `category/${x}`,
-        ),
-      ],
-      metadata: {
-        model: model as AvailableModel,
-        test: testName,
-      },
-      expected: true,
-    })),
+    taskNamesToRun.map((testName) => {
+      const taskCategories =
+        tasksConfig.find((t) => t.name === testName)?.categories || [];
+      return {
+        input: { name: testName, modelName: model as AvailableModel },
+        name: testName,
+        tags: [
+          model,
+          // Only include primary category as tag
+          taskCategories.length > 0 ? taskCategories[0] : "uncategorized",
+        ],
+        metadata: {
+          model: model as AvailableModel,
+          test: testName,
+          category: taskCategories[0],
+          categories: taskCategories, // Keep all categories in metadata for filtering
+        },
+        expected: true,
+      };
+    }),
   );
 
   allTestcases = [...allTestcases, ...regularTestcases];
@@ -312,42 +266,27 @@ const generateFilteredTestcases = (): Testcase[] => {
       const logger = new EvalLogger();
       try {
         // Dynamically import the task based on its name
-        const taskModulePath = path.join(
-          __dirname,
-          "tasks",
-          `${input.name}.ts`,
-        );
+        const basePath = path.join(__dirname, "tasks", `${input.name}`);
+        const candidatePaths = [`${basePath}.js`, `${basePath}.ts`];
 
-        // Check if file exists at direct path
         let taskModule;
-        try {
-          // First try to import directly (for backward compatibility)
-          taskModule = await import(taskModulePath);
-        } catch (error) {
-          if (input.name.includes("/")) {
-            // If the name includes a path separator, try to import from subdirectory
-            const subDirPath = path.join(
-              __dirname,
-              "tasks",
-              `${input.name}.ts`,
-            );
-            try {
-              taskModule = await import(subDirPath);
-            } catch (subError) {
-              throw new StagehandEvalError(
-                `Failed to import task module for ${input.name}. Tried paths:\n` +
-                  `- ${taskModulePath}\n` +
-                  `- ${subDirPath}\n` +
-                  `Error: ${subError.message}`,
-              );
-            }
-          } else {
-            throw new StagehandEvalError(
-              `Failed to import task module for ${input.name} at path ${taskModulePath}: ${error.message}`,
-            );
+        let lastError: unknown;
+        for (const candidate of candidatePaths) {
+          try {
+            taskModule = await import(candidate);
+            break;
+          } catch (err) {
+            lastError = err;
           }
         }
 
+        if (!taskModule) {
+          const tried = candidatePaths.join("\n- ");
+          throw new StagehandEvalError(
+            `Failed to import task module for ${input.name}. Tried paths:\n- ${tried}\nError: ${(lastError as Error)?.message}`,
+          );
+        }
+
         // Extract the task function
         const taskName = input.name.includes("/")
           ? input.name.split("/").pop() // Get the last part of the path for nested tasks
@@ -362,9 +301,6 @@ const generateFilteredTestcases = (): Testcase[] => {
         }
 
         // Execute the task
-        console.log(
-          `🏃 Running eval: ${input.name} with model: ${input.modelName}`,
-        );
         let taskInput: Awaited<ReturnType<typeof initStagehand>>;
 
         if (USE_API) {
@@ -426,6 +362,7 @@ const generateFilteredTestcases = (): Testcase[] => {
         }
         // Pass full EvalInput to the task (data-driven params available via input.params)
         let result;
+        let isStagehandClosed = false;
         try {
           result = await taskFunction({ ...taskInput, input });
           // Log result to console
@@ -435,31 +372,80 @@ const generateFilteredTestcases = (): Testcase[] => {
             console.log(`❌ ${input.name}: Failed`);
           }
         } finally {
-          await taskInput.stagehand.close();
+          // Only close if not already closed
+          if (taskInput.stagehand && !isStagehandClosed) {
+            try {
+              await taskInput.stagehand.close();
+              isStagehandClosed = true;
+            } catch (closeError) {
+              console.warn("Error closing stagehand:", closeError);
+            }
+          }
         }
         return result;
       } catch (error) {
+        // Categorize the error
+        let errorType = ErrorType.UNKNOWN;
+        const errorMessage =
+          error instanceof Error ? error.message : String(error);
+
+        if (error instanceof Error) {
+          if (
+            error.message.includes("timeout") ||
+            error.message.includes("Timeout")
+          ) {
+            errorType = ErrorType.TIMEOUT;
+          } else if (
+            error.message.includes("network") ||
+            error.message.includes("fetch")
+          ) {
+            errorType = ErrorType.NETWORK;
+          } else if (
+            error.message.includes("parse") ||
+            error.message.includes("JSON")
+          ) {
+            errorType = ErrorType.PARSING_ERROR;
+          } else if (
+            error.message.includes("init") ||
+            error.message.includes("setup")
+          ) {
+            errorType = ErrorType.SETUP_ERROR;
+          }
+        }
+
         // Log any errors that occur during task execution
-        console.error(`❌ ${input.name}: Error - ${error}`);
+        console.error(`❌ ${input.name}: ${errorType} - ${errorMessage}`);
         logger.error({
           message: `Error in task ${input.name}`,
           level: 0,
           auxiliary: {
             error: {
-              value: error.message,
+              value: errorMessage,
+              type: "string",
+            },
+            error_type: {
+              value: errorType,
               type: "string",
             },
             trace: {
-              value: error.stack,
+              value: error instanceof Error ? error.stack : "",
               type: "string",
             },
           },
         });
-        return {
+
+        const output: EvalOutput = {
           _success: false,
           error: JSON.parse(JSON.stringify(error, null, 2)),
+          error_type: errorType,
+          error_message: errorMessage,
+          error_stack: error instanceof Error ? error.stack : undefined,
           logs: logger.getLogs(),
+          debugUrl: "",
+          sessionUrl: "",
         };
+
+        return output;
       }
     },
     // Use the scoring functions defined above
@@ -475,6 +461,10 @@ const generateFilteredTestcases = (): Testcase[] => {
           ? { _success: result.output }
           : result.output;
 
+      // The full output object (including error_type, error_message, etc.)
+      // is already captured in result.output and will be visible in Braintrust
+      // We don't need to duplicate it in metadata
+
       return {
         input: result.input,
         output,