Skip to content

Commit f89b13e

Browse files
authored
Eval metadata (#1092)
# why
To help make sense of eval test cases and results.

# what changed
Added metadata to eval runs and cleaned up deprecated code.

# test plan
1 parent dc2d420 commit f89b13e

File tree

4 files changed

+166
-170
lines changed

4 files changed

+166
-170
lines changed

evals/index.eval.ts

Lines changed: 121 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,13 @@ import { generateExperimentName } from "./utils";
2323
import { exactMatch, errorMatch } from "./scoring";
2424
import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
2525
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
26-
import { SummaryResult, Testcase, EvalInput } from "@/types/evals";
26+
import {
27+
SummaryResult,
28+
Testcase,
29+
EvalInput,
30+
ErrorType,
31+
EvalOutput,
32+
} from "@/types/evals";
2733
import { EvalLogger } from "./logger";
2834
import { AvailableModel, LLMClient } from "@browserbasehq/stagehand";
2935
import { env } from "./env";
@@ -46,6 +52,14 @@ import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web";
4652

4753
dotenv.config();
4854

55+
process.on("uncaughtException", (err) => {
56+
console.error("[eval-runner] Uncaught exception:", err);
57+
});
58+
59+
process.on("unhandledRejection", (reason) => {
60+
console.error("[eval-runner] Unhandled rejection:", reason);
61+
});
62+
4963
/**
5064
* Read max concurrency and trial count from environment variables set in args.ts.
5165
* Fallback to defaults (20 and 5) if they're not provided.
@@ -107,20 +121,6 @@ const generateFilteredTestcases = (): Testcase[] => {
107121
);
108122
}
109123

110-
// Check for dataset filter from environment
111-
const datasetFilter = process.env.EVAL_DATASET;
112-
113-
// If using external benchmarks via dataset filter, override category to use agent models
114-
if (
115-
datasetFilter &&
116-
["gaia", "webvoyager", "webbench", "osworld"].includes(datasetFilter)
117-
) {
118-
effectiveCategory = "external_agent_benchmarks";
119-
console.log(
120-
`Using dataset filter "${datasetFilter}", switching to external_agent_benchmarks category.`,
121-
);
122-
}
123-
124124
// Dynamically determine the MODELS based on the effective category
125125
const currentModels = getModelList(effectiveCategory);
126126

@@ -130,18 +130,15 @@ const generateFilteredTestcases = (): Testcase[] => {
130130
);
131131

132132
// Special handling: fan out GAIA dataset for agent/gaia
133-
const isGAIATaskIncluded =
134-
taskNamesToRun.includes("agent/gaia") || datasetFilter === "gaia";
133+
const isGAIATaskIncluded = taskNamesToRun.includes("agent/gaia");
135134
// Special handling: fan out WebVoyager dataset for agent/webvoyager
136-
const isWebVoyagerTaskIncluded =
137-
taskNamesToRun.includes("agent/webvoyager") ||
138-
datasetFilter === "webvoyager";
135+
const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager");
139136
// Special handling: fan out WebBench dataset for agent/webbench
140-
const isWebBenchTaskIncluded =
141-
taskNamesToRun.includes("agent/webbench") || datasetFilter === "webbench";
137+
const isWebBenchTaskIncluded = taskNamesToRun.includes("agent/webbench");
138+
142139
// Special handling: fan out OSWorld dataset for agent/osworld
143-
const isOSWorldTaskIncluded =
144-
taskNamesToRun.includes("agent/osworld") || datasetFilter === "osworld";
140+
const isOSWorldTaskIncluded = taskNamesToRun.includes("agent/osworld");
141+
145142
// Special handling: fan out Mind2Web dataset for agent/onlineMind2Web
146143
const isMind2WebTaskIncluded = taskNamesToRun.includes(
147144
"agent/onlineMind2Web",
@@ -150,100 +147,57 @@ const generateFilteredTestcases = (): Testcase[] => {
150147
let allTestcases: Testcase[] = [];
151148

152149
// Only include GAIA if no dataset filter or if gaia is selected
153-
if (isGAIATaskIncluded && (!datasetFilter || datasetFilter === "gaia")) {
150+
if (isGAIATaskIncluded) {
154151
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia");
155152
allTestcases.push(...buildGAIATestcases(currentModels));
156-
} else if (
157-
taskNamesToRun.includes("agent/gaia") &&
158-
datasetFilter &&
159-
datasetFilter !== "gaia"
160-
) {
161-
// Remove GAIA from tasks to run if dataset filter excludes it
162-
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/gaia");
163153
}
164154

165155
// Only include WebVoyager if no dataset filter or if webvoyager is selected
166-
if (
167-
isWebVoyagerTaskIncluded &&
168-
(!datasetFilter || datasetFilter === "webvoyager")
169-
) {
156+
if (isWebVoyagerTaskIncluded) {
170157
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
171158
allTestcases.push(...buildWebVoyagerTestcases(currentModels));
172-
} else if (
173-
taskNamesToRun.includes("agent/webvoyager") &&
174-
datasetFilter &&
175-
datasetFilter !== "webvoyager"
176-
) {
177-
// Remove WebVoyager from tasks to run if dataset filter excludes it
178-
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
179159
}
180160

181161
// Only include WebBench if no dataset filter or if webbench is selected
182-
if (
183-
isWebBenchTaskIncluded &&
184-
(!datasetFilter || datasetFilter === "webbench")
185-
) {
162+
if (isWebBenchTaskIncluded) {
186163
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench");
187164
allTestcases.push(...buildWebBenchTestcases(currentModels));
188-
} else if (
189-
taskNamesToRun.includes("agent/webbench") &&
190-
datasetFilter &&
191-
datasetFilter !== "webbench"
192-
) {
193-
// Remove WebBench from tasks to run if dataset filter excludes it
194-
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webbench");
195165
}
196166

197167
// Only include OSWorld if no dataset filter or if osworld is selected
198-
if (
199-
isOSWorldTaskIncluded &&
200-
(!datasetFilter || datasetFilter === "osworld")
201-
) {
168+
if (isOSWorldTaskIncluded) {
202169
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld");
203170
allTestcases.push(...buildOSWorldTestcases(currentModels));
204-
} else if (
205-
taskNamesToRun.includes("agent/osworld") &&
206-
datasetFilter &&
207-
datasetFilter !== "osworld"
208-
) {
209-
// Remove OSWorld from tasks to run if dataset filter excludes it
210-
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/osworld");
211171
}
212172

213173
// Only include Mind2Web if no dataset filter or if onlineMind2Web is selected
214-
if (
215-
isMind2WebTaskIncluded &&
216-
(!datasetFilter || datasetFilter === "onlineMind2Web")
217-
) {
174+
if (isMind2WebTaskIncluded) {
218175
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web");
219176
allTestcases.push(...buildOnlineMind2WebTestcases(currentModels));
220-
} else if (
221-
isMind2WebTaskIncluded &&
222-
datasetFilter &&
223-
datasetFilter !== "onlineMind2Web"
224-
) {
225-
// Remove Mind2Web from tasks to run if dataset filter excludes it
226-
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/onlineMind2Web");
227177
}
228178

229179
// Create a list of all remaining testcases using the determined task names and models
230180
const regularTestcases = currentModels.flatMap((model) =>
231-
taskNamesToRun.map((testName) => ({
232-
input: { name: testName, modelName: model as AvailableModel },
233-
name: testName,
234-
tags: [
235-
model,
236-
testName,
237-
...(tasksConfig.find((t) => t.name === testName)?.categories || []).map(
238-
(x) => `category/${x}`,
239-
),
240-
],
241-
metadata: {
242-
model: model as AvailableModel,
243-
test: testName,
244-
},
245-
expected: true,
246-
})),
181+
taskNamesToRun.map((testName) => {
182+
const taskCategories =
183+
tasksConfig.find((t) => t.name === testName)?.categories || [];
184+
return {
185+
input: { name: testName, modelName: model as AvailableModel },
186+
name: testName,
187+
tags: [
188+
model,
189+
// Only include primary category as tag
190+
taskCategories.length > 0 ? taskCategories[0] : "uncategorized",
191+
],
192+
metadata: {
193+
model: model as AvailableModel,
194+
test: testName,
195+
category: taskCategories[0],
196+
categories: taskCategories, // Keep all categories in metadata for filtering
197+
},
198+
expected: true,
199+
};
200+
}),
247201
);
248202

249203
allTestcases = [...allTestcases, ...regularTestcases];
@@ -312,42 +266,27 @@ const generateFilteredTestcases = (): Testcase[] => {
312266
const logger = new EvalLogger();
313267
try {
314268
// Dynamically import the task based on its name
315-
const taskModulePath = path.join(
316-
__dirname,
317-
"tasks",
318-
`${input.name}.ts`,
319-
);
269+
const basePath = path.join(__dirname, "tasks", `${input.name}`);
270+
const candidatePaths = [`${basePath}.js`, `${basePath}.ts`];
320271

321-
// Check if file exists at direct path
322272
let taskModule;
323-
try {
324-
// First try to import directly (for backward compatibility)
325-
taskModule = await import(taskModulePath);
326-
} catch (error) {
327-
if (input.name.includes("/")) {
328-
// If the name includes a path separator, try to import from subdirectory
329-
const subDirPath = path.join(
330-
__dirname,
331-
"tasks",
332-
`${input.name}.ts`,
333-
);
334-
try {
335-
taskModule = await import(subDirPath);
336-
} catch (subError) {
337-
throw new StagehandEvalError(
338-
`Failed to import task module for ${input.name}. Tried paths:\n` +
339-
`- ${taskModulePath}\n` +
340-
`- ${subDirPath}\n` +
341-
`Error: ${subError.message}`,
342-
);
343-
}
344-
} else {
345-
throw new StagehandEvalError(
346-
`Failed to import task module for ${input.name} at path ${taskModulePath}: ${error.message}`,
347-
);
273+
let lastError: unknown;
274+
for (const candidate of candidatePaths) {
275+
try {
276+
taskModule = await import(candidate);
277+
break;
278+
} catch (err) {
279+
lastError = err;
348280
}
349281
}
350282

283+
if (!taskModule) {
284+
const tried = candidatePaths.join("\n- ");
285+
throw new StagehandEvalError(
286+
`Failed to import task module for ${input.name}. Tried paths:\n- ${tried}\nError: ${(lastError as Error)?.message}`,
287+
);
288+
}
289+
351290
// Extract the task function
352291
const taskName = input.name.includes("/")
353292
? input.name.split("/").pop() // Get the last part of the path for nested tasks
@@ -362,9 +301,6 @@ const generateFilteredTestcases = (): Testcase[] => {
362301
}
363302

364303
// Execute the task
365-
console.log(
366-
`🏃 Running eval: ${input.name} with model: ${input.modelName}`,
367-
);
368304
let taskInput: Awaited<ReturnType<typeof initStagehand>>;
369305

370306
if (USE_API) {
@@ -426,6 +362,7 @@ const generateFilteredTestcases = (): Testcase[] => {
426362
}
427363
// Pass full EvalInput to the task (data-driven params available via input.params)
428364
let result;
365+
let isStagehandClosed = false;
429366
try {
430367
result = await taskFunction({ ...taskInput, input });
431368
// Log result to console
@@ -435,31 +372,80 @@ const generateFilteredTestcases = (): Testcase[] => {
435372
console.log(`❌ ${input.name}: Failed`);
436373
}
437374
} finally {
438-
await taskInput.stagehand.close();
375+
// Only close if not already closed
376+
if (taskInput.stagehand && !isStagehandClosed) {
377+
try {
378+
await taskInput.stagehand.close();
379+
isStagehandClosed = true;
380+
} catch (closeError) {
381+
console.warn("Error closing stagehand:", closeError);
382+
}
383+
}
439384
}
440385
return result;
441386
} catch (error) {
387+
// Categorize the error
388+
let errorType = ErrorType.UNKNOWN;
389+
const errorMessage =
390+
error instanceof Error ? error.message : String(error);
391+
392+
if (error instanceof Error) {
393+
if (
394+
error.message.includes("timeout") ||
395+
error.message.includes("Timeout")
396+
) {
397+
errorType = ErrorType.TIMEOUT;
398+
} else if (
399+
error.message.includes("network") ||
400+
error.message.includes("fetch")
401+
) {
402+
errorType = ErrorType.NETWORK;
403+
} else if (
404+
error.message.includes("parse") ||
405+
error.message.includes("JSON")
406+
) {
407+
errorType = ErrorType.PARSING_ERROR;
408+
} else if (
409+
error.message.includes("init") ||
410+
error.message.includes("setup")
411+
) {
412+
errorType = ErrorType.SETUP_ERROR;
413+
}
414+
}
415+
442416
// Log any errors that occur during task execution
443-
console.error(`❌ ${input.name}: Error - ${error}`);
417+
console.error(`❌ ${input.name}: ${errorType} - ${errorMessage}`);
444418
logger.error({
445419
message: `Error in task ${input.name}`,
446420
level: 0,
447421
auxiliary: {
448422
error: {
449-
value: error.message,
423+
value: errorMessage,
424+
type: "string",
425+
},
426+
error_type: {
427+
value: errorType,
450428
type: "string",
451429
},
452430
trace: {
453-
value: error.stack,
431+
value: error instanceof Error ? error.stack : "",
454432
type: "string",
455433
},
456434
},
457435
});
458-
return {
436+
437+
const output: EvalOutput = {
459438
_success: false,
460439
error: JSON.parse(JSON.stringify(error, null, 2)),
440+
error_type: errorType,
441+
error_message: errorMessage,
442+
error_stack: error instanceof Error ? error.stack : undefined,
461443
logs: logger.getLogs(),
444+
debugUrl: "",
445+
sessionUrl: "",
462446
};
447+
448+
return output;
463449
}
464450
},
465451
// Use the scoring functions defined above
@@ -475,6 +461,10 @@ const generateFilteredTestcases = (): Testcase[] => {
475461
? { _success: result.output }
476462
: result.output;
477463

464+
// The full output object (including error_type, error_message, etc.)
465+
// is already captured in result.output and will be visible in Braintrust
466+
// We don't need to duplicate it in metadata
467+
478468
return {
479469
input: result.input,
480470
output,

0 commit comments

Comments
 (0)