Skip to content

work on adding voyager to evals #959

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .changeset/pretty-jokes-own.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
"@browserbasehq/stagehand": patch
---

Properly handle images in evaluator + clean up response parsing logic
Properly handle images in evaluator + clean up response parsing logic
69 changes: 69 additions & 0 deletions evals/core/summary.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import fs from "fs";
import { tasksByName } from "../taskConfig";
import type { SummaryResult } from "@/types/evals";

/**
 * Aggregates finished evaluation results into a summary and writes it to
 * `eval-summary.json`. The summary contains:
 * - passed / failed testcases (eval name, model, categories),
 * - per-category success percentages,
 * - per-model success percentages.
 *
 * @param results - One entry per executed testcase, with `_success` set.
 * @param experimentName - Label recorded in the summary for traceability.
 */
export const generateSummary = async (
  results: SummaryResult[],
  experimentName: string,
) => {
  // Annotate a result with its eval name, model, and configured categories.
  // Optional-chain the lookup: data-driven tasks could reference a name that
  // is missing from taskConfig, and the summary should not crash on it.
  const describe = (r: SummaryResult) => ({
    eval: r.input.name,
    model: r.input.modelName,
    categories: tasksByName[r.input.name]?.categories ?? [],
  });

  const passed = results.filter((r) => r.output._success).map(describe);
  const failed = results.filter((r) => !r.output._success).map(describe);

  // Accumulate per-category totals: every result of a task counts toward each
  // of that task's categories.
  const categorySuccessCounts: Record<
    string,
    { total: number; success: number }
  > = {};
  for (const taskName of Object.keys(tasksByName)) {
    const taskCategories = tasksByName[taskName].categories;
    const taskResults = results.filter((r) => r.input.name === taskName);
    const successCount = taskResults.filter((r) => r.output._success).length;

    for (const cat of taskCategories) {
      if (!categorySuccessCounts[cat]) {
        categorySuccessCounts[cat] = { total: 0, success: 0 };
      }
      categorySuccessCounts[cat].total += taskResults.length;
      categorySuccessCounts[cat].success += successCount;
    }
  }

  // Percentage success per category. Guard the division: a category whose
  // tasks produced no results has total 0, and 0/0 would yield NaN.
  const categories: Record<string, number> = {};
  for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
    categories[cat] =
      counts.total > 0
        ? Math.round((counts.success / counts.total) * 100)
        : 0;
  }

  // Percentage success per model. Every model listed here comes from at
  // least one result, so modelResults.length is always >= 1.
  const models: Record<string, number> = {};
  const allModels = [...new Set(results.map((r) => r.input.modelName))];
  for (const model of allModels) {
    const modelResults = results.filter((r) => r.input.modelName === model);
    const successCount = modelResults.filter((r) => r.output._success).length;
    models[model] = Math.round((successCount / modelResults.length) * 100);
  }

  const formattedSummary = {
    experimentName,
    passed,
    failed,
    categories,
    models,
  };

  fs.writeFileSync(
    "eval-summary.json",
    JSON.stringify(formattedSummary, null, 2),
  );
  console.log("Evaluation summary written to eval-summary.json");
};
90 changes: 90 additions & 0 deletions evals/datasets/gaia/GAIA_web.jsonl

Large diffs are not rendered by default.

643 changes: 643 additions & 0 deletions evals/datasets/webvoyager/WebVoyager_data.jsonl

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -419,5 +419,15 @@
"name": "agent/sign_in",
"categories": ["agent"]
}
,
{
"name": "agent/webarena_gaia",
"categories": ["agent"]
}
,
{
"name": "agent/webvoyager",
"categories": ["agent"]
}
]
}
118 changes: 29 additions & 89 deletions evals/index.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
* - Runs each selected task against each selected model in parallel, collecting results.
* - Saves a summary of the evaluation results to `eval-summary.json`.
*/
import fs from "fs";
import path from "path";
import process from "process";
import {
Expand All @@ -24,7 +23,7 @@ import { generateExperimentName } from "./utils";
import { exactMatch, errorMatch } from "./scoring";
import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
import { SummaryResult, Testcase } from "@/types/evals";
import { SummaryResult, Testcase, EvalInput } from "@/types/evals";
import { EvalLogger } from "./logger";
import { AvailableModel, LLMClient } from "@browserbasehq/stagehand";
import { env } from "./env";
Expand All @@ -37,6 +36,9 @@ import { AISdkClient } from "@/examples/external_clients/aisdk";
import { getAISDKLanguageModel } from "@/lib/llm/LLMProvider";
import { loadApiKeyFromEnv } from "@/lib/utils";
import { LogLine } from "@/types/log";
import { generateSummary } from "./core/summary";
import { buildGAIATestcases } from "./suites/gaia";
import { buildWebVoyagerTestcases } from "./suites/webvoyager";

dotenv.config();

Expand All @@ -54,88 +56,6 @@ const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT

const USE_API: boolean = (process.env.USE_API ?? "").toLowerCase() === "true";

/**
* generateSummary:
* After all evaluations have finished, aggregate the results into a summary.
* This summary includes:
* - Which tasks passed or failed (with model and categories).
* - Category-wise success percentages.
* - Model-wise success percentages.
*
* The summary is written to `eval-summary.json` for further analysis.
*/
const generateSummary = async (
results: SummaryResult[],
experimentName: string,
) => {
// Determine passed testcases (those with _success: true)
const passed = results
.filter((r) => r.output._success)
.map((r) => ({
eval: r.input.name,
model: r.input.modelName,
categories: tasksByName[r.input.name].categories,
}));

// Determine failed testcases (those with _success: false)
const failed = results
.filter((r) => !r.output._success)
.map((r) => ({
eval: r.input.name,
model: r.input.modelName,
categories: tasksByName[r.input.name].categories,
}));

// Calculate success counts for each category
const categorySuccessCounts: Record<
string,
{ total: number; success: number }
> = {};
for (const taskName of Object.keys(tasksByName)) {
const taskCategories = tasksByName[taskName].categories;
const taskResults = results.filter((r) => r.input.name === taskName);
const successCount = taskResults.filter((r) => r.output._success).length;

for (const cat of taskCategories) {
if (!categorySuccessCounts[cat]) {
categorySuccessCounts[cat] = { total: 0, success: 0 };
}
categorySuccessCounts[cat].total += taskResults.length;
categorySuccessCounts[cat].success += successCount;
}
}

// Compute percentage success per category
const categories: Record<string, number> = {};
for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
categories[cat] = Math.round((counts.success / counts.total) * 100);
}

// Compute percentage success per model
const models: Record<string, number> = {};
const allModels = [...new Set(results.map((r) => r.input.modelName))];
for (const model of allModels) {
const modelResults = results.filter((r) => r.input.modelName === model);
const successCount = modelResults.filter((r) => r.output._success).length;
models[model] = Math.round((successCount / modelResults.length) * 100);
}

// Format and write the summary to a JSON file
const formattedSummary = {
experimentName,
passed,
failed,
categories,
models,
};

fs.writeFileSync(
"eval-summary.json",
JSON.stringify(formattedSummary, null, 2),
);
console.log("Evaluation summary written to eval-summary.json");
};

/**
* generateFilteredTestcases:
* Based on the chosen filters (category or specific eval name) and environment,
Expand Down Expand Up @@ -187,8 +107,25 @@ const generateFilteredTestcases = (): Testcase[] => {
currentModels,
);

// Create a list of all testcases using the determined task names and models
let allTestcases = currentModels.flatMap((model) =>
  // Special handling: fan out the GAIA dataset for agent/webarena_gaia
const isGAIATaskIncluded = taskNamesToRun.includes("agent/webarena_gaia");
// Special handling: fan out WebVoyager dataset for agent/webvoyager
const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager");

let allTestcases: Testcase[] = [];

if (isGAIATaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webarena_gaia");
allTestcases.push(...buildGAIATestcases(currentModels));
}

if (isWebVoyagerTaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
allTestcases.push(...buildWebVoyagerTestcases(currentModels));
}

// Create a list of all remaining testcases using the determined task names and models
const regularTestcases = currentModels.flatMap((model) =>
taskNamesToRun.map((testName) => ({
input: { name: testName, modelName: model as AvailableModel },
name: testName,
Expand All @@ -202,12 +139,13 @@ const generateFilteredTestcases = (): Testcase[] => {
metadata: {
model: model as AvailableModel,
test: testName,
categories: tasksConfig.find((t) => t.name === testName)?.categories,
},
expected: true,
})),
);

allTestcases = [...allTestcases, ...regularTestcases];

// This filtering step might now be redundant if taskNamesToRun is already filtered
if (filterByCategory) {
allTestcases = allTestcases.filter((testcase) =>
Expand All @@ -227,7 +165,7 @@ const generateFilteredTestcases = (): Testcase[] => {
allTestcases
.map(
(t, i) =>
`${i}: ${t.name} (${t.input.modelName}): ${t.metadata.categories}`,
`${i}: ${t.name} (${t.input.modelName}): ${tasksByName[t.name].categories}`,
)
.join("\n"),
);
Expand Down Expand Up @@ -266,7 +204,7 @@ const generateFilteredTestcases = (): Testcase[] => {
experimentName,
data: generateFilteredTestcases,
// Each test is a function that runs the corresponding task module
task: async (input: { name: string; modelName: AvailableModel }) => {
task: async (input: EvalInput) => {
const logger = new EvalLogger();
try {
// Dynamically import the task based on its name
Expand Down Expand Up @@ -367,6 +305,8 @@ const generateFilteredTestcases = (): Testcase[] => {
modelName: input.modelName,
});
}
// Attach per-test parameters (for data-driven tasks)
taskInput.taskParams = input.params;
let result;
try {
result = await taskFunction(taskInput);
Expand Down
120 changes: 120 additions & 0 deletions evals/suites/gaia.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import fs from "fs";
import path from "path";
import type { Testcase, EvalInput } from "@/types/evals";
import type { AvailableModel } from "@/types/model";
import { tasksConfig } from "../taskConfig";

/**
 * Fans the GAIA web dataset (JSONL) out into one Testcase per (model, row).
 *
 * Environment overrides:
 * - EVAL_GAIA_FILE: path to the JSONL file (defaults to the bundled dataset).
 * - EVAL_GAIA_LEVEL: keep only rows with this `Level`.
 * - EVAL_GAIA_LIMIT: cap on rows taken in file order (default 25).
 * - EVAL_GAIA_SAMPLE: instead of the cap, uniformly sample this many rows.
 *
 * @param models - Model names; every selected row is paired with each model.
 * @returns Testcases for the `agent/webarena_gaia` eval (empty on read failure).
 */
export const buildGAIATestcases = (models: string[]): Testcase[] => {
  const gaiaFilePath =
    process.env.EVAL_GAIA_FILE ||
    path.join(__dirname, "..", "datasets", "gaia", "GAIA_web.jsonl");

  let gaiaLines: string[] = [];
  try {
    const content = fs.readFileSync(gaiaFilePath, "utf-8");
    gaiaLines = content.split(/\r?\n/).filter((l) => l.trim().length > 0);
  } catch (e) {
    // Missing/unreadable dataset degrades to an empty suite rather than crashing.
    console.warn(
      `Could not read GAIA file at ${gaiaFilePath}. Set EVAL_GAIA_FILE to override. Error: ${e instanceof Error ? e.message : String(e)}`,
    );
    gaiaLines = [];
  }

  // Parse a numeric env var; returns undefined when unset or not a finite
  // number. Previously a malformed value produced NaN, and `length >= NaN`
  // is always false — silently disabling the row limit entirely.
  const envInt = (name: string): number | undefined => {
    const raw = process.env[name];
    if (!raw) return undefined;
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : undefined;
  };

  const levelFilter = envInt("EVAL_GAIA_LEVEL");
  const maxCases = envInt("EVAL_GAIA_LIMIT") ?? 25;
  const sampleCount = envInt("EVAL_GAIA_SAMPLE");

  type GaiaRow = {
    id: string;
    Level?: number;
    web: string;
    ques: string;
    [key: string]: unknown;
  };

  // Collect rows that have the required fields and pass the level filter.
  const gaiaRows: GaiaRow[] = [];
  const candidates: GaiaRow[] = [];
  for (const line of gaiaLines) {
    try {
      const parsed = JSON.parse(line) as GaiaRow;
      if (
        typeof parsed.id === "string" &&
        typeof parsed.web === "string" &&
        typeof parsed.ques === "string"
      ) {
        if (!levelFilter || parsed.Level === levelFilter) {
          candidates.push(parsed);
        }
      }
    } catch {
      // skip invalid JSONL lines
    }
  }

  // EVAL_GAIA_SAMPLE takes precedence over the plain limit.
  if (sampleCount && sampleCount > 0) {
    gaiaRows.push(...sampleUniform(candidates, sampleCount));
  } else {
    for (const row of candidates) {
      if (gaiaRows.length >= maxCases) break;
      gaiaRows.push(row);
    }
  }

  const allTestcases: Testcase[] = [];
  for (const model of models) {
    for (const row of gaiaRows) {
      // "Final answer" contains a space, so it is read via index access.
      const finalAnswer = (row as Record<string, unknown>)[
        "Final answer"
      ] as unknown;
      const input: EvalInput = {
        name: "agent/webarena_gaia",
        modelName: model as AvailableModel,
        params: {
          id: row.id,
          level: row.Level,
          web: row.web,
          ques: row.ques,
          expected: typeof finalAnswer === "string" ? finalAnswer : undefined,
        },
      };
      allTestcases.push({
        input,
        name: input.name,
        tags: [
          model,
          input.name,
          ...(
            tasksConfig.find((t) => t.name === input.name)?.categories || []
          ).map((x) => `category/${x}`),
          `gaia/id/${row.id}`,
          row.Level ? `gaia/level/${row.Level}` : "gaia/level/unknown",
        ],
        metadata: {
          model: model as AvailableModel,
          test: `${input.name}:${row.id}`,
        },
        expected: true,
      });
    }
  }

  return allTestcases;
};

/**
 * Draws k elements uniformly at random, without replacement, from arr.
 * Returns a shallow copy of the whole array when k >= arr.length.
 * The input array is never mutated.
 */
function sampleUniform<T>(arr: T[], k: number): T[] {
  if (k >= arr.length) return arr.slice();
  // Fisher-Yates shuffle on a copy, then keep the first k elements.
  const shuffled = arr.slice();
  for (let i = shuffled.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
  }
  return shuffled.slice(0, k);
}
Loading