Merged
Changes from 11 commits
Commits
26 commits
2e3e151 work on adding voyager to evals (filip-michalsky, Aug 12, 2025)
d272670 add gaia evals (filip-michalsky, Aug 12, 2025)
1e27e07 Merge branch 'main' into fm/stg-661-add-web-voyager (filip-michalsky, Aug 13, 2025)
71cfcb0 refactor (filip-michalsky, Aug 13, 2025)
16e3a52 add sampling of suites (filip-michalsky, Aug 13, 2025)
0461322 updating evals (filip-michalsky, Aug 14, 2025)
4aaca65 linting (filip-michalsky, Aug 14, 2025)
1922e38 Merge branch 'main' into fm/stg-661-add-web-voyager (filip-michalsky, Aug 14, 2025)
b9b1702 remove logs, small updates (filip-michalsky, Aug 15, 2025)
22c9fe7 remove logs (filip-michalsky, Aug 15, 2025)
dcaeb83 revert unwanted change (filip-michalsky, Aug 15, 2025)
df880ac more revert (filip-michalsky, Aug 15, 2025)
087f8cd load env at root (filip-michalsky, Aug 16, 2025)
7c1d5a0 add changeset (filip-michalsky, Aug 16, 2025)
bd7352b update (filip-michalsky, Aug 16, 2025)
4be8864 update ci (filip-michalsky, Aug 16, 2025)
b78d824 ci update (filip-michalsky, Aug 16, 2025)
70a7b7c Merge main branch (filip-michalsky, Aug 21, 2025)
9a7c057 Merge main branch (filip-michalsky, Aug 21, 2025)
f9817c6 Merge fm/stg-670-add-agent-to-ci into fm/stg-661-add-web-voyager (filip-michalsky, Aug 21, 2025)
c04a226 Update .github/workflows/ci.yml (filip-michalsky, Aug 27, 2025)
bc85c12 lint (filip-michalsky, Aug 27, 2025)
5081d3b exclude gaia and voyager from agent ci (filip-michalsky, Aug 27, 2025)
ce6abd4 update stagehandInitType to send eval inputs in EvalInput (filip-michalsky, Aug 27, 2025)
f5abf91 add external agent benchmarks as a category to CI (filip-michalsky, Aug 27, 2025)
51246f6 Update .github/workflows/ci.yml (filip-michalsky, Aug 27, 2025)
2 changes: 1 addition & 1 deletion .changeset/pretty-jokes-own.md
@@ -2,4 +2,4 @@
"@browserbasehq/stagehand": patch
---

-Properly handle images in evaluator + clean up response parsing logic
+Properly handle images in evaluator + clean up response parsing logic
69 changes: 69 additions & 0 deletions evals/core/summary.ts
@@ -0,0 +1,69 @@
import fs from "fs";
import { tasksByName } from "../taskConfig";
import type { SummaryResult } from "@/types/evals";

export const generateSummary = async (
  results: SummaryResult[],
  experimentName: string,
) => {
  const passed = results
    .filter((r) => r.output._success)
    .map((r) => ({
      eval: r.input.name,
      model: r.input.modelName,
      categories: tasksByName[r.input.name].categories,
    }));

  const failed = results
    .filter((r) => !r.output._success)
    .map((r) => ({
      eval: r.input.name,
      model: r.input.modelName,
      categories: tasksByName[r.input.name].categories,
    }));

  const categorySuccessCounts: Record<
    string,
    { total: number; success: number }
  > = {};
  for (const taskName of Object.keys(tasksByName)) {
    const taskCategories = tasksByName[taskName].categories;
    const taskResults = results.filter((r) => r.input.name === taskName);
    const successCount = taskResults.filter((r) => r.output._success).length;

    for (const cat of taskCategories) {
      if (!categorySuccessCounts[cat]) {
        categorySuccessCounts[cat] = { total: 0, success: 0 };
      }
      categorySuccessCounts[cat].total += taskResults.length;
      categorySuccessCounts[cat].success += successCount;
    }
  }

  const categories: Record<string, number> = {};
  for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
    categories[cat] = Math.round((counts.success / counts.total) * 100);
  }

  const models: Record<string, number> = {};
  const allModels = [...new Set(results.map((r) => r.input.modelName))];
  for (const model of allModels) {
    const modelResults = results.filter((r) => r.input.modelName === model);
    const successCount = modelResults.filter((r) => r.output._success).length;
    models[model] = Math.round((successCount / modelResults.length) * 100);
  }

  const formattedSummary = {
    experimentName,
    passed,
    failed,
    categories,
    models,
  };

  fs.writeFileSync(
    "eval-summary.json",
    JSON.stringify(formattedSummary, null, 2),
  );
  console.log("Evaluation summary written to eval-summary.json");
};
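
For orientation, a hypothetical invocation of the new module; the result literal below mirrors only the fields generateSummary actually reads (r.input.name, r.input.modelName, r.output._success), and the model string is a placeholder, not a value from this PR:

import { generateSummary } from "./evals/core/summary";
import type { SummaryResult } from "@/types/evals";

// Two results for task names that exist in evals.config.json; the cast keeps
// the sketch compact rather than spelling out the full SummaryResult shape.
const results = [
  {
    input: { name: "agent/webvoyager", modelName: "example-provider/example-model" },
    output: { _success: true },
  },
  {
    input: { name: "agent/webarena_gaia", modelName: "example-provider/example-model" },
    output: { _success: false },
  },
] as SummaryResult[];

await generateSummary(results, "my-experiment");
// eval-summary.json now holds the passed/failed lists plus per-category and
// per-model success percentages, rounded to whole numbers.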
90 changes: 90 additions & 0 deletions evals/datasets/gaia/GAIA_web.jsonl

Large diffs are not rendered by default.

643 changes: 643 additions & 0 deletions evals/datasets/webvoyager/WebVoyager_data.jsonl

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions evals/evals.config.json
@@ -463,5 +463,15 @@
"name": "iframe_scroll",
"categories": ["act"]
}
,
{
"name": "agent/webarena_gaia",
"categories": ["agent"]
}
,
{
"name": "agent/webvoyager",
"categories": ["agent"]
}
]
}
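
taskConfig.ts itself is not part of this diff; as a plausible sketch (the root "tasks" key and the exact types are assumptions, not the repository's actual code), tasksByName could be derived from these entries like so:

import config from "./evals.config.json";

type TaskEntry = { name: string; categories: string[] };

// Assumed file shape: { "tasks": TaskEntry[] }. The root key is not visible
// in this diff; only the array entries are.
const tasksConfig: TaskEntry[] = (config as { tasks: TaskEntry[] }).tasks;

export const tasksByName: Record<string, { categories: string[] }> =
  Object.fromEntries(
    tasksConfig.map((t) => [t.name, { categories: t.categories }]),
  );

// e.g. tasksByName["agent/webvoyager"].categories evaluates to ["agent"]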
133 changes: 43 additions & 90 deletions evals/index.eval.ts
@@ -12,7 +12,6 @@
 * - Runs each selected task against each selected model in parallel, collecting results.
 * - Saves a summary of the evaluation results to `eval-summary.json`.
 */
import fs from "fs";
import path from "path";
import process from "process";
import {
@@ -24,7 +23,7 @@ import { generateExperimentName } from "./utils";
import { exactMatch, errorMatch } from "./scoring";
import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
import { SummaryResult, Testcase } from "@/types/evals";
import { SummaryResult, Testcase, EvalInput } from "@/types/evals";
import { EvalLogger } from "./logger";
import { AvailableModel, LLMClient } from "@browserbasehq/stagehand";
import { env } from "./env";
@@ -33,10 +32,14 @@ import { StagehandEvalError } from "@/types/stagehandErrors";
import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
import OpenAI from "openai";
import { initStagehand } from "./initStagehand";
import { AgentProvider } from "@/lib/agent/AgentProvider";
import { AISdkClient } from "@/examples/external_clients/aisdk";
import { getAISDKLanguageModel } from "@/lib/llm/LLMProvider";
import { loadApiKeyFromEnv } from "@/lib/utils";
import { LogLine } from "@/types/log";
import { generateSummary } from "./core/summary";
import { buildGAIATestcases } from "./suites/gaia";
import { buildWebVoyagerTestcases } from "./suites/webvoyager";

dotenv.config();

@@ -54,88 +57,6 @@ const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT

const USE_API: boolean = (process.env.USE_API ?? "").toLowerCase() === "true";

/**
 * generateSummary:
 * After all evaluations have finished, aggregate the results into a summary.
 * This summary includes:
 * - Which tasks passed or failed (with model and categories).
 * - Category-wise success percentages.
 * - Model-wise success percentages.
 *
 * The summary is written to `eval-summary.json` for further analysis.
 */
const generateSummary = async (
  results: SummaryResult[],
  experimentName: string,
) => {
  // Determine passed testcases (those with _success: true)
  const passed = results
    .filter((r) => r.output._success)
    .map((r) => ({
      eval: r.input.name,
      model: r.input.modelName,
      categories: tasksByName[r.input.name].categories,
    }));

  // Determine failed testcases (those with _success: false)
  const failed = results
    .filter((r) => !r.output._success)
    .map((r) => ({
      eval: r.input.name,
      model: r.input.modelName,
      categories: tasksByName[r.input.name].categories,
    }));

  // Calculate success counts for each category
  const categorySuccessCounts: Record<
    string,
    { total: number; success: number }
  > = {};
  for (const taskName of Object.keys(tasksByName)) {
    const taskCategories = tasksByName[taskName].categories;
    const taskResults = results.filter((r) => r.input.name === taskName);
    const successCount = taskResults.filter((r) => r.output._success).length;

    for (const cat of taskCategories) {
      if (!categorySuccessCounts[cat]) {
        categorySuccessCounts[cat] = { total: 0, success: 0 };
      }
      categorySuccessCounts[cat].total += taskResults.length;
      categorySuccessCounts[cat].success += successCount;
    }
  }

  // Compute percentage success per category
  const categories: Record<string, number> = {};
  for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
    categories[cat] = Math.round((counts.success / counts.total) * 100);
  }

  // Compute percentage success per model
  const models: Record<string, number> = {};
  const allModels = [...new Set(results.map((r) => r.input.modelName))];
  for (const model of allModels) {
    const modelResults = results.filter((r) => r.input.modelName === model);
    const successCount = modelResults.filter((r) => r.output._success).length;
    models[model] = Math.round((successCount / modelResults.length) * 100);
  }

  // Format and write the summary to a JSON file
  const formattedSummary = {
    experimentName,
    passed,
    failed,
    categories,
    models,
  };

  fs.writeFileSync(
    "eval-summary.json",
    JSON.stringify(formattedSummary, null, 2),
  );
  console.log("Evaluation summary written to eval-summary.json");
};

/**
 * generateFilteredTestcases:
 * Based on the chosen filters (category or specific eval name) and environment,
@@ -187,8 +108,25 @@ const generateFilteredTestcases = (): Testcase[] => {
    currentModels,
  );

  // Create a list of all testcases using the determined task names and models
  let allTestcases = currentModels.flatMap((model) =>
  // Special handling: fan out the GAIA dataset for agent/webarena_gaia
  const isGAIATaskIncluded = taskNamesToRun.includes("agent/webarena_gaia");
  // Special handling: fan out the WebVoyager dataset for agent/webvoyager
  const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager");

  let allTestcases: Testcase[] = [];

  if (isGAIATaskIncluded) {
    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webarena_gaia");
    allTestcases.push(...buildGAIATestcases(currentModels));
  }

  if (isWebVoyagerTaskIncluded) {
    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
    allTestcases.push(...buildWebVoyagerTestcases(currentModels));
  }

  // Create a list of all remaining testcases using the determined task names and models
  const regularTestcases = currentModels.flatMap((model) =>
    taskNamesToRun.map((testName) => ({
      input: { name: testName, modelName: model as AvailableModel },
      name: testName,
@@ -202,12 +140,13 @@ metadata: {
        model: model as AvailableModel,
        test: testName,
        categories: tasksConfig.find((t) => t.name === testName)?.categories,
      },
      expected: true,
    })),
  );

  allTestcases = [...allTestcases, ...regularTestcases];

  // This filtering step might now be redundant if taskNamesToRun is already filtered
  if (filterByCategory) {
    allTestcases = allTestcases.filter((testcase) =>
@@ -227,7 +166,7 @@ const generateFilteredTestcases = (): Testcase[] => {
    allTestcases
      .map(
        (t, i) =>
          `${i}: ${t.name} (${t.input.modelName}): ${t.metadata.categories}`,
          `${i}: ${t.name} (${t.input.modelName}): ${tasksByName[t.name].categories}`,
      )
      .join("\n"),
  );
@@ -266,7 +205,7 @@
  experimentName,
  data: generateFilteredTestcases,
  // Each test is a function that runs the corresponding task module
  task: async (input: { name: string; modelName: AvailableModel }) => {
  task: async (input: EvalInput) => {
    const logger = new EvalLogger();
    try {
      // Dynamically import the task based on its name
@@ -323,7 +262,19 @@
      let taskInput: Awaited<ReturnType<typeof initStagehand>>;

      if (USE_API) {
        const [provider] = input.modelName.split("/") as [string, string];
        // Derive provider from model. Prefer explicit "provider/model"; otherwise infer for agent models
        let provider: string;
        if (input.modelName.includes("/")) {
          provider = input.modelName.split("/")[0];
        } else {
          // Fall back to agent provider inference for bare agent model names (e.g., "computer-use-preview")
          try {
            provider = AgentProvider.getAgentProvider(input.modelName);
          } catch {
            // If not an agent model, leave provider undefined to trigger helpful error below
            provider = undefined as unknown as string;
          }
        }

        const logFn = (line: LogLine): void => logger.log(line);
        const apiKey = loadApiKeyFromEnv(provider, logFn);
@@ -367,6 +318,8 @@
          modelName: input.modelName,
        });
      }
      // Attach per-test parameters (for data-driven tasks)
      taskInput.taskParams = input.params;
      let result;
      try {
        result = await taskFunction(taskInput);
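
The suite builders imported above (evals/suites/gaia.ts and evals/suites/webvoyager.ts) are not shown in this diff. A minimal sketch of what a dataset-driven builder could look like, assuming WebVoyager-style JSONL rows with id/ques/web fields and a tags/metadata shape matching the regular testcases; everything here is illustrative, not the PR's actual implementation:

import fs from "fs";
import path from "path";
import type { Testcase } from "@/types/evals";
import { AvailableModel } from "@browserbasehq/stagehand";

// Assumed row shape, modeled on the public WebVoyager dataset.
interface WebVoyagerRow {
  id: string;
  ques: string;
  web: string;
}

export const buildWebVoyagerTestcases = (models: string[]): Testcase[] => {
  const file = path.join(
    __dirname,
    "../datasets/webvoyager/WebVoyager_data.jsonl",
  );
  const rows: WebVoyagerRow[] = fs
    .readFileSync(file, "utf8")
    .split("\n")
    .filter((line) => line.trim().length > 0)
    .map((line) => JSON.parse(line));

  // Fan out one testcase per (model, dataset row). Every row maps to the
  // single agent/webvoyager task; the row rides along in input.params and
  // reaches the task as taskInput.taskParams (see index.eval.ts above).
  return models.flatMap((model) =>
    rows.map((row) => ({
      input: {
        name: "agent/webvoyager",
        modelName: model as AvailableModel,
        params: row,
      },
      name: "agent/webvoyager",
      tags: [model, `webvoyager:${row.id}`],
      metadata: {
        model: model as AvailableModel,
        test: "agent/webvoyager",
      },
      expected: true,
    })),
  );
};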
9 changes: 9 additions & 0 deletions evals/initStagehand.ts
@@ -101,6 +101,15 @@

  const { debugUrl, sessionUrl } = await stagehand.init();

  // Increase runtime verbosity for evals explicitly
  try {
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore - setVerbosity is available on Stagehand logger
    stagehand.logger?.setVerbosity?.(2);
  } catch {
    // ignore if logger does not expose setVerbosity
  }

  // Set navigation timeout to 60 seconds for evaluations
  stagehand.context.setDefaultNavigationTimeout(60_000);

Expand Down