Skip to content

work on adding voyager to evals #959

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .changeset/pretty-jokes-own.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
"@browserbasehq/stagehand": patch
---

Properly handle images in evaluator + clean up response parsing logic
Properly handle images in evaluator + clean up response parsing logic
69 changes: 69 additions & 0 deletions evals/core/summary.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import fs from "fs";
import { tasksByName } from "../taskConfig";
import type { SummaryResult } from "@/types/evals";

/**
 * Aggregates finished evaluation results into a summary and writes it to
 * `eval-summary.json`. The summary contains:
 * - passed / failed testcases (eval name, model, categories),
 * - per-category success percentages,
 * - per-model success percentages.
 *
 * @param results - One entry per executed testcase, with `_success` set.
 * @param experimentName - Label recorded in the summary for traceability.
 */
export const generateSummary = async (
  results: SummaryResult[],
  experimentName: string,
) => {
  // Annotate a result with its eval name, model, and configured categories.
  // Optional-chain the lookup: data-driven tasks could reference a name that
  // is missing from taskConfig, and the summary should not crash on it.
  const describe = (r: SummaryResult) => ({
    eval: r.input.name,
    model: r.input.modelName,
    categories: tasksByName[r.input.name]?.categories ?? [],
  });

  const passed = results.filter((r) => r.output._success).map(describe);
  const failed = results.filter((r) => !r.output._success).map(describe);

  // Accumulate per-category totals: every result of a task counts toward each
  // of that task's categories.
  const categorySuccessCounts: Record<
    string,
    { total: number; success: number }
  > = {};
  for (const taskName of Object.keys(tasksByName)) {
    const taskCategories = tasksByName[taskName].categories;
    const taskResults = results.filter((r) => r.input.name === taskName);
    const successCount = taskResults.filter((r) => r.output._success).length;

    for (const cat of taskCategories) {
      if (!categorySuccessCounts[cat]) {
        categorySuccessCounts[cat] = { total: 0, success: 0 };
      }
      categorySuccessCounts[cat].total += taskResults.length;
      categorySuccessCounts[cat].success += successCount;
    }
  }

  // Percentage success per category. Guard the division: a category whose
  // tasks produced no results has total 0, and 0/0 would yield NaN.
  const categories: Record<string, number> = {};
  for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
    categories[cat] =
      counts.total > 0
        ? Math.round((counts.success / counts.total) * 100)
        : 0;
  }

  // Percentage success per model. Every model listed here comes from at
  // least one result, so modelResults.length is always >= 1.
  const models: Record<string, number> = {};
  const allModels = [...new Set(results.map((r) => r.input.modelName))];
  for (const model of allModels) {
    const modelResults = results.filter((r) => r.input.modelName === model);
    const successCount = modelResults.filter((r) => r.output._success).length;
    models[model] = Math.round((successCount / modelResults.length) * 100);
  }

  const formattedSummary = {
    experimentName,
    passed,
    failed,
    categories,
    models,
  };

  fs.writeFileSync(
    "eval-summary.json",
    JSON.stringify(formattedSummary, null, 2),
  );
  console.log("Evaluation summary written to eval-summary.json");
};
90 changes: 90 additions & 0 deletions evals/datasets/gaia/GAIA_web.jsonl

Large diffs are not rendered by default.

643 changes: 643 additions & 0 deletions evals/datasets/webvoyager/WebVoyager_data.jsonl

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions evals/evals.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -419,5 +419,15 @@
"name": "agent/sign_in",
"categories": ["agent"]
}
,
{
"name": "agent/webarena_gaia",
"categories": ["agent"]
}
,
{
"name": "agent/webvoyager",
"categories": ["agent"]
}
]
}
118 changes: 29 additions & 89 deletions evals/index.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
* - Runs each selected task against each selected model in parallel, collecting results.
* - Saves a summary of the evaluation results to `eval-summary.json`.
*/
import fs from "fs";
import path from "path";
import process from "process";
import {
Expand All @@ -24,7 +23,7 @@ import { generateExperimentName } from "./utils";
import { exactMatch, errorMatch } from "./scoring";
import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
import { SummaryResult, Testcase } from "@/types/evals";
import { SummaryResult, Testcase, EvalInput } from "@/types/evals";
import { EvalLogger } from "./logger";
import { AvailableModel, LLMClient } from "@browserbasehq/stagehand";
import { env } from "./env";
Expand All @@ -37,6 +36,9 @@ import { AISdkClient } from "@/examples/external_clients/aisdk";
import { getAISDKLanguageModel } from "@/lib/llm/LLMProvider";
import { loadApiKeyFromEnv } from "@/lib/utils";
import { LogLine } from "@/types/log";
import { generateSummary } from "./core/summary";
import { buildGAIATestcases } from "./suites/gaia";
import { buildWebVoyagerTestcases } from "./suites/webvoyager";

dotenv.config();

Expand All @@ -54,88 +56,6 @@ const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT

const USE_API: boolean = (process.env.USE_API ?? "").toLowerCase() === "true";

/**
* generateSummary:
* After all evaluations have finished, aggregate the results into a summary.
* This summary includes:
* - Which tasks passed or failed (with model and categories).
* - Category-wise success percentages.
* - Model-wise success percentages.
*
* The summary is written to `eval-summary.json` for further analysis.
*/
const generateSummary = async (
results: SummaryResult[],
experimentName: string,
) => {
// Determine passed testcases (those with _success: true)
const passed = results
.filter((r) => r.output._success)
.map((r) => ({
eval: r.input.name,
model: r.input.modelName,
categories: tasksByName[r.input.name].categories,
}));

// Determine failed testcases (those with _success: false)
const failed = results
.filter((r) => !r.output._success)
.map((r) => ({
eval: r.input.name,
model: r.input.modelName,
categories: tasksByName[r.input.name].categories,
}));

// Calculate success counts for each category
const categorySuccessCounts: Record<
string,
{ total: number; success: number }
> = {};
for (const taskName of Object.keys(tasksByName)) {
const taskCategories = tasksByName[taskName].categories;
const taskResults = results.filter((r) => r.input.name === taskName);
const successCount = taskResults.filter((r) => r.output._success).length;

for (const cat of taskCategories) {
if (!categorySuccessCounts[cat]) {
categorySuccessCounts[cat] = { total: 0, success: 0 };
}
categorySuccessCounts[cat].total += taskResults.length;
categorySuccessCounts[cat].success += successCount;
}
}

// Compute percentage success per category
const categories: Record<string, number> = {};
for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
categories[cat] = Math.round((counts.success / counts.total) * 100);
}

// Compute percentage success per model
const models: Record<string, number> = {};
const allModels = [...new Set(results.map((r) => r.input.modelName))];
for (const model of allModels) {
const modelResults = results.filter((r) => r.input.modelName === model);
const successCount = modelResults.filter((r) => r.output._success).length;
models[model] = Math.round((successCount / modelResults.length) * 100);
}

// Format and write the summary to a JSON file
const formattedSummary = {
experimentName,
passed,
failed,
categories,
models,
};

fs.writeFileSync(
"eval-summary.json",
JSON.stringify(formattedSummary, null, 2),
);
console.log("Evaluation summary written to eval-summary.json");
};

/**
* generateFilteredTestcases:
* Based on the chosen filters (category or specific eval name) and environment,
Expand Down Expand Up @@ -187,8 +107,25 @@ const generateFilteredTestcases = (): Testcase[] => {
currentModels,
);

// Create a list of all testcases using the determined task names and models
let allTestcases = currentModels.flatMap((model) =>
  // Special handling: fan out the GAIA dataset for agent/webarena_gaia
const isGAIATaskIncluded = taskNamesToRun.includes("agent/webarena_gaia");
// Special handling: fan out WebVoyager dataset for agent/webvoyager
const isWebVoyagerTaskIncluded = taskNamesToRun.includes("agent/webvoyager");

let allTestcases: Testcase[] = [];

if (isGAIATaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webarena_gaia");
allTestcases.push(...buildGAIATestcases(currentModels));
}

if (isWebVoyagerTaskIncluded) {
taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webvoyager");
allTestcases.push(...buildWebVoyagerTestcases(currentModels));
}

// Create a list of all remaining testcases using the determined task names and models
const regularTestcases = currentModels.flatMap((model) =>
taskNamesToRun.map((testName) => ({
input: { name: testName, modelName: model as AvailableModel },
name: testName,
Expand All @@ -202,12 +139,13 @@ const generateFilteredTestcases = (): Testcase[] => {
metadata: {
model: model as AvailableModel,
test: testName,
categories: tasksConfig.find((t) => t.name === testName)?.categories,
},
expected: true,
})),
);

allTestcases = [...allTestcases, ...regularTestcases];

// This filtering step might now be redundant if taskNamesToRun is already filtered
if (filterByCategory) {
allTestcases = allTestcases.filter((testcase) =>
Expand All @@ -227,7 +165,7 @@ const generateFilteredTestcases = (): Testcase[] => {
allTestcases
.map(
(t, i) =>
`${i}: ${t.name} (${t.input.modelName}): ${t.metadata.categories}`,
`${i}: ${t.name} (${t.input.modelName}): ${tasksByName[t.name].categories}`,
)
.join("\n"),
);
Expand Down Expand Up @@ -266,7 +204,7 @@ const generateFilteredTestcases = (): Testcase[] => {
experimentName,
data: generateFilteredTestcases,
// Each test is a function that runs the corresponding task module
task: async (input: { name: string; modelName: AvailableModel }) => {
task: async (input: EvalInput) => {
const logger = new EvalLogger();
try {
// Dynamically import the task based on its name
Expand Down Expand Up @@ -367,6 +305,8 @@ const generateFilteredTestcases = (): Testcase[] => {
modelName: input.modelName,
});
}
// Attach per-test parameters (for data-driven tasks)
taskInput.taskParams = input.params;
let result;
try {
result = await taskFunction(taskInput);
Expand Down
120 changes: 120 additions & 0 deletions evals/suites/gaia.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import fs from "fs";
import path from "path";
import type { Testcase, EvalInput } from "@/types/evals";
import type { AvailableModel } from "@/types/model";
import { tasksConfig } from "../taskConfig";

/**
 * Fans the GAIA web dataset (JSONL) out into one Testcase per (model, row).
 *
 * Environment overrides:
 * - EVAL_GAIA_FILE: path to the JSONL file (defaults to the bundled dataset).
 * - EVAL_GAIA_LEVEL: keep only rows with this `Level`.
 * - EVAL_GAIA_LIMIT: cap on rows taken in file order (default 25).
 * - EVAL_GAIA_SAMPLE: instead of the cap, uniformly sample this many rows.
 *
 * @param models - Model names; every selected row is paired with each model.
 * @returns Testcases for the `agent/webarena_gaia` eval (empty on read failure).
 */
export const buildGAIATestcases = (models: string[]): Testcase[] => {
  const gaiaFilePath =
    process.env.EVAL_GAIA_FILE ||
    path.join(__dirname, "..", "datasets", "gaia", "GAIA_web.jsonl");

  let gaiaLines: string[] = [];
  try {
    const content = fs.readFileSync(gaiaFilePath, "utf-8");
    gaiaLines = content.split(/\r?\n/).filter((l) => l.trim().length > 0);
  } catch (e) {
    // Missing/unreadable dataset degrades to an empty suite rather than crashing.
    console.warn(
      `Could not read GAIA file at ${gaiaFilePath}. Set EVAL_GAIA_FILE to override. Error: ${e instanceof Error ? e.message : String(e)}`,
    );
    gaiaLines = [];
  }

  // Parse a numeric env var; returns undefined when unset or not a finite
  // number. Previously a malformed value produced NaN, and `length >= NaN`
  // is always false — silently disabling the row limit entirely.
  const envInt = (name: string): number | undefined => {
    const raw = process.env[name];
    if (!raw) return undefined;
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : undefined;
  };

  const levelFilter = envInt("EVAL_GAIA_LEVEL");
  const maxCases = envInt("EVAL_GAIA_LIMIT") ?? 25;
  const sampleCount = envInt("EVAL_GAIA_SAMPLE");

  type GaiaRow = {
    id: string;
    Level?: number;
    web: string;
    ques: string;
    [key: string]: unknown;
  };

  // Collect rows that have the required fields and pass the level filter.
  const gaiaRows: GaiaRow[] = [];
  const candidates: GaiaRow[] = [];
  for (const line of gaiaLines) {
    try {
      const parsed = JSON.parse(line) as GaiaRow;
      if (
        typeof parsed.id === "string" &&
        typeof parsed.web === "string" &&
        typeof parsed.ques === "string"
      ) {
        if (!levelFilter || parsed.Level === levelFilter) {
          candidates.push(parsed);
        }
      }
    } catch {
      // skip invalid JSONL lines
    }
  }

  // EVAL_GAIA_SAMPLE takes precedence over the plain limit.
  if (sampleCount && sampleCount > 0) {
    gaiaRows.push(...sampleUniform(candidates, sampleCount));
  } else {
    for (const row of candidates) {
      if (gaiaRows.length >= maxCases) break;
      gaiaRows.push(row);
    }
  }

  const allTestcases: Testcase[] = [];
  for (const model of models) {
    for (const row of gaiaRows) {
      // "Final answer" contains a space, so it is read via index access.
      const finalAnswer = (row as Record<string, unknown>)[
        "Final answer"
      ] as unknown;
      const input: EvalInput = {
        name: "agent/webarena_gaia",
        modelName: model as AvailableModel,
        params: {
          id: row.id,
          level: row.Level,
          web: row.web,
          ques: row.ques,
          expected: typeof finalAnswer === "string" ? finalAnswer : undefined,
        },
      };
      allTestcases.push({
        input,
        name: input.name,
        tags: [
          model,
          input.name,
          ...(
            tasksConfig.find((t) => t.name === input.name)?.categories || []
          ).map((x) => `category/${x}`),
          `gaia/id/${row.id}`,
          row.Level ? `gaia/level/${row.Level}` : "gaia/level/unknown",
        ],
        metadata: {
          model: model as AvailableModel,
          test: `${input.name}:${row.id}`,
        },
        expected: true,
      });
    }
  }

  return allTestcases;
};

/**
 * Draws k elements uniformly at random, without replacement, from arr.
 * Returns a shallow copy of the whole array when k >= arr.length.
 * The input array is never mutated.
 */
function sampleUniform<T>(arr: T[], k: number): T[] {
  if (k >= arr.length) return arr.slice();
  // Fisher-Yates shuffle on a copy, then keep the first k elements.
  const shuffled = arr.slice();
  for (let i = shuffled.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
  }
  return shuffled.slice(0, k);
}
Loading