Commit 09b5e1e

filip-michalsky, claude, and miguelg719 authored
work on adding voyager to evals (#959)
## Why

Add WebVoyager and GAIA evaluation suites to benchmark Stagehand's web navigation and reasoning capabilities against industry-standard datasets.

## What Changed

- Added WebVoyager eval suite with 643 test cases for web navigation tasks
- Added GAIA eval suite with 90 test cases for general AI assistant tasks
- Refactored eval infrastructure to support sampling and filtering
- Created reusable utilities for JSONL parsing and test case generation
- Added configuration for new eval suites in `evals.config.json`

### Environment Variables

- `EVAL_WEBVOYAGER_SAMPLE`: Random sample size from the WebVoyager dataset
- `EVAL_WEBVOYAGER_LIMIT`: Max cases to run (default: 25)
- `EVAL_GAIA_SAMPLE`: Random sample size from the GAIA dataset
- `EVAL_GAIA_LIMIT`: Max cases to run (default: 25)
- `EVAL_GAIA_LEVEL`: Filter GAIA by difficulty level (1, 2, or 3)

### Sampling Strategy

The sampling implementation uses a Fisher-Yates shuffle for unbiased random selection when `SAMPLE` is specified; otherwise it takes the first `LIMIT` cases. This allows both deterministic (first N) and randomized (sample N) test runs (a sketch of this selection logic follows the commit message).

## Test Plan

```bash
# Test WebVoyager with OpenAI
EVAL_SUITE=webvoyager EVAL_WEBVOYAGER_SAMPLE=1 EVAL_MODEL=openai/gpt-4o-computer-use-preview pnpm run evals

# Test WebVoyager with Claude
EVAL_SUITE=webvoyager EVAL_WEBVOYAGER_SAMPLE=1 EVAL_MODEL=anthropic/claude-3-5-sonnet-20241022 pnpm run evals

# Test GAIA with OpenAI
EVAL_SUITE=gaia EVAL_GAIA_SAMPLE=1 EVAL_MODEL=openai/gpt-4o-computer-use-preview pnpm run evals

# Test GAIA with Claude
EVAL_SUITE=gaia EVAL_GAIA_SAMPLE=1 EVAL_MODEL=anthropic/claude-3-5-sonnet-20241022 pnpm run evals

# Verify existing evals still work
pnpm run evals
```

---------

Co-authored-by: Claude <[email protected]>
Co-authored-by: Miguel <[email protected]>
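For illustration, here is a minimal TypeScript sketch of the sampling strategy described above. It is not code from this PR: the `selectCases` helper and its option names are hypothetical, but the logic follows the description, using a Fisher-Yates shuffle when a sample size is given and the first `limit` cases otherwise.

```typescript
// Hypothetical helper illustrating the sampling strategy described above.
// Not part of this PR; the name and signature are assumptions.
interface SelectOptions {
  sample?: number; // e.g. EVAL_WEBVOYAGER_SAMPLE / EVAL_GAIA_SAMPLE
  limit?: number; // e.g. EVAL_WEBVOYAGER_LIMIT / EVAL_GAIA_LIMIT (default 25)
}

function selectCases<T>(cases: T[], { sample, limit = 25 }: SelectOptions): T[] {
  if (sample !== undefined) {
    // Fisher-Yates shuffle for unbiased random selection, then take the first `sample` cases.
    const shuffled = [...cases];
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return shuffled.slice(0, sample);
  }
  // Deterministic run: just take the first `limit` cases.
  return cases.slice(0, limit);
}

// Example: deterministically take the first 25, or randomly sample 3.
const firstN = selectCases(["a", "b", "c", "d", "e"], {});
const sampled = selectCases(["a", "b", "c", "d", "e"], { sample: 3 });
```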
1 parent c17ac41 commit 09b5e1e

File tree

17 files changed: +1397 −102 lines


.changeset/chilly-laws-smile.md

Lines changed: 5 additions & 0 deletions

```diff
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+add webvoyager evals
```

.github/workflows/ci.yml

Lines changed: 74 additions & 1 deletion

```diff
@@ -12,7 +12,7 @@ on:
 
env:
  EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-3-5-sonnet-latest"
-  EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract"
+  EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent"
  EVAL_MAX_CONCURRENCY: 25
  EVAL_TRIAL_COUNT: 5
 
@@ -29,6 +29,7 @@ jobs:
      run-act: ${{ steps.check-labels.outputs.run-act }}
      run-observe: ${{ steps.check-labels.outputs.run-observe }}
      run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
+      run-agent: ${{ steps.check-labels.outputs.run-agent }}
    steps:
      - id: check-labels
        run: |
@@ -40,6 +41,7 @@
            echo "run-act=true" >> $GITHUB_OUTPUT
            echo "run-observe=true" >> $GITHUB_OUTPUT
            echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
+            echo "run-agent=true" >> $GITHUB_OUTPUT
            exit 0
          fi
@@ -49,6 +51,7 @@
          echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
          echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
          echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
+          echo "run-agent=${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" >> $GITHUB_OUTPUT
 
  run-lint:
    runs-on: ubuntu-latest
@@ -562,3 +565,73 @@ jobs:
            echo "Eval summary not found for targeted_extract category. Failing CI."
            exit 1
          fi
+
+  run-agent-evals:
+    needs: [run-targeted-extract-evals, determine-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 90 # Agent evals can be long-running
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+      # Use agent models for agent evals in CI
+      EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-3-7-sonnet-latest"
+      EVAL_TRIAL_COUNT: 2 # Reduce trials for agent evals
+      EVAL_MAX_CONCURRENCY: 10 # Lower concurrency for agent evals
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Check for 'agent' label
+        id: label-check
+        run: |
+          if [ "${{ needs.determine-evals.outputs.run-agent }}" != "true" ]; then
+            echo "has_label=false" >> $GITHUB_OUTPUT
+            echo "No label for AGENT. Exiting with success."
+          else
+            echo "has_label=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Set up Node.js
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: |
+          rm -rf node_modules
+          npm i -g pnpm
+          pnpm install --no-frozen-lockfile
+
+      - name: Build Stagehand
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: pnpm run build
+
+      - name: Run Agent Evals
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: pnpm run evals category agent
+
+      - name: Log Agent Evals Performance
+        if: needs.determine-evals.outputs.run-agent == 'true'
+        run: |
+          experimentName=$(jq -r '.experimentName' eval-summary.json)
+          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
+          if [ -f eval-summary.json ]; then
+            agent_score=$(jq '.categories.agent' eval-summary.json)
+            echo "Agent category score: $agent_score%"
+            # Lower threshold for agent evals since they're complex
+            if (( $(echo "$agent_score < 50" | bc -l) )); then
+              echo "Agent category score is below 50%. Failing CI."
+              exit 1
+            fi
+          else
+            echo "Eval summary not found for agent category. Failing CI."
+            exit 1
+          fi
```

README.md

Lines changed: 1 addition & 0 deletions

````diff
@@ -125,6 +125,7 @@ pnpm playwright install
pnpm run build
pnpm run example # run the blank script at ./examples/example.ts
pnpm run example 2048 # run the 2048 example at ./examples/2048.ts
+pnpm run evals -man # see evaluation suite options
```

Stagehand is best when you have an API key for an LLM provider and Browserbase credentials. To add these to your project, run:
````

evals/args.ts

Lines changed: 22 additions & 0 deletions

```diff
@@ -15,6 +15,8 @@ const parsedArgs: {
  trials?: number;
  concurrency?: number;
  provider?: string;
+  dataset?: string;
+  max_k?: number;
  leftover: string[];
} = {
  leftover: [],
@@ -39,6 +41,13 @@ for (const arg of rawArgs) {
    }
  } else if (arg.startsWith("provider=")) {
    parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
+  } else if (arg.startsWith("--dataset=")) {
+    parsedArgs.dataset = arg.split("=")[1]?.toLowerCase();
+  } else if (arg.startsWith("max_k=")) {
+    const val = parseInt(arg.split("=")[1], 10);
+    if (!isNaN(val)) {
+      parsedArgs.max_k = val;
+    }
  } else {
    parsedArgs.leftover.push(arg);
  }
@@ -63,6 +72,12 @@ if (parsedArgs.trials !== undefined) {
if (parsedArgs.concurrency !== undefined) {
  process.env.EVAL_MAX_CONCURRENCY = String(parsedArgs.concurrency);
}
+if (parsedArgs.max_k !== undefined) {
+  process.env.EVAL_MAX_K = String(parsedArgs.max_k);
+}
+if (parsedArgs.dataset !== undefined) {
+  process.env.EVAL_DATASET = parsedArgs.dataset;
+}

const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
  ? process.env.EVAL_CATEGORIES.split(",")
@@ -77,6 +92,7 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
    "regression",
    "llm_clients",
    "agent",
+    "external_agent_benchmarks",
  ];

const providerDefault = process.env.EVAL_PROVIDER ?? undefined;
@@ -104,6 +120,12 @@ function buildUsage(detailed = false): string {
${chalk.cyan("provider".padEnd(12))} ${"override LLM provider".padEnd(24)}
             (default ${chalk.dim(providerDefault)}) [${chalk.yellow("OPENAI")}, ${chalk.yellow("ANTHROPIC")}, ${chalk.yellow("GOOGLE")}, ${chalk.yellow("TOGETHER")}, ${chalk.yellow("GROQ")}, ${chalk.yellow("CEREBRAS")}]

+${chalk.cyan("max_k".padEnd(12))} ${"max test cases per dataset".padEnd(24)}
+             (default ${chalk.dim("25")})
+
+${chalk.cyan("--dataset".padEnd(12))} ${"filter dataset for benchmarks".padEnd(24)}
+             (optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}]
+
${chalk.magenta.underline("Positional filters\n")}
category <category_name> one of: ${DEFAULT_EVAL_CATEGORIES.map((c) =>
```
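The new `max_k` and `--dataset` arguments are handed to the suites through the `EVAL_MAX_K` and `EVAL_DATASET` environment variables. How the external benchmark tasks consume those variables is not part of this file's diff, so the following is only a rough sketch under that assumption; the `pickBenchmarkCases` helper and the `source` field are invented for illustration.

```typescript
// Hypothetical consumer of the env vars set by evals/args.ts above; not code from this PR.
const maxK = process.env.EVAL_MAX_K ? parseInt(process.env.EVAL_MAX_K, 10) : 25;
const dataset = process.env.EVAL_DATASET; // "gaia" | "webvoyager" | undefined

function pickBenchmarkCases<T extends { source: string }>(allCases: T[]): T[] {
  // Optionally restrict to one dataset, then cap the run at max_k cases.
  const filtered = dataset
    ? allCases.filter((c) => c.source === dataset)
    : allCases;
  return filtered.slice(0, maxK);
}
```

An invocation would then look something like `pnpm run evals category external_agent_benchmarks --dataset=webvoyager max_k=5`, matching the options added to `buildUsage`.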

evals/core/summary.ts

Lines changed: 69 additions & 0 deletions (new file)

```typescript
import fs from "fs";
import { tasksByName } from "../taskConfig";
import type { SummaryResult } from "@/types/evals";

export const generateSummary = async (
  results: SummaryResult[],
  experimentName: string,
) => {
  const passed = results
    .filter((r) => r.output._success)
    .map((r) => ({
      eval: r.input.name,
      model: r.input.modelName,
      categories: tasksByName[r.input.name].categories,
    }));

  const failed = results
    .filter((r) => !r.output._success)
    .map((r) => ({
      eval: r.input.name,
      model: r.input.modelName,
      categories: tasksByName[r.input.name].categories,
    }));

  const categorySuccessCounts: Record<
    string,
    { total: number; success: number }
  > = {};
  for (const taskName of Object.keys(tasksByName)) {
    const taskCategories = tasksByName[taskName].categories;
    const taskResults = results.filter((r) => r.input.name === taskName);
    const successCount = taskResults.filter((r) => r.output._success).length;

    for (const cat of taskCategories) {
      if (!categorySuccessCounts[cat]) {
        categorySuccessCounts[cat] = { total: 0, success: 0 };
      }
      categorySuccessCounts[cat].total += taskResults.length;
      categorySuccessCounts[cat].success += successCount;
    }
  }

  const categories: Record<string, number> = {};
  for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
    categories[cat] = Math.round((counts.success / counts.total) * 100);
  }

  const models: Record<string, number> = {};
  const allModels = [...new Set(results.map((r) => r.input.modelName))];
  for (const model of allModels) {
    const modelResults = results.filter((r) => r.input.modelName === model);
    const successCount = modelResults.filter((r) => r.output._success).length;
    models[model] = Math.round((successCount / modelResults.length) * 100);
  }

  const formattedSummary = {
    experimentName,
    passed,
    failed,
    categories,
    models,
  };

  fs.writeFileSync(
    "eval-summary.json",
    JSON.stringify(formattedSummary, null, 2),
  );
  console.log("Evaluation summary written to eval-summary.json");
};
```
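For reference, the `eval-summary.json` written by `generateSummary` has roughly the shape sketched below; the `EvalSummary` interface name is invented here, but the fields mirror `formattedSummary` above. The `categories` values are rounded pass percentages, which is what the new `run-agent-evals` CI step reads via `jq '.categories.agent'`.

```typescript
// Invented name for illustration; fields mirror formattedSummary in summary.ts.
interface EvalSummary {
  experimentName: string;
  passed: { eval: string; model: string; categories: string[] }[];
  failed: { eval: string; model: string; categories: string[] }[];
  categories: Record<string, number>; // e.g. { agent: 50 } means 50% of agent-task runs passed
  models: Record<string, number>; // rounded pass rate per model name
}
```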
