Merged
26 commits
2e3e151
work on adding voyager to evals
filip-michalsky Aug 12, 2025
d272670
add gaia evals
filip-michalsky Aug 12, 2025
1e27e07
Merge branch 'main' into fm/stg-661-add-web-voyager
filip-michalsky Aug 13, 2025
71cfcb0
refactor
filip-michalsky Aug 13, 2025
16e3a52
add sampling of suites
filip-michalsky Aug 13, 2025
0461322
updating evals
filip-michalsky Aug 14, 2025
4aaca65
linting
filip-michalsky Aug 14, 2025
1922e38
Merge branch 'main' into fm/stg-661-add-web-voyager
filip-michalsky Aug 14, 2025
b9b1702
remove logs, small updates
filip-michalsky Aug 15, 2025
22c9fe7
remove logs
filip-michalsky Aug 15, 2025
dcaeb83
revert unwanted change
filip-michalsky Aug 15, 2025
df880ac
more revert
filip-michalsky Aug 15, 2025
087f8cd
load env at root
filip-michalsky Aug 16, 2025
7c1d5a0
add changeset
filip-michalsky Aug 16, 2025
bd7352b
update
filip-michalsky Aug 16, 2025
4be8864
update ci
filip-michalsky Aug 16, 2025
b78d824
ci update
filip-michalsky Aug 16, 2025
70a7b7c
Merge main branch
filip-michalsky Aug 21, 2025
9a7c057
Merge main branch
filip-michalsky Aug 21, 2025
f9817c6
Merge fm/stg-670-add-agent-to-ci into fm/stg-661-add-web-voyager
filip-michalsky Aug 21, 2025
c04a226
Update .github/workflows/ci.yml
filip-michalsky Aug 27, 2025
bc85c12
lint
filip-michalsky Aug 27, 2025
5081d3b
exclude gaia and voyager from agent ci
filip-michalsky Aug 27, 2025
ce6abd4
update stagehandInitType to send eval inputs in EvalInput
filip-michalsky Aug 27, 2025
f5abf91
add external agent benchmarks as a category to CI
filip-michalsky Aug 27, 2025
51246f6
Update .github/workflows/ci.yml
filip-michalsky Aug 27, 2025
5 changes: 5 additions & 0 deletions .changeset/chilly-laws-smile.md
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

add webvoyager evals
79 changes: 78 additions & 1 deletion .github/workflows/ci.yml
@@ -12,9 +12,13 @@ on:

env:
EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-3-5-sonnet-latest"
EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract"
EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent"
EVAL_MAX_CONCURRENCY: 25
EVAL_TRIAL_COUNT: 5
# Agent eval sampling configuration
EVAL_WEBVOYAGER_SAMPLE: 3 # Random sample of 3 cases from WebVoyager dataset
EVAL_GAIA_SAMPLE: 3 # Random sample of 3 cases from GAIA dataset
EVAL_GAIA_LEVEL: 1 # GAIA difficulty level (1=easiest)

concurrency:
group: ${{ github.ref }}
@@ -29,6 +33,7 @@ jobs:
run-act: ${{ steps.check-labels.outputs.run-act }}
run-observe: ${{ steps.check-labels.outputs.run-observe }}
run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
run-agent: ${{ steps.check-labels.outputs.run-agent }}
steps:
- id: check-labels
run: |
@@ -40,6 +45,7 @@
echo "run-act=true" >> $GITHUB_OUTPUT
echo "run-observe=true" >> $GITHUB_OUTPUT
echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
echo "run-agent=true" >> $GITHUB_OUTPUT
exit 0
fi

@@ -49,6 +55,7 @@
echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
echo "run-agent=${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" >> $GITHUB_OUTPUT

run-lint:
runs-on: ubuntu-latest
@@ -562,3 +569,73 @@ jobs:
echo "Eval summary not found for targeted_extract category. Failing CI."
exit 1
fi

run-agent-evals:
needs: [run-targeted-extract-evals, determine-evals]
runs-on: ubuntu-latest
timeout-minutes: 90 # Agent evals can be long-running
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
HEADLESS: true
EVAL_ENV: browserbase
# Use agent models for agent evals in CI
EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-3-7-sonnet-latest"
EVAL_TRIAL_COUNT: 2 # Reduce trials for agent evals
EVAL_MAX_CONCURRENCY: 10 # Lower concurrency for agent evals
steps:
- name: Check out repository code
uses: actions/checkout@v4

- name: Check for 'agent' label
id: label-check
run: |
if [ "${{ needs.determine-evals.outputs.run-agent }}" != "true" ]; then
echo "has_label=false" >> $GITHUB_OUTPUT
echo "No label for AGENT. Exiting with success."
else
echo "has_label=true" >> $GITHUB_OUTPUT
fi

- name: Set up Node.js
if: needs.determine-evals.outputs.run-agent == 'true'
uses: actions/setup-node@v4
with:
node-version: "20"

- name: Install dependencies
if: needs.determine-evals.outputs.run-agent == 'true'
run: |
rm -rf node_modules
npm i -g pnpm
pnpm install --no-frozen-lockfile

- name: Build Stagehand
if: needs.determine-evals.outputs.run-agent == 'true'
run: pnpm run build

- name: Run Agent Evals
if: needs.determine-evals.outputs.run-agent == 'true'
run: pnpm run evals category agent

- name: Log Agent Evals Performance
if: needs.determine-evals.outputs.run-agent == 'true'
run: |
experimentName=$(jq -r '.experimentName' eval-summary.json)
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
if [ -f eval-summary.json ]; then
agent_score=$(jq '.categories.agent' eval-summary.json)
echo "Agent category score: $agent_score%"
# Lower threshold for agent evals since they're complex
if (( $(echo "$agent_score < 50" | bc -l) )); then
echo "Agent category score is below 50%. Failing CI."
exit 1
fi
else
echo "Eval summary not found for agent category. Failing CI."
exit 1
fi
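
The step above parses eval-summary.json with jq, so it assumes the summary carries per-category success percentages. A rough illustration of that shape, based on the generateSummary implementation in evals/core/summary.ts further down in this diff (the names and numbers below are placeholders, not real results):

```typescript
// Illustrative eval-summary.json contents, written as a TypeScript literal.
// The shape follows evals/core/summary.ts in this PR; the values are made up.
const exampleSummary = {
  experimentName: "example-experiment",
  passed: [
    { eval: "exampleAgentTask", model: "openai/gpt-4.1", categories: ["agent"] },
  ],
  failed: [],
  categories: { agent: 67 }, // percent of successful results per category
  models: { "openai/gpt-4.1": 67 }, // percent of successful results per model
};
```

The `jq '.categories.agent'` call reads the agent percentage from this file, and the job fails if it drops below the relaxed 50% threshold noted in the workflow comment.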
1 change: 1 addition & 0 deletions README.md
@@ -125,6 +125,7 @@ pnpm playwright install
pnpm run build
pnpm run example # run the blank script at ./examples/example.ts
pnpm run example 2048 # run the 2048 example at ./examples/2048.ts
pnpm run evals -man # see evaluation suite options
```

Stagehand is best when you have an API key for an LLM provider and Browserbase credentials. To add these to your project, run:
22 changes: 22 additions & 0 deletions evals/args.ts
@@ -15,6 +15,8 @@ const parsedArgs: {
trials?: number;
concurrency?: number;
provider?: string;
dataset?: string;
max_k?: number;
leftover: string[];
} = {
leftover: [],
@@ -39,6 +41,13 @@ for (const arg of rawArgs) {
}
} else if (arg.startsWith("provider=")) {
parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
} else if (arg.startsWith("--dataset=")) {
parsedArgs.dataset = arg.split("=")[1]?.toLowerCase();
} else if (arg.startsWith("max_k=")) {
Collaborator review comment: note for later: we should make this arg a bit more intuitive (along the lines of a max number of evals or something similar).

const val = parseInt(arg.split("=")[1], 10);
if (!isNaN(val)) {
parsedArgs.max_k = val;
}
} else {
parsedArgs.leftover.push(arg);
}
@@ -63,6 +72,12 @@ if (parsedArgs.trials !== undefined) {
if (parsedArgs.concurrency !== undefined) {
process.env.EVAL_MAX_CONCURRENCY = String(parsedArgs.concurrency);
}
if (parsedArgs.max_k !== undefined) {
process.env.EVAL_MAX_K = String(parsedArgs.max_k);
}
if (parsedArgs.dataset !== undefined) {
process.env.EVAL_DATASET = parsedArgs.dataset;
}

const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
? process.env.EVAL_CATEGORIES.split(",")
@@ -77,6 +92,7 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
"regression",
"llm_clients",
"agent",
"external_agent_benchmarks",
];

const providerDefault = process.env.EVAL_PROVIDER ?? undefined;
@@ -104,6 +120,12 @@ function buildUsage(detailed = false): string {
${chalk.cyan("provider".padEnd(12))} ${"override LLM provider".padEnd(24)}
(default ${chalk.dim(providerDefault)}) [${chalk.yellow("OPENAI")}, ${chalk.yellow("ANTHROPIC")}, ${chalk.yellow("GOOGLE")}, ${chalk.yellow("TOGETHER")}, ${chalk.yellow("GROQ")}, ${chalk.yellow("CEREBRAS")}]

${chalk.cyan("max_k".padEnd(12))} ${"max test cases per dataset".padEnd(24)}
(default ${chalk.dim("25")})

${chalk.cyan("--dataset".padEnd(12))} ${"filter dataset for benchmarks".padEnd(24)}
(optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}]


${chalk.magenta.underline("Positional filters\n")}
category <category_name> one of: ${DEFAULT_EVAL_CATEGORIES.map((c) =>
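
The usage text above documents the new `max_k` argument (a per-dataset cap on test cases, default 25) and the `--dataset` filter (gaia or webvoyager), which args.ts forwards as the EVAL_MAX_K and EVAL_DATASET environment variables. A minimal sketch of how a benchmark loader might consume them, using hypothetical helpers that are not identifiers from this PR:

```typescript
// Sketch only: read the env vars that args.ts sets above and cap the number of
// benchmark cases. `BenchmarkCase` and `sampleCases` are illustrative names.
interface BenchmarkCase {
  id: string;
  task: string;
}

function sampleCases(cases: BenchmarkCase[], maxK: number): BenchmarkCase[] {
  // Shuffle, then keep at most maxK cases (a simple random sample).
  const shuffled = [...cases].sort(() => Math.random() - 0.5);
  return shuffled.slice(0, maxK);
}

const maxK = parseInt(process.env.EVAL_MAX_K ?? "25", 10); // 25 mirrors the documented default
const dataset = process.env.EVAL_DATASET; // "gaia" | "webvoyager" | undefined
```

The CI workflow additionally sets EVAL_WEBVOYAGER_SAMPLE, EVAL_GAIA_SAMPLE, and EVAL_GAIA_LEVEL, which presumably feed the same sampling path for the external benchmarks.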
69 changes: 69 additions & 0 deletions evals/core/summary.ts
@@ -0,0 +1,69 @@
import fs from "fs";
import { tasksByName } from "../taskConfig";
import type { SummaryResult } from "@/types/evals";

export const generateSummary = async (
results: SummaryResult[],
experimentName: string,
) => {
const passed = results
.filter((r) => r.output._success)
.map((r) => ({
eval: r.input.name,
model: r.input.modelName,
categories: tasksByName[r.input.name].categories,
}));

const failed = results
.filter((r) => !r.output._success)
.map((r) => ({
eval: r.input.name,
model: r.input.modelName,
categories: tasksByName[r.input.name].categories,
}));

const categorySuccessCounts: Record<
string,
{ total: number; success: number }
> = {};
for (const taskName of Object.keys(tasksByName)) {
const taskCategories = tasksByName[taskName].categories;
const taskResults = results.filter((r) => r.input.name === taskName);
const successCount = taskResults.filter((r) => r.output._success).length;

for (const cat of taskCategories) {
if (!categorySuccessCounts[cat]) {
categorySuccessCounts[cat] = { total: 0, success: 0 };
}
categorySuccessCounts[cat].total += taskResults.length;
categorySuccessCounts[cat].success += successCount;
}
}

const categories: Record<string, number> = {};
for (const [cat, counts] of Object.entries(categorySuccessCounts)) {
categories[cat] = Math.round((counts.success / counts.total) * 100);
}

const models: Record<string, number> = {};
const allModels = [...new Set(results.map((r) => r.input.modelName))];
for (const model of allModels) {
const modelResults = results.filter((r) => r.input.modelName === model);
const successCount = modelResults.filter((r) => r.output._success).length;
models[model] = Math.round((successCount / modelResults.length) * 100);
}

const formattedSummary = {
experimentName,
passed,
failed,
categories,
models,
};

fs.writeFileSync(
"eval-summary.json",
JSON.stringify(formattedSummary, null, 2),
);
console.log("Evaluation summary written to eval-summary.json");
};
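
A hypothetical call site for this helper, assuming the eval runner collects SummaryResult objects and a Braintrust experiment name (import paths inferred from this diff's layout):

```typescript
import type { SummaryResult } from "@/types/evals";
import { generateSummary } from "./core/summary"; // relative to evals/, assumed

// After a run completes, write eval-summary.json so the CI steps above can
// parse category and model success rates with jq.
export async function reportRun(
  results: SummaryResult[],
  experimentName: string,
): Promise<void> {
  await generateSummary(results, experimentName);
}
```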