
Commit 1788ee9

Add onlineMind2web evals (#1059)
# why

To increase the number of relevant agent benchmarks

# what changed

Added OnlineMind2Web eval set for web agents

# test plan

Tests run locally and on Browserbase
1 parent 6ae809d commit 1788ee9
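
For quick reference, the updated help text in the diff below documents these ways to run the new eval set. The commands are copied from that help text; the sample/limit values in the last line are the documented example values, not requirements:

# Run the OnlineMind2Web agent eval directly by name
pnpm run evals name=agent/onlineMind2Web

# Or select it via the dataset filter
EVAL_DATASET=onlineMind2Web pnpm run evals

# Sample 50 tasks, then cap the run at 10
EVAL_ONLINEMIND2WEB_SAMPLE=50 EVAL_ONLINEMIND2WEB_LIMIT=10 pnpm run evals name=agent/onlineMind2Web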

File tree

6 files changed (+660, -46 lines)

evals/args.ts

Lines changed: 152 additions & 46 deletions
@@ -103,89 +103,195 @@ function buildUsage(detailed = false): string {
 `pnpm run evals [key=value]… [category <name>] | name=<evalName>`,
 );

+const examplesSection = `
+${chalk.magenta.underline("Examples")}
+
+${chalk.dim("# Run every evaluation locally with default settings")}
+${chalk.green("pnpm run evals")}
+
+${chalk.dim("# Same as above but in Browserbase with three trials")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("trials=")}${chalk.yellow("3")}
+
+${chalk.dim("# Run evals using the Stagehand API")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("api=")}${chalk.yellow("true")}
+
+${chalk.dim("# Run evals from only the 'act' category with a max of 4 running at any given time")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan("concurrency=")}${chalk.yellow("4")}
+
+${chalk.dim("# Execute a specific eval by filename")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
+`;
+
 const body = dedent`
 ${chalk.magenta.underline("Keys\n")}
 ${chalk.cyan("env".padEnd(12))} ${"target environment".padEnd(24)}
-(default ${chalk.dim("LOCAL")}) [${chalk.yellow("BROWSERBASE")}, ${chalk.yellow("LOCAL")}] ${chalk.gray("← LOCAL sets api=false")}
+(default ${chalk.dim("LOCAL")}) [${chalk.yellow("browserbase")}, ${chalk.yellow("local")}]

 ${chalk.cyan("api".padEnd(12))} ${"use the Stagehand API".padEnd(24)}
-(default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]
+(default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]

-${chalk.cyan("trials".padEnd(12))} ${"number of trials".padEnd(24)}
-(default ${chalk.dim("10")})
+${chalk.cyan("trials".padEnd(12))} ${"number of trials per task".padEnd(24)}
+(default ${chalk.dim("3")})

 ${chalk.cyan("concurrency".padEnd(12))} ${"max parallel sessions".padEnd(24)}
-(default ${chalk.dim("10")})
+(default ${chalk.dim("3")})

 ${chalk.cyan("provider".padEnd(12))} ${"override LLM provider".padEnd(24)}
-(default ${chalk.dim(providerDefault)}) [${chalk.yellow("OPENAI")}, ${chalk.yellow("ANTHROPIC")}, ${chalk.yellow("GOOGLE")}, ${chalk.yellow("TOGETHER")}, ${chalk.yellow("GROQ")}, ${chalk.yellow("CEREBRAS")}]
+(default ${chalk.dim(providerDefault || "varies by model")}) [${chalk.yellow("openai")}, ${chalk.yellow("anthropic")}, ${chalk.yellow("google")}, ${chalk.yellow("together")}, ${chalk.yellow("groq")}, ${chalk.yellow("cerebras")}]

 ${chalk.cyan("max_k".padEnd(12))} ${"max test cases per dataset".padEnd(24)}
 (default ${chalk.dim("25")})

-${chalk.cyan("--dataset".padEnd(12))} ${"filter dataset for benchmarks".padEnd(24)}
-(optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}]
+${chalk.cyan("--dataset".padEnd(12))} ${"filter to specific benchmark".padEnd(24)}
+(optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}, ${chalk.yellow("webbench")}, ${chalk.yellow("osworld")}, ${chalk.yellow("onlineMind2Web")}]


 ${chalk.magenta.underline("Positional filters\n")}
-category <category_name> one of: ${DEFAULT_EVAL_CATEGORIES.map((c) =>
-chalk.yellow(c),
-).join(", ")}
+
+category <category_name>
+
+${chalk.gray("Available categories:")}
+${DEFAULT_EVAL_CATEGORIES.slice(0, 5)
+.map((c) => chalk.yellow(c))
+.join(", ")},
+${DEFAULT_EVAL_CATEGORIES.slice(5, 10)
+.map((c) => chalk.yellow(c))
+.join(", ")}${DEFAULT_EVAL_CATEGORIES.slice(10).length > 0 ? "," : ""}
+${DEFAULT_EVAL_CATEGORIES.slice(10)
+.map((c) => chalk.yellow(c))
+.join(", ")}
+`;

-${chalk.magenta.underline("\nExamples")}
+if (!detailed)
+return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
+"pnpm run evals -man\n",
+)}`;
+
+const externalBenchmarksSection = dedent`
+${chalk.magenta.underline("\nExternal Benchmarks\n")}
+
+${chalk.cyan.bold("WebBench")} - 5,607 real-world web automation tasks across 452 live websites
+
+${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webbench")}

-${chalk.dim("# Run every evaluation locally with default settings")}
+${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webbench pnpm run evals")}

-${chalk.green("pnpm run evals")}
+${chalk.gray("Environment Variables:")}

+EVAL_WEBBENCH_LIMIT max tasks to run (default: 25)
+EVAL_WEBBENCH_SAMPLE random sample count before limit
+EVAL_WEBBENCH_DIFFICULTY filter: [${chalk.yellow("easy")}, ${chalk.yellow("hard")}] (254 easy, 61 hard tasks)
+EVAL_WEBBENCH_CATEGORY filter: [${chalk.yellow("READ")}, ${chalk.yellow("CREATE")}, ${chalk.yellow("UPDATE")}, ${chalk.yellow("DELETE")}, ${chalk.yellow("FILE_MANIPULATION")}]
+EVAL_WEBBENCH_USE_HITL use only HITL dataset with difficulty ratings (true/false)

-${chalk.dim("# Same as above but in Browserbase with three trials")}
+${chalk.dim("Examples:")}

-${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
-"trials=",
-)}${chalk.yellow("3")}
+${chalk.green("EVAL_WEBBENCH_DIFFICULTY=easy EVAL_WEBBENCH_LIMIT=10 pnpm run evals name=agent/webbench")}

+${chalk.green("EVAL_DATASET=webbench EVAL_WEBBENCH_CATEGORY=READ pnpm run evals")}
+
+
+${chalk.cyan.bold("GAIA")} - General AI Assistant benchmark for complex reasoning
+
+${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/gaia")}

-${chalk.dim("# Run evals using the Stagehand API")}
+${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=gaia pnpm run evals")}

-${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
-"api=",
-)}${chalk.yellow("true")}
-
-
-${chalk.dim(
-"# Run evals from only the 'act' category with a max of 4 running at any given time",
-)}
+${chalk.gray("Environment Variables:")}

-${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan(
-"concurrency=",
-)}${chalk.yellow("4")}
-
-
-${chalk.dim("# Execute a specific eval by filename")}
+EVAL_GAIA_LIMIT max tasks to run (default: 25)
+EVAL_GAIA_SAMPLE random sample count before limit
+EVAL_GAIA_LEVEL filter by difficulty level [${chalk.yellow("1")}, ${chalk.yellow("2")}, ${chalk.yellow("3")}]

-${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
+${chalk.dim("Example:")}
+
+${chalk.green("EVAL_GAIA_LEVEL=1 EVAL_GAIA_LIMIT=10 pnpm run evals name=agent/gaia")}
+
+
+${chalk.cyan.bold("WebVoyager")} - Web navigation and task completion benchmark
+
+${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webvoyager")}
+
+${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webvoyager pnpm run evals")}
+
+${chalk.gray("Environment Variables:")}
+
+EVAL_WEBVOYAGER_LIMIT max tasks to run (default: 25)
+EVAL_WEBVOYAGER_SAMPLE random sample count before limit
+
+${chalk.gray("Ground Truth Evaluation:")}
+
+WebVoyager uses ground truth answers for improved accuracy:
+• Checks agent's "Final Answer:" against reference answers
+• Supports golden (ideal) and possible (acceptable) answers
+• Falls back to screenshot evaluation when uncertain
+• Reference data: evals/datasets/webvoyager/reference-answers.json
+
+${chalk.dim("Example:")}
+
+${chalk.green("EVAL_WEBVOYAGER_SAMPLE=50 EVAL_WEBVOYAGER_LIMIT=10 pnpm run evals name=agent/webvoyager")}
+
+
+${chalk.cyan.bold("OSWorld")} - Chrome browser automation tasks from the OSWorld benchmark
+
+${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/osworld")}
+
+${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=osworld pnpm run evals")}
+
+${chalk.gray("Environment Variables:")}
+
+EVAL_OSWORLD_LIMIT max tasks to run (default: 25)
+EVAL_OSWORLD_SAMPLE random sample count before limit
+EVAL_OSWORLD_SOURCE filter by source: [${chalk.yellow("Mind2Web")}, ${chalk.yellow("test_task_1")}, ...]
+EVAL_OSWORLD_EVALUATION_TYPE filter by eval type: [${chalk.yellow("url_match")}, ${chalk.yellow("string_match")}, ${chalk.yellow("dom_state")}, ${chalk.yellow("custom")}]
+EVAL_OSWORLD_TIMEOUT timeout per task in milliseconds (default: 60000)
+
+${chalk.dim("Examples:")}
+
+${chalk.green("EVAL_OSWORLD_SOURCE=Mind2Web EVAL_OSWORLD_LIMIT=10 pnpm run evals name=agent/osworld")}
+
+${chalk.green("EVAL_DATASET=osworld EVAL_OSWORLD_EVALUATION_TYPE=url_match pnpm run evals")}
+
+
+${chalk.cyan.bold("Mind2Web")} - Real-world web interaction tasks for evaluating web agents
+
+${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/onlineMind2Web")}
+
+${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=onlineMind2Web pnpm run evals")}
+
+${chalk.gray("Environment Variables:")}
+
+EVAL_ONLINEMIND2WEB_LIMIT max tasks to run (default: 25)
+EVAL_ONLINEMIND2WEB_SAMPLE random sample count before limit
+
+${chalk.dim("Example:")}
+
+${chalk.green("EVAL_ONLINEMIND2WEB_SAMPLE=50 EVAL_ONLINEMIND2WEB_LIMIT=10 pnpm run evals name=agent/onlineMind2Web")}
 `;

-if (!detailed)
-return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
-"pnpm run evals -man\n",
-)}`;
-
 const envSection = dedent`
-${chalk.magenta.underline("\nEnvironment variables\n")}
-EVAL_ENV overridable via ${chalk.cyan("env=")}
+${chalk.magenta.underline("\nGlobal Environment Variables\n")}
+
+EVAL_ENV target environment, overridable via ${chalk.cyan("env=")}
+
+EVAL_TRIAL_COUNT number of trials, overridable via ${chalk.cyan("trials=")}
+
+EVAL_MAX_CONCURRENCY parallel sessions, overridable via ${chalk.cyan("concurrency=")}
+
+EVAL_PROVIDER LLM provider, overridable via ${chalk.cyan("provider=")}
+
+EVAL_MAX_K global limit for all benchmarks (overrides individual limits)

-EVAL_TRIAL_COUNT overridable via ${chalk.cyan("trials=")}
+EVAL_DATASET filter to specific benchmark, overridable via ${chalk.cyan("--dataset=")}

-EVAL_MAX_CONCURRENCY overridable via ${chalk.cyan("concurrency=")}
+USE_API use Stagehand API, overridable via ${chalk.cyan("api=")}

-EVAL_PROVIDER overridable via ${chalk.cyan("provider=")}
+EVAL_MODELS comma-separated list of models to use

-USE_API overridable via ${chalk.cyan("api=true")}
+AGENT_EVAL_MAX_STEPS max steps for agent tasks (default: 50)
 `;

-return `${header}\n\n${synopsis}\n\n${body}\n${envSection}\n`;
+return `${header}\n\n${synopsis}\n\n${body}\n${examplesSection}\n${externalBenchmarksSection}\n${envSection}\n`;
 }

 const wantsHelp = rawArgs.some((a) => HELP_REGEX.test(a));
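
Taken together, the updated help output suggests the new dataset composes with the existing global key=value flags. A minimal sketch of such a combined invocation, assuming the dataset filter and the global flags can be mixed as documented (the specific values are illustrative):

# Illustrative: run OnlineMind2Web on Browserbase with 3 trials, capped at 10 tasks
EVAL_ONLINEMIND2WEB_LIMIT=10 EVAL_DATASET=onlineMind2Web pnpm run evals env=browserbase trials=3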
