@@ -103,89 +103,195 @@ function buildUsage(detailed = false): string {
  `pnpm run evals [key=value]… [category <name>] | name=<evalName>`,
  );

+  const examplesSection = `
+    ${chalk.magenta.underline("Examples")}
+
+    ${chalk.dim("# Run every evaluation locally with default settings")}
+    ${chalk.green("pnpm run evals")}
+
+    ${chalk.dim("# Same as above but in Browserbase with three trials")}
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("trials=")}${chalk.yellow("3")}
+
+    ${chalk.dim("# Run evals using the Stagehand API")}
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("api=")}${chalk.yellow("true")}
+
+    ${chalk.dim("# Run evals from only the 'act' category with a max of 4 running at any given time")}
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan("concurrency=")}${chalk.yellow("4")}
+
+    ${chalk.dim("# Execute a specific eval by filename")}
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
+  `;
+
  const body = dedent`
    ${chalk.magenta.underline("Keys\n")}
    ${chalk.cyan("env".padEnd(12))} ${"target environment".padEnd(24)}
-                (default ${chalk.dim("LOCAL")}) [${chalk.yellow("BROWSERBASE")}, ${chalk.yellow("LOCAL")}] ${chalk.gray("← LOCAL sets api=false")}
+                (default ${chalk.dim("LOCAL")}) [${chalk.yellow("browserbase")}, ${chalk.yellow("local")}]

    ${chalk.cyan("api".padEnd(12))} ${"use the Stagehand API".padEnd(24)}
-                (default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]
+                (default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]

-    ${chalk.cyan("trials".padEnd(12))} ${"number of trials".padEnd(24)}
-                (default ${chalk.dim("10")})
+    ${chalk.cyan("trials".padEnd(12))} ${"number of trials per task".padEnd(24)}
+                (default ${chalk.dim("3")})

    ${chalk.cyan("concurrency".padEnd(12))} ${"max parallel sessions".padEnd(24)}
-                (default ${chalk.dim("10")})
+                (default ${chalk.dim("3")})

    ${chalk.cyan("provider".padEnd(12))} ${"override LLM provider".padEnd(24)}
-                (default ${chalk.dim(providerDefault)}) [${chalk.yellow("OPENAI")}, ${chalk.yellow("ANTHROPIC")}, ${chalk.yellow("GOOGLE")}, ${chalk.yellow("TOGETHER")}, ${chalk.yellow("GROQ")}, ${chalk.yellow("CEREBRAS")}]
+                (default ${chalk.dim(providerDefault || "varies by model")}) [${chalk.yellow("openai")}, ${chalk.yellow("anthropic")}, ${chalk.yellow("google")}, ${chalk.yellow("together")}, ${chalk.yellow("groq")}, ${chalk.yellow("cerebras")}]

    ${chalk.cyan("max_k".padEnd(12))} ${"max test cases per dataset".padEnd(24)}
                 (default ${chalk.dim("25")})

-    ${chalk.cyan("--dataset".padEnd(12))} ${"filter dataset for benchmarks".padEnd(24)}
-                (optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}]
+    ${chalk.cyan("--dataset".padEnd(12))} ${"filter to specific benchmark".padEnd(24)}
+                (optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}, ${chalk.yellow("webbench")}, ${chalk.yellow("osworld")}, ${chalk.yellow("onlineMind2Web")}]


    ${chalk.magenta.underline("Positional filters\n")}
-    category <category_name> one of: ${DEFAULT_EVAL_CATEGORIES.map((c) =>
-      chalk.yellow(c),
-    ).join(", ")}
+
+    category <category_name>
+
+    ${chalk.gray("Available categories:")}
+    ${DEFAULT_EVAL_CATEGORIES.slice(0, 5)
+      .map((c) => chalk.yellow(c))
+      .join(", ")},
+    ${DEFAULT_EVAL_CATEGORIES.slice(5, 10)
+      .map((c) => chalk.yellow(c))
+      .join(", ")}${DEFAULT_EVAL_CATEGORIES.slice(10).length > 0 ? "," : ""}
+    ${DEFAULT_EVAL_CATEGORIES.slice(10)
+      .map((c) => chalk.yellow(c))
+      .join(", ")}
+  `;

-    ${chalk.magenta.underline("\nExamples")}
+  if (!detailed)
+    return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
+      "pnpm run evals -man\n",
+    )}`;
+
+  const externalBenchmarksSection = dedent`
+    ${chalk.magenta.underline("\nExternal Benchmarks\n")}
+
+    ${chalk.cyan.bold("WebBench")} - 5,607 real-world web automation tasks across 452 live websites
+
+    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webbench")}
+
-    ${chalk.dim("# Run every evaluation locally with default settings")}
+    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webbench pnpm run evals")}
+
-    ${chalk.green("pnpm run evals")}
+    ${chalk.gray("Environment Variables:")}
+
+      EVAL_WEBBENCH_LIMIT        max tasks to run (default: 25)
+      EVAL_WEBBENCH_SAMPLE       random sample count before limit
+      EVAL_WEBBENCH_DIFFICULTY   filter: [${chalk.yellow("easy")}, ${chalk.yellow("hard")}] (254 easy, 61 hard tasks)
+      EVAL_WEBBENCH_CATEGORY     filter: [${chalk.yellow("READ")}, ${chalk.yellow("CREATE")}, ${chalk.yellow("UPDATE")}, ${chalk.yellow("DELETE")}, ${chalk.yellow("FILE_MANIPULATION")}]
+      EVAL_WEBBENCH_USE_HITL     use only HITL dataset with difficulty ratings (true/false)
+
-    ${chalk.dim("# Same as above but in Browserbase with three trials")}
+    ${chalk.dim("Examples:")}
+
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
-      "trials=",
-    )}${chalk.yellow("3")}
+    ${chalk.green("EVAL_WEBBENCH_DIFFICULTY=easy EVAL_WEBBENCH_LIMIT=10 pnpm run evals name=agent/webbench")}
+
+    ${chalk.green("EVAL_DATASET=webbench EVAL_WEBBENCH_CATEGORY=READ pnpm run evals")}
+
+
+    ${chalk.cyan.bold("GAIA")} - General AI Assistant benchmark for complex reasoning
+
+    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/gaia")}
+
-    ${chalk.dim("# Run evals using the Stagehand API")}
+    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=gaia pnpm run evals")}
+
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
-      "api=",
-    )}${chalk.yellow("true")}
-
-
-    ${chalk.dim(
-      "# Run evals from only the 'act' category with a max of 4 running at any given time",
-    )}
+    ${chalk.gray("Environment Variables:")}
+
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan(
-      "concurrency=",
-    )}${chalk.yellow("4")}
-
-
-    ${chalk.dim("# Execute a specific eval by filename")}
+      EVAL_GAIA_LIMIT    max tasks to run (default: 25)
+      EVAL_GAIA_SAMPLE   random sample count before limit
+      EVAL_GAIA_LEVEL    filter by difficulty level [${chalk.yellow("1")}, ${chalk.yellow("2")}, ${chalk.yellow("3")}]
+
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
+    ${chalk.dim("Example:")}
+
+    ${chalk.green("EVAL_GAIA_LEVEL=1 EVAL_GAIA_LIMIT=10 pnpm run evals name=agent/gaia")}
+
+
+    ${chalk.cyan.bold("WebVoyager")} - Web navigation and task completion benchmark
+
+    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webvoyager")}
+
+    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webvoyager pnpm run evals")}
+
+    ${chalk.gray("Environment Variables:")}
+
+      EVAL_WEBVOYAGER_LIMIT    max tasks to run (default: 25)
+      EVAL_WEBVOYAGER_SAMPLE   random sample count before limit
+
+    ${chalk.gray("Ground Truth Evaluation:")}
+
+      WebVoyager uses ground truth answers for improved accuracy:
+      • Checks agent's "Final Answer:" against reference answers
+      • Supports golden (ideal) and possible (acceptable) answers
+      • Falls back to screenshot evaluation when uncertain
+      • Reference data: evals/datasets/webvoyager/reference-answers.json
+
+    ${chalk.dim("Example:")}
+
+    ${chalk.green("EVAL_WEBVOYAGER_SAMPLE=50 EVAL_WEBVOYAGER_LIMIT=10 pnpm run evals name=agent/webvoyager")}
+
+
+    ${chalk.cyan.bold("OSWorld")} - Chrome browser automation tasks from the OSWorld benchmark
+
+    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/osworld")}
+
+    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=osworld pnpm run evals")}
+
+    ${chalk.gray("Environment Variables:")}
+
+      EVAL_OSWORLD_LIMIT             max tasks to run (default: 25)
+      EVAL_OSWORLD_SAMPLE            random sample count before limit
+      EVAL_OSWORLD_SOURCE            filter by source: [${chalk.yellow("Mind2Web")}, ${chalk.yellow("test_task_1")}, ...]
+      EVAL_OSWORLD_EVALUATION_TYPE   filter by eval type: [${chalk.yellow("url_match")}, ${chalk.yellow("string_match")}, ${chalk.yellow("dom_state")}, ${chalk.yellow("custom")}]
+      EVAL_OSWORLD_TIMEOUT           timeout per task in milliseconds (default: 60000)
+
+    ${chalk.dim("Examples:")}
+
+    ${chalk.green("EVAL_OSWORLD_SOURCE=Mind2Web EVAL_OSWORLD_LIMIT=10 pnpm run evals name=agent/osworld")}
+
+    ${chalk.green("EVAL_DATASET=osworld EVAL_OSWORLD_EVALUATION_TYPE=url_match pnpm run evals")}
+
+
+    ${chalk.cyan.bold("Mind2Web")} - Real-world web interaction tasks for evaluating web agents
+
+    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/onlineMind2Web")}
+
+    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=onlineMind2Web pnpm run evals")}
+
+    ${chalk.gray("Environment Variables:")}
+
+      EVAL_ONLINEMIND2WEB_LIMIT    max tasks to run (default: 25)
+      EVAL_ONLINEMIND2WEB_SAMPLE   random sample count before limit
+
+    ${chalk.dim("Example:")}
+
+    ${chalk.green("EVAL_ONLINEMIND2WEB_SAMPLE=50 EVAL_ONLINEMIND2WEB_LIMIT=10 pnpm run evals name=agent/onlineMind2Web")}
  `;

-  if (!detailed)
-    return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
-      "pnpm run evals -man\n",
-    )}`;
-
  const envSection = dedent`
-    ${chalk.magenta.underline("\nEnvironment variables\n")}
-    EVAL_ENV              overridable via ${chalk.cyan("env=")}
+    ${chalk.magenta.underline("\nGlobal Environment Variables\n")}
+
+    EVAL_ENV              target environment, overridable via ${chalk.cyan("env=")}
+
+    EVAL_TRIAL_COUNT      number of trials, overridable via ${chalk.cyan("trials=")}
+
+    EVAL_MAX_CONCURRENCY  parallel sessions, overridable via ${chalk.cyan("concurrency=")}
+
+    EVAL_PROVIDER         LLM provider, overridable via ${chalk.cyan("provider=")}
+
+    EVAL_MAX_K            global limit for all benchmarks (overrides individual limits)

-    EVAL_TRIAL_COUNT      overridable via ${chalk.cyan("trials=")}
+    EVAL_DATASET          filter to specific benchmark, overridable via ${chalk.cyan("--dataset=")}

-    EVAL_MAX_CONCURRENCY  overridable via ${chalk.cyan("concurrency=")}
+    USE_API               use Stagehand API, overridable via ${chalk.cyan("api=")}

-    EVAL_PROVIDER         overridable via ${chalk.cyan("provider=")}
+    EVAL_MODELS           comma-separated list of models to use

-    USE_API               overridable via ${chalk.cyan("api=true")}
+    AGENT_EVAL_MAX_STEPS  max steps for agent tasks (default: 50)
  `;

-  return `${header}\n\n${synopsis}\n\n${body}\n${envSection}\n`;
+  return `${header}\n\n${synopsis}\n\n${body}\n${examplesSection}\n${externalBenchmarksSection}\n${envSection}\n`;
}

const wantsHelp = rawArgs.some((a) => HELP_REGEX.test(a));
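
For context, here is a minimal sketch (not part of this commit) of how the `key=value` overrides documented in the usage text above might be mapped onto the environment variables it lists. The helper name `applyCliOverrides` and the exact key-to-variable mapping are assumptions for illustration only; the actual runner may parse arguments differently.

```typescript
// Hypothetical sketch: translate CLI key=value pairs (e.g. "env=browserbase",
// "trials=3") into the environment variables the evals runner reads.
// The mapping below mirrors the help text; names are illustrative assumptions.
const KEY_TO_ENV: Record<string, string> = {
  env: "EVAL_ENV",
  api: "USE_API",
  trials: "EVAL_TRIAL_COUNT",
  concurrency: "EVAL_MAX_CONCURRENCY",
  provider: "EVAL_PROVIDER",
  max_k: "EVAL_MAX_K",
};

function applyCliOverrides(args: string[]): void {
  for (const arg of args) {
    const eq = arg.indexOf("=");
    if (eq === -1) continue; // positional filters like "category act" are handled elsewhere
    const key = arg.slice(0, eq);
    const value = arg.slice(eq + 1);
    const envName = KEY_TO_ENV[key];
    if (envName) process.env[envName] = value;
  }
}

// e.g. `pnpm run evals env=browserbase trials=3` would end up setting
// EVAL_ENV=browserbase and EVAL_TRIAL_COUNT=3 before the runner starts.
applyCliOverrides(process.argv.slice(2));
```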