Skip to content

Commit ae514f5

Browse files
authored
improve validation criteria of webvoyager/gaia evals (#1038)
# why
Fast follow to add better success criteria.

# what changed
Added an evaluator to the webvoyager / gaia evals.

# test plan
Tested locally.
1 parent be85b19 commit ae514f5

File tree

2 files changed

+21
-19
lines changed

2 files changed

+21
-19
lines changed

evals/tasks/agent/gaia.ts

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { EvalFunction } from "@/types/evals";
2+
import { Evaluator } from "../../evaluator";
23

34
/**
45
* Data-driven GAIA agent eval
@@ -40,7 +41,6 @@ export const gaia: EvalFunction = async ({
4041
logs: logger.getLogs(),
4142
};
4243
}
43-
4444
await stagehand.page.goto(params.web);
4545

4646
const agent = stagehand.agent({
@@ -51,25 +51,23 @@ export const gaia: EvalFunction = async ({
5151

5252
const result = await agent.execute({
5353
instruction: params.ques,
54-
maxSteps: 20,
54+
maxSteps: 50,
5555
});
5656

57-
const message = result?.message || "";
58-
const hasFinal =
59-
typeof message === "string" && /Final Answer\s*:\s*(.+)/i.test(message);
60-
const providedAnswer = hasFinal
61-
? (message.match(/Final Answer\s*:\s*(.+)/i)?.[1] || "").trim()
62-
: "";
63-
6457
const expected = (params as Record<string, unknown>).expected as
6558
| string
6659
| undefined;
67-
const success = expected
68-
? hasFinal && providedAnswer.trim() === expected.trim()
69-
: hasFinal;
60+
const evaluator = new Evaluator(stagehand);
61+
const evalResult = await evaluator.ask({
62+
question: `Did the agent provide the expected answer: "${expected}"?`,
63+
answer: result?.message || "",
64+
screenshot: false,
65+
});
7066

7167
return {
72-
_success: !!success,
68+
_success: evalResult.evaluation === "YES",
69+
reasoning: evalResult.reasoning,
70+
expectedAnswer: expected,
7371
debugUrl,
7472
sessionUrl,
7573
logs: logger.getLogs(),

evals/tasks/agent/webvoyager.ts

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { EvalFunction } from "@/types/evals";
2+
import { Evaluator } from "../../evaluator";
23

34
export const webvoyager: EvalFunction = async ({
45
stagehand,
@@ -34,17 +35,20 @@ export const webvoyager: EvalFunction = async ({
3435
instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}`,
3536
});
3637

37-
const result = await agent.execute({
38+
await agent.execute({
3839
instruction: params.ques,
39-
maxSteps: 20,
40+
maxSteps: 50,
4041
});
4142

42-
const message = result?.message || "";
43-
const success =
44-
typeof message === "string" && /Final Answer\s*:/i.test(message);
43+
const evaluator = new Evaluator(stagehand);
44+
const evalResult = await evaluator.ask({
45+
question: `Did the agent successfully complete this task: "${params.ques}"? Look at the current state of the page to verify if the task was completed successfully.`,
46+
screenshot: true,
47+
});
4548

4649
return {
47-
_success: !!success,
50+
_success: evalResult.evaluation === "YES",
51+
reasoning: evalResult.reasoning,
4852
debugUrl,
4953
sessionUrl,
5054
logs: logger.getLogs(),

0 commit comments

Comments
 (0)