improve validation criteria of webvoyager/gaia evals (#1038)

tkattkat · web-flow · commit ae514f5e24e7 · 2025-08-28T14:01:22.000-07:00
# why

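Fast follow to add better success criteria: the previous checks only looked for a "Final Answer:" line in the agent's message (plus an exact string match against the expected answer for GAIA).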

# what changed

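Added the `Evaluator` to the webvoyager and gaia evals to judge success, replacing the "Final Answer:" regex checks, and raised `maxSteps` from 20 to 50 in both evals.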

# test plan
Tested locally.
diff --git a/evals/tasks/agent/gaia.ts b/evals/tasks/agent/gaia.ts
@@ -1,4 +1,5 @@
 import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "../../evaluator";
 
 /**
  * Data-driven GAIA agent eval
@@ -40,7 +41,6 @@ export const gaia: EvalFunction = async ({
         logs: logger.getLogs(),
       };
     }
-
     await stagehand.page.goto(params.web);
 
     const agent = stagehand.agent({
@@ -51,25 +51,23 @@ export const gaia: EvalFunction = async ({
 
     const result = await agent.execute({
       instruction: params.ques,
-      maxSteps: 20,
+      maxSteps: 50,
     });
 
-    const message = result?.message || "";
-    const hasFinal =
-      typeof message === "string" && /Final Answer\s*:\s*(.+)/i.test(message);
-    const providedAnswer = hasFinal
-      ? (message.match(/Final Answer\s*:\s*(.+)/i)?.[1] || "").trim()
-      : "";
-
     const expected = (params as Record<string, unknown>).expected as
       | string
       | undefined;
-    const success = expected
-      ? hasFinal && providedAnswer.trim() === expected.trim()
-      : hasFinal;
+    const evaluator = new Evaluator(stagehand);
+    const evalResult = await evaluator.ask({
+      question: `Did the agent provide the expected answer: "${expected}"?`,
+      answer: result?.message || "",
+      screenshot: false,
+    });
 
     return {
-      _success: !!success,
+      _success: evalResult.evaluation === "YES",
+      reasoning: evalResult.reasoning,
+      expectedAnswer: expected,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
diff --git a/evals/tasks/agent/webvoyager.ts b/evals/tasks/agent/webvoyager.ts
@@ -1,4 +1,5 @@
 import { EvalFunction } from "@/types/evals";
+import { Evaluator } from "../../evaluator";
 
 export const webvoyager: EvalFunction = async ({
   stagehand,
@@ -34,17 +35,20 @@ export const webvoyager: EvalFunction = async ({
       instructions: `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await stagehand.page.title()}`,
     });
 
-    const result = await agent.execute({
+    await agent.execute({
       instruction: params.ques,
-      maxSteps: 20,
+      maxSteps: 50,
     });
 
-    const message = result?.message || "";
-    const success =
-      typeof message === "string" && /Final Answer\s*:/i.test(message);
+    const evaluator = new Evaluator(stagehand);
+    const evalResult = await evaluator.ask({
+      question: `Did the agent successfully complete this task: "${params.ques}"? Look at the current state of the page to verify if the task was completed successfully.`,
+      screenshot: true,
+    });
 
     return {
-      _success: !!success,
+      _success: evalResult.evaluation === "YES",
+      reasoning: evalResult.reasoning,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),