Skip to content

Commit 2633d82

Browse files
tkattkat and miguelg719 authored
add evaluate text to evaluator (#1015)
# why
Adds "evaluateText" to the evaluator.

# what changed
Evals can now use the evaluateText method to check, based on the LLM's messages, whether the agent came across the information required by the eval. This removes the dependency on extract within evals and makes it easier to evaluate within the WebVoyager / GAIA evals once they are implemented.

# test plan
Tested locally.

---------

Co-authored-by: Miguel <[email protected]>
1 parent 5d668b8 commit 2633d82

24 files changed

+226
-169
lines changed

evals/evals.config.json

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -515,10 +515,6 @@
515515
{
516516
"name": "agent/all_recipes",
517517
"categories": ["agent"]
518-
},
519-
{
520-
"name": "agent/google_shopping",
521-
"categories": ["agent"]
522518
}
523519
]
524520
}

evals/evaluator.ts

Lines changed: 91 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ import {
1313
import dotenv from "dotenv";
1414
import {
1515
EvaluateOptions,
16+
BatchAskOptions,
1617
EvaluationResult,
17-
BatchEvaluateOptions,
1818
} from "@/types/evaluator";
1919
import { LLMParsedResponse } from "@/lib/inference";
2020
import { LLMResponse } from "@/lib/llm/LLMClient";
@@ -46,30 +46,30 @@ export class Evaluator {
4646
this.modelClientOptions = modelClientOptions || {
4747
apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY || "",
4848
};
49-
// Create a silent logger function that doesn't output anything
50-
this.silentLogger = () => {};
5149
}
5250

53-
/**
54-
* Evaluates the current state of the page against a specific question.
55-
* Uses structured response generation to ensure proper format.
56-
* Returns the evaluation result with normalized response and success status.
57-
*
58-
* @param options - The options for evaluation
59-
* @returns A promise that resolves to an EvaluationResult
60-
*/
61-
async evaluate(options: EvaluateOptions): Promise<EvaluationResult> {
51+
async ask(options: EvaluateOptions): Promise<EvaluationResult> {
6252
const {
6353
question,
64-
systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer.
65-
Return your response as a JSON object with the following format:
66-
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
67-
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
68-
screenshotDelayMs = 1000,
54+
answer,
55+
screenshot = true,
56+
systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
57+
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
58+
Today's date is ${new Date().toLocaleDateString()}`,
59+
screenshotDelayMs = 250,
6960
} = options;
61+
if (!question) {
62+
throw new Error("Question cannot be an empty string");
63+
}
64+
if (!answer && !screenshot) {
65+
throw new Error("Either answer (text) or screenshot must be provided");
66+
}
7067

7168
await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
72-
const imageBuffer = await this.stagehand.page.screenshot();
69+
let imageBuffer: Buffer;
70+
if (screenshot) {
71+
imageBuffer = await this.stagehand.page.screenshot();
72+
}
7373
const llmClient = this.stagehand.llmProvider.getClient(
7474
this.modelName,
7575
this.modelClientOptions,
@@ -86,12 +86,24 @@ export class Evaluator {
8686
role: "user",
8787
content: [
8888
{ type: "text", text: question },
89-
{
90-
type: "image_url",
91-
image_url: {
92-
url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
93-
},
94-
},
89+
...(screenshot
90+
? [
91+
{
92+
type: "image_url",
93+
image_url: {
94+
url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
95+
},
96+
},
97+
]
98+
: []),
99+
...(answer
100+
? [
101+
{
102+
type: "text",
103+
text: `the answer is ${answer}`,
104+
},
105+
]
106+
: []),
95107
],
96108
},
97109
],
@@ -106,15 +118,10 @@ export class Evaluator {
106118
const result = response.data as unknown as z.infer<
107119
typeof EvaluationSchema
108120
>;
109-
110-
return {
111-
evaluation: result.evaluation,
112-
reasoning: result.reasoning,
113-
};
121+
return { evaluation: result.evaluation, reasoning: result.reasoning };
114122
} catch (error) {
115123
const errorMessage =
116124
error instanceof Error ? error.message : String(error);
117-
118125
return {
119126
evaluation: "INVALID" as const,
120127
reasoning: `Failed to get structured response: ${errorMessage}`,
@@ -123,43 +130,65 @@ export class Evaluator {
123130
}
124131

125132
/**
126-
* Evaluates the current state of the page against multiple questions in a single screenshot.
127-
* Uses structured response generation to ensure proper format.
133+
* Evaluates multiple questions with optional answers and/or screenshot.
134+
* Similar to ask() but processes multiple questions in a single call.
128135
* Returns an array of evaluation results.
129136
*
130137
* @param options - The options for batch evaluation
131138
* @returns A promise that resolves to an array of EvaluationResults
132139
*/
133-
async batchEvaluate(
134-
options: BatchEvaluateOptions,
135-
): Promise<EvaluationResult[]> {
140+
async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {
136141
const {
137142
questions,
138-
systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer.
139-
Return your response as a JSON array, where each object corresponds to a question and has the following format:
140-
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }
141-
Be critical about the question and the answer, the slightest detail might be the difference between yes and no.`,
143+
screenshot = true,
144+
systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
145+
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
146+
Today's date is ${new Date().toLocaleDateString()}`,
142147
screenshotDelayMs = 1000,
143148
} = options;
144149

150+
// Validate inputs
151+
if (!questions || questions.length === 0) {
152+
throw new Error("Questions array cannot be empty");
153+
}
154+
155+
for (const item of questions) {
156+
if (!item.question) {
157+
throw new Error("Question cannot be an empty string");
158+
}
159+
if (!item.answer && !screenshot) {
160+
throw new Error(
161+
"Either answer (text) or screenshot must be provided for each question",
162+
);
163+
}
164+
}
165+
145166
// Wait for the specified delay before taking screenshot
146167
await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
147168

148-
// Take a screenshot of the current page state
149-
const imageBuffer = await this.stagehand.page.screenshot();
150-
151-
// Create a numbered list of questions for the VLM
152-
const formattedQuestions = questions
153-
.map((q, i) => `${i + 1}. ${q}`)
154-
.join("\n");
169+
let imageBuffer: Buffer;
170+
if (screenshot) {
171+
imageBuffer = await this.stagehand.page.screenshot();
172+
}
155173

156174
// Get the LLM client with our preferred model
157175
const llmClient = this.stagehand.llmProvider.getClient(
158176
this.modelName,
159177
this.modelClientOptions,
160178
);
161179

162-
// Use the model-specific LLM client to evaluate the screenshot with all questions
180+
// Format all questions with their optional answers
181+
const formattedQuestions = questions
182+
.map((item, i) => {
183+
let text = `${i + 1}. ${item.question}`;
184+
if (item.answer) {
185+
text += `\n Answer: ${item.answer}`;
186+
}
187+
return text;
188+
})
189+
.join("\n\n");
190+
191+
// Use the model-specific LLM client to evaluate
163192
const response = await llmClient.createChatCompletion<
164193
LLMParsedResponse<LLMResponse>
165194
>({
@@ -168,18 +197,22 @@ export class Evaluator {
168197
messages: [
169198
{
170199
role: "system",
171-
content: `${systemPrompt}\n\nYou will be given multiple questions. Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
200+
content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
172201
},
173202
{
174203
role: "user",
175204
content: [
176205
{ type: "text", text: formattedQuestions },
177-
{
178-
type: "image_url",
179-
image_url: {
180-
url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
181-
},
182-
},
206+
...(screenshot
207+
? [
208+
{
209+
type: "image_url",
210+
image_url: {
211+
url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
212+
},
213+
},
214+
]
215+
: []),
183216
],
184217
},
185218
],
@@ -194,29 +227,13 @@ export class Evaluator {
194227
const results = response.data as unknown as z.infer<
195228
typeof BatchEvaluationSchema
196229
>;
197-
198-
// Pad with INVALID results if we got fewer than expected
199-
const finalResults: EvaluationResult[] = [];
200-
for (let i = 0; i < questions.length; i++) {
201-
if (i < results.length) {
202-
finalResults.push({
203-
evaluation: results[i].evaluation,
204-
reasoning: results[i].reasoning,
205-
});
206-
} else {
207-
finalResults.push({
208-
evaluation: "INVALID",
209-
reasoning: "No response found for this question.",
210-
});
211-
}
212-
}
213-
214-
return finalResults;
230+
return results.map((r) => ({
231+
evaluation: r.evaluation,
232+
reasoning: r.reasoning,
233+
}));
215234
} catch (error) {
216235
const errorMessage =
217236
error instanceof Error ? error.message : String(error);
218-
219-
// Fallback: return INVALID for all questions
220237
return questions.map(() => ({
221238
evaluation: "INVALID" as const,
222239
reasoning: `Failed to get structured response: ${errorMessage}`,

evals/tasks/agent/all_recipes.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ export const all_recipes: EvalFunction = async ({
1717
maxSteps: 30,
1818
});
1919

20-
const { evaluation, reasoning } = await evaluator.evaluate({
20+
const { evaluation, reasoning } = await evaluator.ask({
2121
question: "Did the agent find a recipe for Beef Wellington",
2222
});
2323

evals/tasks/agent/apple_trade_in.ts

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//this eval is expected to fail due to issues scrolling within the trade in dialog
22
import { EvalFunction } from "@/types/evals";
3-
import { z } from "zod";
3+
import { Evaluator } from "../../evaluator";
44

55
export const apple_trade_in: EvalFunction = async ({
66
debugUrl,
@@ -11,27 +11,26 @@ export const apple_trade_in: EvalFunction = async ({
1111
}) => {
1212
try {
1313
await stagehand.page.goto("https://www.apple.com/shop/trade-in");
14+
const evaluator = new Evaluator(stagehand);
1415
const agentResult = await agent.execute({
1516
instruction:
1617
"Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
1718
maxSteps: 30,
1819
});
1920

20-
const { tradeInValue } = await stagehand.page.extract({
21-
modelName: "google/gemini-2.5-flash",
22-
instruction:
23-
"Extract the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website. it will be inside this text : Get x trade-in credit toward a new iPhone', provide just the number",
24-
schema: z.object({
25-
tradeInValue: z.number(),
26-
}),
21+
const { evaluation, reasoning } = await evaluator.ask({
22+
question:
23+
"Did the agent find the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website?",
24+
screenshot: false,
25+
answer: "360",
2726
});
2827

29-
const success = agentResult.success && tradeInValue === 360;
28+
const success = agentResult.success && evaluation === "YES";
3029

3130
if (!success) {
3231
return {
3332
_success: false,
34-
message: agentResult.message,
33+
message: reasoning,
3534
debugUrl,
3635
sessionUrl,
3736
logs: logger.getLogs(),

evals/tasks/agent/arxiv_gpt_report.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//agent often fails on this one,
22
import { EvalFunction } from "@/types/evals";
3-
import { z } from "zod";
3+
import { Evaluator } from "../../evaluator";
44
export const arxiv_gpt_report: EvalFunction = async ({
55
debugUrl,
66
sessionUrl,
@@ -9,6 +9,7 @@ export const arxiv_gpt_report: EvalFunction = async ({
99
agent,
1010
}) => {
1111
try {
12+
const evaluator = new Evaluator(stagehand);
1213
await stagehand.page.goto("https://arxiv.org/");
1314

1415
const agentResult = await agent.execute({
@@ -18,23 +19,22 @@ export const arxiv_gpt_report: EvalFunction = async ({
1819
});
1920

2021
// Mon, 27 Mar 2023 17:46:54 UTC
21-
const { date } = await stagehand.page.extract({
22-
modelName: "google/gemini-2.5-flash",
23-
instruction:
24-
"Extract the date of the v3 submission history, it should be in the format 'MM-DD-YYYY'",
25-
schema: z.object({
26-
date: z.string().describe("The date of the v3 submission history"),
27-
}),
22+
23+
const { evaluation, reasoning } = await evaluator.ask({
24+
question:
25+
"Did the agent find the published paper 'GPT-4 Technical Report' and the date it was submitted?",
26+
screenshot: false,
27+
answer: "03-27-2023",
2828
});
2929

30-
console.log(`date: ${date}`);
30+
console.log(`reasoning: ${reasoning}`);
3131

32-
const success = agentResult.success && date === "03-27-2023";
32+
const success = agentResult.success && evaluation === "YES";
3333

3434
if (!success) {
3535
return {
3636
_success: false,
37-
message: agentResult.message,
37+
message: reasoning,
3838
debugUrl,
3939
sessionUrl,
4040
logs: logger.getLogs(),

evals/tasks/agent/github.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ export const github: EvalFunction = async ({
1717
});
1818
logger.log(agentResult);
1919

20-
const { evaluation, reasoning } = await evaluator.evaluate({
20+
const { evaluation, reasoning } = await evaluator.ask({
2121
question:
2222
"Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
2323
});

evals/tasks/agent/github_react_version.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export const github_react_version: EvalFunction = async ({
1515
"Check the latest release version of React and the date it was published. ",
1616
maxSteps: 20,
1717
});
18-
const { evaluation, reasoning } = await evaluator.evaluate({
18+
const { evaluation, reasoning } = await evaluator.ask({
1919
question:
2020
"Does the page show the latest version of react and the date it was published",
2121
});

evals/tasks/agent/google_flights.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ export const google_flights: EvalFunction = async ({
1919
logger.log(agentResult);
2020

2121
const evaluator = new Evaluator(stagehand);
22-
const result = await evaluator.evaluate({
22+
const result = await evaluator.ask({
2323
question:
2424
"Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
2525
});

0 commit comments

Comments
 (0)