Skip to content

Commit 444da19

Browse files
authored
add agent reasoning to evaluator, improve prompts & parameterize max … (#1050)
…steps # why - Evaluator is often too strict in its evaluation, resulting in false positives - max step limits in evals are very brittle and can cause false negatives on tasks that exceed the current limit - Evaluator can sometimes treat screenshots as a single source of truth, resulting in false negatives. Providing the agent's reasoning alongside these helps mitigate this # what changed - parameterized max steps to allow for easy configuration across all evals through env - adjusted evaluator prompting - added "agent reasoning" param which can be used to pass in the agent's reasoning alongside bulk screenshots in a format the evaluator can understand well # test plan tested locally
1 parent 749ce6c commit 444da19

30 files changed

+56
-40
lines changed

.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ ENABLE_CACHING=false
99
EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest"
1010
EXPERIMENTAL_EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest,o1-mini,o1-preview"
1111
EVAL_CATEGORIES="observe,act,combination,extract,experimental"
12+
AGENT_EVAL_MAX_STEPS=50
1213
STAGEHAND_API_URL="http://localhost:80"

evals/evaluator.ts

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ export class Evaluator {
5555
screenshot = true,
5656
systemPrompt,
5757
screenshotDelayMs = 250,
58+
agentReasoning,
5859
} = options;
5960
if (!question) {
6061
throw new Error("Question cannot be an empty string");
@@ -69,12 +70,12 @@ export class Evaluator {
6970
question,
7071
screenshots: screenshot,
7172
systemPrompt,
73+
agentReasoning,
7274
});
7375
}
7476

7577
// Single screenshot case (existing logic)
76-
const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
77-
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
78+
const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.
7879
Today's date is ${new Date().toLocaleDateString()}`;
7980

8081
await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
@@ -97,7 +98,12 @@ export class Evaluator {
9798
{
9899
role: "user",
99100
content: [
100-
{ type: "text", text: question },
101+
{
102+
type: "text",
103+
text: agentReasoning
104+
? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
105+
: question,
106+
},
101107
...(screenshot
102108
? [
103109
{
@@ -153,8 +159,7 @@ export class Evaluator {
153159
const {
154160
questions,
155161
screenshot = true,
156-
systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task (in the form of a screenshot, or an answer). Provide a detailed reasoning for your answer.
157-
Be critical about the question and the answer, the slightest detail might be the difference between yes and no. for text, be lenient and allow for slight variations in wording. we will be comparing the agents trajectory to see if it contains the information we were looking for in the answer.
162+
systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task based on the original goal. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.
158163
Today's date is ${new Date().toLocaleDateString()}`,
159164
screenshotDelayMs = 1000,
160165
} = options;
@@ -260,14 +265,17 @@ export class Evaluator {
260265
question: string;
261266
screenshots: Buffer[];
262267
systemPrompt?: string;
268+
agentReasoning?: string;
263269
}): Promise<EvaluationResult> {
264270
const {
265271
question,
266272
screenshots,
273+
agentReasoning,
267274
systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
275+
${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
268276
Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
269277
Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
270-
Be critical about the question but consider the ENTIRE sequence when making your determination.
278+
${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
271279
Today's date is ${new Date().toLocaleDateString()}`,
272280
} = options;
273281

@@ -303,7 +311,9 @@ export class Evaluator {
303311
content: [
304312
{
305313
type: "text",
306-
text: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
314+
text: agentReasoning
315+
? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
316+
: `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
307317
},
308318
...imageContents,
309319
],

evals/tasks/agent/all_recipes.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export const all_recipes: EvalFunction = async ({
1414
const agentResult = await agent.execute({
1515
instruction:
1616
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
17-
maxSteps: 30,
17+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
1818
});
1919

2020
const { evaluation, reasoning } = await evaluator.ask({

evals/tasks/agent/apple_trade_in.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export const apple_trade_in: EvalFunction = async ({
1515
const agentResult = await agent.execute({
1616
instruction:
1717
"Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
18-
maxSteps: 30,
18+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
1919
});
2020

2121
const { evaluation, reasoning } = await evaluator.ask({

evals/tasks/agent/apple_tv.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export const apple_tv: EvalFunction = async ({
1414
const agentResult = await agent.execute({
1515
instruction:
1616
"Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
17-
maxSteps: 30,
17+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
1818
});
1919

2020
const { height, width } = await stagehand.page.extract({

evals/tasks/agent/arxiv_gpt_report.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export const arxiv_gpt_report: EvalFunction = async ({
1515
const agentResult = await agent.execute({
1616
instruction:
1717
"Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
18-
maxSteps: 25,
18+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
1919
});
2020

2121
// Mon, 27 Mar 2023 17:46:54 UTC

evals/tasks/agent/gaia.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ export const gaia: EvalFunction = async ({
5151

5252
const result = await agent.execute({
5353
instruction: params.ques,
54-
maxSteps: 50,
54+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
5555
});
5656

5757
const expected = (params as Record<string, unknown>).expected as

evals/tasks/agent/github.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export const github: EvalFunction = async ({
1313
const agentResult = await agent.execute({
1414
instruction:
1515
"Find a Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
16-
maxSteps: 20,
16+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
1717
});
1818
logger.log(agentResult);
1919

evals/tasks/agent/github_react_version.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export const github_react_version: EvalFunction = async ({
1313
await agent.execute({
1414
instruction:
1515
"Check the latest release version of React and the date it was published. ",
16-
maxSteps: 20,
16+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 20,
1717
});
1818
const { evaluation, reasoning } = await evaluator.ask({
1919
question:

evals/tasks/agent/google_flights.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export const google_flights: EvalFunction = async ({
1414
const agentResult = await agent.execute({
1515
instruction:
1616
"Search for flights from San Francisco to New York for next weekend",
17-
maxSteps: 30,
17+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
1818
});
1919
logger.log(agentResult);
2020

0 commit comments

Comments
 (0)