Skip to content

Commit 6ae809d

Browse files
miguelg719 and tkattkat authored
Patch a few agent evals (#1055)
# why

# what changed

# test plan

---------

Co-authored-by: tkattkat <[email protected]>
Co-authored-by: tkattkat <[email protected]>
1 parent 444da19 commit 6ae809d

19 files changed

+40 -45 lines changed

evals/tasks/agent/all_recipes.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ export const all_recipes: EvalFunction = async ({
2424
logger.log(agentResult);
2525

2626
const success =
27-
agentResult.success &&
2827
evaluation === "YES" &&
2928
stagehand.page.url() ===
3029
"https://www.allrecipes.com/recipe/16899/beef-wellington/";

evals/tasks/agent/apple_trade_in.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ export const apple_trade_in: EvalFunction = async ({
1212
try {
1313
await stagehand.page.goto("https://www.apple.com/shop/trade-in");
1414
const evaluator = new Evaluator(stagehand);
15-
const agentResult = await agent.execute({
15+
await agent.execute({
1616
instruction:
1717
"Find out the trade-in value for an iPhone 13 Pro Max in good condition on the Apple website.",
1818
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
@@ -25,7 +25,7 @@ export const apple_trade_in: EvalFunction = async ({
2525
answer: "360",
2626
});
2727

28-
const success = agentResult.success && evaluation === "YES";
28+
const success = evaluation === "YES";
2929

3030
if (!success) {
3131
return {

evals/tasks/agent/apple_tv.ts

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { EvalFunction } from "@/types/evals";
2-
import { z } from "zod";
32

3+
import { Evaluator } from "../../evaluator";
44
export const apple_tv: EvalFunction = async ({
55
debugUrl,
66
sessionUrl,
@@ -14,24 +14,19 @@ export const apple_tv: EvalFunction = async ({
1414
const agentResult = await agent.execute({
1515
instruction:
1616
"Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.",
17-
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
17+
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
1818
});
1919

20-
const { height, width } = await stagehand.page.extract({
21-
modelName: "google/gemini-2.5-flash",
22-
instruction: "Extract the size and weight of the Apple TV 4K",
23-
schema: z.object({
24-
height: z.number().describe("The height of the Apple TV 4K in inches"),
25-
width: z.number().describe("The width of the Apple TV 4K in inches"),
26-
}),
20+
const evaluator = new Evaluator(stagehand);
21+
const result = await evaluator.ask({
22+
question:
23+
"did the agent find the height and width of the Apple TV 4K in its reasoning which is 1.2 and 3.66?",
24+
answer: agentResult.message,
2725
});
2826

2927
const success =
30-
agentResult.success &&
31-
height === 1.2 &&
32-
width === 3.66 &&
28+
result.evaluation === "YES" &&
3329
stagehand.page.url().includes("https://www.apple.com/apple-tv-4k/specs/");
34-
3530
if (!success) {
3631
return {
3732
_success: false,

evals/tasks/agent/arxiv_gpt_report.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ export const arxiv_gpt_report: EvalFunction = async ({
1212
const evaluator = new Evaluator(stagehand);
1313
await stagehand.page.goto("https://arxiv.org/");
1414

15-
const agentResult = await agent.execute({
15+
await agent.execute({
1616
instruction:
1717
"Find the paper 'GPT-4 Technical Report', when was v3 submitted?",
1818
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 25,
@@ -29,7 +29,7 @@ export const arxiv_gpt_report: EvalFunction = async ({
2929

3030
console.log(`reasoning: ${reasoning}`);
3131

32-
const success = agentResult.success && evaluation === "YES";
32+
const success = evaluation === "YES";
3333

3434
if (!success) {
3535
return {

evals/tasks/agent/github.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ export const github: EvalFunction = async ({
2222
"Ruby repository on GitHub that has been updated in the past 3 days and has at least 1000 stars.",
2323
});
2424

25-
const success = agentResult.success && evaluation === "YES";
25+
const success = evaluation === "YES";
2626

2727
if (!success) {
2828
return {

evals/tasks/agent/google_maps_2.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ export const google_maps_2: EvalFunction = async ({
4646
}
4747

4848
if (result.evaluation === "YES") {
49-
if (distance !== 1.5) {
49+
if (distance <= 1.3 || distance >= 1.6) {
5050
return {
5151
_success: false,
5252
observations: "Distance is not 1.5 km",

evals/tasks/agent/google_maps_3.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ export const google_maps_3: EvalFunction = async ({
1010
try {
1111
await stagehand.page.goto("https://maps.google.com/");
1212
const evaluator = new Evaluator(stagehand);
13-
const agentResult = await agent.execute({
13+
await agent.execute({
1414
instruction:
1515
"Search for locksmiths open now but not open 24 hours in Texas City.",
1616
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 35,
@@ -21,7 +21,7 @@ export const google_maps_3: EvalFunction = async ({
2121
"Does the page show a locksmiths open now but not open 24 hours in Texas City?",
2222
});
2323

24-
const success = agentResult.success && evaluation === "YES";
24+
const success = evaluation === "YES";
2525

2626
if (!success) {
2727
return {

evals/tasks/agent/google_shopping.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export const google_shopping: EvalFunction = async ({
2424
"Does the page show a drip coffee maker that is on sale and within $25-60 and has a black finish?",
2525
});
2626

27-
const success = agentResult.success && evaluation === "YES";
27+
const success = evaluation === "YES";
2828

2929
if (!success) {
3030
return {

evals/tasks/agent/hotel_booking.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export const hotel_booking: EvalFunction = async ({
2424
"Does the page show a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on December 10, 2025?",
2525
});
2626

27-
const success = agentResult.success && evaluation === "YES";
27+
const success = evaluation === "YES";
2828

2929
if (!success) {
3030
return {

evals/tasks/agent/hugging_face.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export const hugging_face: EvalFunction = async ({
2424
screenshot: false,
2525
});
2626

27-
const success = agentResult.success && evaluation === "YES";
27+
const success = evaluation === "YES";
2828

2929
console.log(`reasoning: ${reasoning}`);
3030
if (!success) {

0 commit comments

Comments
 (0)