Skip to content

Commit d584fc1

Browse files
[evals] wrap all evals in try/catch/finally (#889)
# why
- some eval tasks may fail and leave the browser open if the code never reaches `stagehand.close()`
- this is problematic when running a lot of evals with high concurrency (you'll get rate limited because of a bunch of sessions left running)

# what changed
- wrapped all eval tasks in try/catch/finally blocks

# test plan
- this is it
1 parent a462877 commit d584fc1

File tree

85 files changed

+4057
-3475
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+4057
-3475
lines changed

evals/tasks/agent/google_flights.ts

Lines changed: 49 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -8,56 +8,65 @@ export const google_flights: EvalFunction = async ({
88
logger,
99
modelName,
1010
}) => {
11-
await stagehand.page.goto("https://google.com/travel/flights");
11+
try {
12+
await stagehand.page.goto("https://google.com/travel/flights");
1213

13-
const agent = stagehand.agent({
14-
model: modelName,
15-
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
16-
instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}. The current page is ${await stagehand.page.title()}`,
17-
});
14+
const agent = stagehand.agent({
15+
model: modelName,
16+
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
17+
instructions: `You are a helpful assistant that can help me with my tasks. You are given a task and you need to complete it without asking follow up questions. Today is ${new Date().toISOString().slice(0, 10)}. The current page is ${await stagehand.page.title()}`,
18+
});
1819

19-
const agentResult = await agent.execute({
20-
instruction:
21-
"Search for flights from San Francisco to New York for next weekend",
22-
maxSteps: 15,
23-
});
24-
logger.log(agentResult);
20+
const agentResult = await agent.execute({
21+
instruction:
22+
"Search for flights from San Francisco to New York for next weekend",
23+
maxSteps: 15,
24+
});
25+
logger.log(agentResult);
2526

26-
const evaluator = new Evaluator(stagehand);
27-
const result = await evaluator.evaluate({
28-
question:
29-
"Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
30-
strictResponse: true,
31-
});
27+
const evaluator = new Evaluator(stagehand);
28+
const result = await evaluator.evaluate({
29+
question:
30+
"Does the page show flights (options, available flights, not a search form) from San Francisco to New York?",
31+
strictResponse: true,
32+
});
3233

33-
if (result.evaluation !== "YES" && result.evaluation !== "NO") {
34-
await stagehand.close();
35-
return {
36-
_success: false,
37-
observations: "Evaluator provided an invalid response",
38-
debugUrl,
39-
sessionUrl,
40-
logs: logger.getLogs(),
41-
};
42-
}
34+
if (result.evaluation !== "YES" && result.evaluation !== "NO") {
35+
return {
36+
_success: false,
37+
observations: "Evaluator provided an invalid response",
38+
debugUrl,
39+
sessionUrl,
40+
logs: logger.getLogs(),
41+
};
42+
}
4343

44-
if (result.evaluation === "YES") {
45-
await stagehand.close();
46-
return {
47-
_success: true,
48-
observations: result.reasoning,
49-
debugUrl,
50-
sessionUrl,
51-
logs: logger.getLogs(),
52-
};
53-
} else {
54-
await stagehand.close();
44+
if (result.evaluation === "YES") {
45+
return {
46+
_success: true,
47+
observations: result.reasoning,
48+
debugUrl,
49+
sessionUrl,
50+
logs: logger.getLogs(),
51+
};
52+
} else {
53+
return {
54+
_success: false,
55+
observations: result.reasoning,
56+
debugUrl,
57+
sessionUrl,
58+
logs: logger.getLogs(),
59+
};
60+
}
61+
} catch (error) {
5562
return {
5663
_success: false,
57-
observations: result.reasoning,
64+
error: error,
5865
debugUrl,
5966
sessionUrl,
6067
logs: logger.getLogs(),
6168
};
69+
} finally {
70+
await stagehand.close();
6271
}
6372
};

evals/tasks/agent/iframe_form.ts

Lines changed: 66 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -8,77 +8,85 @@ export const iframe_form: EvalFunction = async ({
88
logger,
99
modelName,
1010
}) => {
11-
await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
11+
try {
12+
await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
1213

13-
const agent = stagehand.agent({
14-
provider: "anthropic",
15-
model: modelName,
16-
});
14+
const agent = stagehand.agent({
15+
provider: "anthropic",
16+
model: modelName,
17+
});
1718

18-
const agentResult = await agent.execute({
19-
instruction: "Fill in the form name with 'John Smith'",
20-
maxSteps: 3,
21-
});
22-
logger.log(agentResult);
19+
const agentResult = await agent.execute({
20+
instruction: "Fill in the form name with 'John Smith'",
21+
maxSteps: 3,
22+
});
23+
logger.log(agentResult);
2324

24-
await stagehand.page.mouse.wheel(0, -1000);
25-
const evaluator = new Evaluator(stagehand);
26-
const result = await evaluator.evaluate({
27-
question: "Is the form name input filled with 'John Smith'?",
28-
strictResponse: true,
29-
});
25+
await stagehand.page.mouse.wheel(0, -1000);
26+
const evaluator = new Evaluator(stagehand);
27+
const result = await evaluator.evaluate({
28+
question: "Is the form name input filled with 'John Smith'?",
29+
strictResponse: true,
30+
});
3031

31-
if (result.evaluation !== "YES" && result.evaluation !== "NO") {
32-
await stagehand.close();
33-
return {
34-
_success: false,
35-
observations: "Evaluator provided an invalid response",
36-
debugUrl,
37-
sessionUrl,
38-
logs: logger.getLogs(),
39-
};
40-
}
32+
if (result.evaluation !== "YES" && result.evaluation !== "NO") {
33+
return {
34+
_success: false,
35+
observations: "Evaluator provided an invalid response",
36+
debugUrl,
37+
sessionUrl,
38+
logs: logger.getLogs(),
39+
};
40+
}
4141

42-
const agentResult2 = await agent.execute({
43-
instruction: "Fill in the form email with '[email protected]'",
44-
maxSteps: 3,
45-
});
46-
logger.log(agentResult2);
42+
const agentResult2 = await agent.execute({
43+
instruction: "Fill in the form email with '[email protected]'",
44+
maxSteps: 3,
45+
});
46+
logger.log(agentResult2);
4747

48-
await stagehand.page.mouse.wheel(0, -1000);
49-
const result2 = await evaluator.evaluate({
50-
question: "Is the form email input filled with '[email protected]'?",
51-
strictResponse: true,
52-
});
48+
await stagehand.page.mouse.wheel(0, -1000);
49+
const result2 = await evaluator.evaluate({
50+
question: "Is the form email input filled with '[email protected]'?",
51+
strictResponse: true,
52+
});
5353

54-
if (result2.evaluation !== "YES" && result2.evaluation !== "NO") {
55-
await stagehand.close();
56-
return {
57-
_success: false,
58-
observations: "Evaluator provided an invalid response",
59-
debugUrl,
60-
sessionUrl,
61-
logs: logger.getLogs(),
62-
};
63-
}
54+
if (result2.evaluation !== "YES" && result2.evaluation !== "NO") {
55+
return {
56+
_success: false,
57+
observations: "Evaluator provided an invalid response",
58+
debugUrl,
59+
sessionUrl,
60+
logs: logger.getLogs(),
61+
};
62+
}
6463

65-
if (result.evaluation === "YES" && result2.evaluation === "YES") {
66-
await stagehand.close();
67-
return {
68-
_success: true,
69-
observations: "All fields were filled correctly",
70-
debugUrl,
71-
sessionUrl,
72-
logs: logger.getLogs(),
73-
};
74-
} else {
75-
await stagehand.close();
64+
if (result.evaluation === "YES" && result2.evaluation === "YES") {
65+
return {
66+
_success: true,
67+
observations: "All fields were filled correctly",
68+
debugUrl,
69+
sessionUrl,
70+
logs: logger.getLogs(),
71+
};
72+
} else {
73+
return {
74+
_success: false,
75+
observations: "One or more fields were not filled correctly",
76+
debugUrl,
77+
sessionUrl,
78+
logs: logger.getLogs(),
79+
};
80+
}
81+
} catch (error) {
7682
return {
7783
_success: false,
78-
observations: "One or more fields were not filled correctly",
84+
error: error,
7985
debugUrl,
8086
sessionUrl,
8187
logs: logger.getLogs(),
8288
};
89+
} finally {
90+
await stagehand.close();
8391
}
8492
};

evals/tasks/agent/iframe_form_multiple.ts

Lines changed: 59 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -8,60 +8,69 @@ export const iframe_form_multiple: EvalFunction = async ({
88
logger,
99
modelName,
1010
}) => {
11-
await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
11+
try {
12+
await stagehand.page.goto("https://tucowsdomains.com/abuse-form/phishing/");
1213

13-
const agent = stagehand.agent({
14-
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
15-
model: modelName,
16-
});
14+
const agent = stagehand.agent({
15+
provider: modelName.startsWith("claude") ? "anthropic" : "openai",
16+
model: modelName,
17+
});
1718

18-
const agentResult = await agent.execute({
19-
instruction:
20-
"Fill in the form name with 'John Smith', the email with '[email protected]', and select the 'Are you the domain owner?' option as 'No'",
21-
maxSteps: 10,
22-
});
23-
logger.log(agentResult);
19+
const agentResult = await agent.execute({
20+
instruction:
21+
"Fill in the form name with 'John Smith', the email with '[email protected]', and select the 'Are you the domain owner?' option as 'No'",
22+
maxSteps: 10,
23+
});
24+
logger.log(agentResult);
2425

25-
await stagehand.page.mouse.wheel(0, -1000);
26-
const evaluator = new Evaluator(stagehand);
27-
const results = await evaluator.batchEvaluate({
28-
questions: [
29-
"Is the form name input filled with 'John Smith'?",
30-
"Is the form email input filled with '[email protected]'?",
31-
"Is the 'Are you the domain owner?' option selected as 'No'?",
32-
],
33-
strictResponse: true,
34-
});
26+
await stagehand.page.mouse.wheel(0, -1000);
27+
const evaluator = new Evaluator(stagehand);
28+
const results = await evaluator.batchEvaluate({
29+
questions: [
30+
"Is the form name input filled with 'John Smith'?",
31+
"Is the form email input filled with '[email protected]'?",
32+
"Is the 'Are you the domain owner?' option selected as 'No'?",
33+
],
34+
strictResponse: true,
35+
});
3536

36-
for (const r of results) {
37-
if (r.evaluation !== "YES" && r.evaluation !== "NO") {
38-
await stagehand.close();
39-
return {
40-
_success: false,
41-
observations: "Evaluator provided an invalid response",
42-
debugUrl,
43-
sessionUrl,
44-
logs: logger.getLogs(),
45-
};
37+
for (const r of results) {
38+
if (r.evaluation !== "YES" && r.evaluation !== "NO") {
39+
return {
40+
_success: false,
41+
observations: "Evaluator provided an invalid response",
42+
debugUrl,
43+
sessionUrl,
44+
logs: logger.getLogs(),
45+
};
46+
}
47+
if (r.evaluation === "NO") {
48+
return {
49+
_success: false,
50+
observations: r.reasoning,
51+
debugUrl,
52+
sessionUrl,
53+
logs: logger.getLogs(),
54+
};
55+
}
4656
}
47-
if (r.evaluation === "NO") {
48-
await stagehand.close();
49-
return {
50-
_success: false,
51-
observations: r.reasoning,
52-
debugUrl,
53-
sessionUrl,
54-
logs: logger.getLogs(),
55-
};
56-
}
57-
}
5857

59-
await stagehand.close();
60-
return {
61-
_success: true,
62-
observations: "All fields were filled correctly",
63-
debugUrl,
64-
sessionUrl,
65-
logs: logger.getLogs(),
66-
};
58+
return {
59+
_success: true,
60+
observations: "All fields were filled correctly",
61+
debugUrl,
62+
sessionUrl,
63+
logs: logger.getLogs(),
64+
};
65+
} catch (error) {
66+
return {
67+
_success: false,
68+
error: error,
69+
debugUrl,
70+
sessionUrl,
71+
logs: logger.getLogs(),
72+
};
73+
} finally {
74+
await stagehand.close();
75+
}
6776
};

evals/tasks/agent/kayak.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,6 @@ export const kayak: EvalFunction = async ({
6565
logs: logger.getLogs(),
6666
};
6767
} finally {
68-
stagehand.close();
68+
await stagehand.close();
6969
}
7070
};

0 commit comments

Comments
 (0)