Skip to content

Commit 2d14001

Browse files
committed
add evals
1 parent 054103b commit 2d14001

File tree

9 files changed

+418
-0
lines changed

9 files changed

+418
-0
lines changed

evals/evals.config.json

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,42 @@
418418
{
419419
"name": "agent/sign_in",
420420
"categories": ["agent"]
421+
},
422+
{
423+
"name": "osr_in_oopif",
424+
"categories": ["act"]
425+
},
426+
{
427+
"name": "csr_in_oopif",
428+
"categories": ["act"]
429+
},
430+
{
431+
"name": "csr_in_spif",
432+
"categories": ["act"]
433+
},
434+
{
435+
"name": "csr_in_spif",
436+
"categories": ["act"]
437+
},
438+
{
439+
"name": "spif_in_osr",
440+
"categories": ["act"]
441+
},
442+
{
443+
"name": "oopif_in_osr",
444+
"categories": ["act"]
445+
},
446+
{
447+
"name": "spif_in_csr",
448+
"categories": ["act"]
449+
},
450+
{
451+
"name": "oopif_in_csr",
452+
"categories": ["act"]
453+
},
454+
{
455+
"name": "osr_in_spif",
456+
"categories": ["act"]
421457
}
422458
]
423459
}

evals/tasks/csr_in_oopif.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval: click a button that lives inside a CSR (closed-mode shadow root)
 * which is itself hosted inside an OOPIF (out-of-process iframe).
 *
 * Navigates to the eval site, asks Stagehand to "click the button" with the
 * `iframes: true` option, then extracts the page text (also with
 * `iframes: true`) and looks for the confirmation string.
 *
 * @returns A result object. `_success` is `true` only when the extracted text
 *   contains "button successfully clicked"; otherwise `_success` is `false`
 *   and `message` describes what went wrong. `stagehand.close()` is always
 *   awaited before the result is handed back.
 */
export const csr_in_oopif: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  const page = stagehand.page;
  try {
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/closed-shadow-root-in-oopif/",
    );
    await page.act({ action: "click the button", iframes: true });

    const extraction = await page.extract({
      instruction: "extract the entire page text",
      iframes: true,
    });

    const pageText = extraction.extraction;

    if (pageText.includes("button successfully clicked")) {
      return {
        _success: true,
        message: `successfully clicked the button`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Explicit failure result: without this the function would resolve to
    // `undefined` whenever the confirmation text is missing.
    return {
      _success: false,
      message: `unable to click the button`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: `error: ${reason}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/csr_in_spif.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval: click a button that lives inside a CSR (closed-mode shadow root)
 * which is itself hosted inside a SPIF (same-process iframe).
 *
 * Navigates to the eval site, asks Stagehand to "click the button" with the
 * `iframes: true` option, then extracts the page text (also with
 * `iframes: true`) and looks for the confirmation string.
 *
 * @returns A result object. `_success` is `true` only when the extracted text
 *   contains "button successfully clicked"; otherwise `_success` is `false`
 *   and `message` describes what went wrong. `stagehand.close()` is always
 *   awaited before the result is handed back.
 */
export const csr_in_spif: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  const page = stagehand.page;
  try {
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/closed-shadow-dom-in-spif/",
    );
    await page.act({ action: "click the button", iframes: true });

    const extraction = await page.extract({
      instruction: "extract the entire page text",
      iframes: true,
    });

    const pageText = extraction.extraction;

    if (pageText.includes("button successfully clicked")) {
      return {
        _success: true,
        message: `successfully clicked the button`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Explicit failure result: without this the function would resolve to
    // `undefined` whenever the confirmation text is missing.
    return {
      _success: false,
      message: `unable to click the button`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: `error: ${reason}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/oopif_in_csr.ts

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval: fill a form field inside an OOPIF (out-of-process iframe) that is
 * itself hosted inside a CSR (closed-mode shadow root).
 *
 * Navigates to the eval site, asks Stagehand to fill 'nunya' into the first
 * name field with the `iframes: true` option, then extracts the page text
 * (also with `iframes: true`) and checks that the typed value shows up.
 *
 * @returns A result object. `_success` is `true` only when the extracted text
 *   contains "nunya"; otherwise `_success` is `false` and `message` describes
 *   what went wrong. `stagehand.close()` is always awaited before the result
 *   is handed back.
 */
export const oopif_in_csr: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  const page = stagehand.page;
  try {
    // Target the closed-shadow-dom variant. The open-shadow-dom site is the
    // one exercised by the sibling `oopif_in_osr` eval.
    // NOTE(review): path follows the naming of the sibling eval sites
    // ("spif-in-closed-shadow-dom", "oopif-in-open-shadow-dom") — confirm it
    // is published.
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/oopif-in-closed-shadow-dom/",
    );
    await page.act({
      action: "fill 'nunya' into the first name field",
      iframes: true,
    });

    const extraction = await page.extract({
      instruction: "extract the entire page text",
      iframes: true,
    });

    const pageText = extraction.extraction;

    if (pageText.includes("nunya")) {
      return {
        _success: true,
        message: `successfully filled the form`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Explicit failure result: without this the function would resolve to
    // `undefined` whenever the typed value is missing from the page text.
    return {
      _success: false,
      message: `unable to fill the form`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: `error: ${reason}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/oopif_in_osr.ts

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval: fill a form field inside an OOPIF (out-of-process iframe) that is
 * itself hosted inside an OSR (open-mode shadow root).
 *
 * Navigates to the eval site, asks Stagehand to fill 'nunya' into the first
 * name field with the `iframes: true` option, then extracts the page text
 * (also with `iframes: true`) and checks that the typed value shows up.
 *
 * @returns A result object. `_success` is `true` only when the extracted text
 *   contains "nunya"; otherwise `_success` is `false` and `message` describes
 *   what went wrong. `stagehand.close()` is always awaited before the result
 *   is handed back.
 */
export const oopif_in_osr: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  const page = stagehand.page;
  try {
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/oopif-in-open-shadow-dom/",
    );
    await page.act({
      action: "fill 'nunya' into the first name field",
      iframes: true,
    });

    const extraction = await page.extract({
      instruction: "extract the entire page text",
      iframes: true,
    });

    const pageText = extraction.extraction;

    if (pageText.includes("nunya")) {
      return {
        _success: true,
        message: `successfully filled the form`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Explicit failure result: without this the function would resolve to
    // `undefined` whenever the typed value is missing from the page text.
    return {
      _success: false,
      message: `unable to fill the form`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: `error: ${reason}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/osr_in_oopif.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval: click a button that lives inside an OSR (open-mode shadow root)
 * which is itself hosted inside an OOPIF (out-of-process iframe).
 *
 * Navigates to the eval site, asks Stagehand to "click the button" with the
 * `iframes: true` option, then extracts the page text (also with
 * `iframes: true`) and looks for the confirmation string.
 *
 * @returns A result object. `_success` is `true` only when the extracted text
 *   contains "button successfully clicked"; otherwise `_success` is `false`
 *   and `message` describes what went wrong. `stagehand.close()` is always
 *   awaited before the result is handed back.
 */
export const osr_in_oopif: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  const page = stagehand.page;
  try {
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/open-shadow-root-in-oopif/",
    );
    await page.act({ action: "click the button", iframes: true });

    const extraction = await page.extract({
      instruction: "extract the entire page text",
      iframes: true,
    });

    const pageText = extraction.extraction;

    if (pageText.includes("button successfully clicked")) {
      return {
        _success: true,
        message: `successfully clicked the button`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Explicit failure result: without this the function would resolve to
    // `undefined` whenever the confirmation text is missing.
    return {
      _success: false,
      message: `unable to click the button`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: `error: ${reason}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/osr_in_spif.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval: click a button that lives inside an OSR (open-mode shadow root)
 * which is itself hosted inside a SPIF (same-process iframe).
 *
 * Navigates to the eval site, asks Stagehand to "click the button" with the
 * `iframes: true` option, then extracts the page text (also with
 * `iframes: true`) and looks for the confirmation string.
 *
 * @returns A result object. `_success` is `true` only when the extracted text
 *   contains "button successfully clicked"; otherwise `_success` is `false`
 *   and `message` describes what went wrong. `stagehand.close()` is always
 *   awaited before the result is handed back.
 */
export const osr_in_spif: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  const page = stagehand.page;
  try {
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/open-shadow-root-in-spif/",
    );
    await page.act({ action: "click the button", iframes: true });

    const extraction = await page.extract({
      instruction: "extract the entire page text",
      iframes: true,
    });

    const pageText = extraction.extraction;

    if (pageText.includes("button successfully clicked")) {
      return {
        _success: true,
        message: `successfully clicked the button`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Explicit failure result: without this the function would resolve to
    // `undefined` whenever the confirmation text is missing.
    return {
      _success: false,
      message: `unable to click the button`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: `error: ${reason}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

evals/tasks/spif_in_csr.ts

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval: click a button inside a SPIF (same-process iframe) that is itself
 * hosted inside a CSR (closed-mode shadow root).
 *
 * Navigates to the eval site, asks Stagehand to "click the button" with the
 * `iframes: true` option, then extracts the page text (also with
 * `iframes: true`) and looks for the confirmation string.
 *
 * @returns A result object. `_success` is `true` only when the extracted text
 *   contains "button successfully clicked"; otherwise `_success` is `false`
 *   and `message` describes what went wrong. `stagehand.close()` is always
 *   awaited before the result is handed back.
 */
export const spif_in_csr: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  const page = stagehand.page;
  try {
    await page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/spif-in-closed-shadow-dom/",
    );
    await page.act({ action: "click the button", iframes: true });

    const extraction = await page.extract({
      instruction: "extract the entire page text",
      iframes: true,
    });

    const pageText = extraction.extraction;

    if (pageText.includes("button successfully clicked")) {
      return {
        _success: true,
        message: `successfully clicked the button`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // Explicit failure result: without this the function would resolve to
    // `undefined` whenever the confirmation text is missing.
    return {
      _success: false,
      message: `unable to click the button`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    // A thrown value is not guaranteed to be an Error, so narrow before
    // reading `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      _success: false,
      message: `error: ${reason}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } finally {
    await stagehand.close();
  }
};

0 commit comments

Comments
 (0)