enable scrolling inside of iframes (#919)

seanmcguire12 · web-flow · commit 3d804210a106 · 2025-08-13T14:32:24.000-07:00
# why
this PR addresses two issues, both related to scrolling inside iframes:
1. we are not calling `.evaluate` inside frames:  
- currently, the scrolling functionality that exists inside
`performPlaywrightMethod` calls `page.evaluate`
- this means that scrolling can only ever happen at the page level, not
inside of iframes
- inside each of these scroll related helpers, we already have access to
a chained locator that optionally points to an element inside an iframe
- therefore, we should use this chained locator, and call
`locator.evaluate`
2. for SPIFs (same-process iframes), we are not looking for scrollable
elements in the correct execution context:
- when we call `resolveObjectIdForXPath`, this executes in either the
page-level execution context, or, for OOPIFs (out-of-process iframes),
the frame-level execution context
- this is problematic because SPIFs share the same CDP session as
the root document, which means that we are responsible for specifying
the execution context. since we aren't doing this,
`resolveObjectIdForXPath` only searches in the root document execution
context, and can't see anything inside of the SPIF
# what changed
- to address issue number 1, I updated `scrollToNextChunk`,
`scrollToPreviousChunk`, and `scrollElementToPercentage` to all use
`locator.evaluate` instead of `page.evaluate`, which enables scrolling
inside (and outside) of iframes
- to address issue number 2, I added a function
`getFrameExecutionContextId` which creates an isolated world & returns a
SPIF-scoped execution context
- we use this downstream in `resolveObjectIdForXPath` which guarantees
that we are searching for scrollable elements in the correct execution
context
# test plan
- added an eval for scrolling inside a same-process iframe
- `act` evals
- `targeted_extract` evals
- `observe` evals
- `extract` evals
diff --git a/.changeset/open-donkeys-shine.md b/.changeset/open-donkeys-shine.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+enable scrolling inside of iframes
diff --git a/evals/evals.config.json b/evals/evals.config.json
@@ -419,6 +419,10 @@
       "name": "agent/sign_in",
       "categories": ["agent"]
     },
+    {
+      "name": "iframe_scroll",
+      "categories": ["act"]
+    },
     {
       "name": "namespace_xpath",
       "categories": ["act"]
diff --git a/evals/tasks/iframe_scroll.ts b/evals/tasks/iframe_scroll.ts
@@ -0,0 +1,60 @@
+import { EvalFunction } from "@/types/evals";
+
+export const iframe_scroll: EvalFunction = async ({
+  debugUrl,
+  sessionUrl,
+  stagehand,
+  logger,
+}) => {
+  try {
+    await stagehand.page.goto(
+      "https://browserbase.github.io/stagehand-eval-sites/sites/iframe-same-proc-scroll/",
+    );
+    await stagehand.page.act({
+      action: "scroll down 50% inside the iframe",
+      iframes: true,
+    });
+
+    const frames = stagehand.page.frames();
+    const frame = frames[1];
+
+    await new Promise((resolve) => setTimeout(resolve, 5000));
+
+    // Get the current scroll position and total scroll height
+    const scrollInfo = await frame.evaluate(() => {
+      return {
+        scrollTop: window.scrollY + window.innerHeight / 2,
+        scrollHeight: document.documentElement.scrollHeight,
+      };
+    });
+
+    const halfwayScroll = scrollInfo.scrollHeight / 2;
+    const halfwayReached = Math.abs(scrollInfo.scrollTop - halfwayScroll) <= 1;
+    const evaluationResult = halfwayReached
+      ? {
+          _success: true,
+          logs: logger.getLogs(),
+          debugUrl,
+          sessionUrl,
+        }
+      : {
+          _success: false,
+          logs: logger.getLogs(),
+          debugUrl,
+          sessionUrl,
+          message: `Scroll position (${scrollInfo.scrollTop}px) is not halfway down the page (${halfwayScroll}px).`,
+        };
+
+    return evaluationResult;
+  } catch (error) {
+    return {
+      _success: false,
+      error: error,
+      logs: logger.getLogs(),
+      debugUrl,
+      sessionUrl,
+    };
+  } finally {
+    await stagehand.close();
+  }
+};
diff --git a/lib/a11y/utils.ts b/lib/a11y/utils.ts
@@ -30,6 +30,8 @@ const PUA_END = 0xf8ff;
 
 const NBSP_CHARS = new Set<number>([0x00a0, 0x202f, 0x2007, 0xfeff]);
 
+const WORLD_NAME = "stagehand-world";
+
 /**
  * Clean a string by removing private-use unicode characters, normalizing whitespace,
  * and trimming the result.
@@ -1045,6 +1047,8 @@ export async function resolveObjectIdForXPath(
   xpath: string,
   targetFrame?: Frame,
 ): Promise<string | null> {
+  const contextId = await getFrameExecutionContextId(page, targetFrame);
+
   const { result } = await page.sendCDP<{
     result?: { objectId?: string };
   }>(
@@ -1063,13 +1067,42 @@ export async function resolveObjectIdForXPath(
         })();
       `,
       returnByValue: false,
+      ...(contextId !== undefined ? { contextId } : {}),
     },
     targetFrame,
   );
   if (!result?.objectId) throw new StagehandElementNotFoundError([xpath]);
   return result.objectId;
 }
 
+/**
+ * Returns a stable executionContextId for the given frame by creating (or reusing)
+ * an isolated world in that frame.
+ */
+async function getFrameExecutionContextId(
+  stagehandPage: StagehandPage,
+  frame: Frame,
+): Promise<number | undefined> {
+  if (!frame || frame === stagehandPage.page.mainFrame()) {
+    // Main frame (or no frame): use the default world.
+    return undefined;
+  }
+  const frameId: string = await getCDPFrameId(stagehandPage, frame);
+  const { executionContextId } = await stagehandPage.sendCDP<{
+    executionContextId: number;
+  }>(
+    "Page.createIsolatedWorld",
+    {
+      frameId,
+      worldName: WORLD_NAME,
+      grantUniversalAccess: true,
+    },
+    frame,
+  );
+
+  return executionContextId;
+}
+
 /**
  * Collapse consecutive whitespace characters (spaces, tabs, newlines, carriage returns)
  * into single ASCII spaces.
diff --git a/lib/handlers/handlerUtils/actHandlerUtils.ts b/lib/handlers/handlerUtils/actHandlerUtils.ts
@@ -1,7 +1,6 @@
 import { Page, Locator, FrameLocator } from "playwright";
 import { PlaywrightCommandException } from "../../../types/playwright";
 import { StagehandPage } from "../../StagehandPage";
-import { getNodeFromXpath } from "@/lib/dom/utils";
 import { Logger } from "../../../types/log";
 import { MethodHandlerContext } from "@/types/act";
 import { StagehandClickError } from "@/types/stagehandErrors";
@@ -59,7 +58,7 @@ export const methodHandlerMap: Record<
 };
 
 export async function scrollToNextChunk(ctx: MethodHandlerContext) {
-  const { stagehandPage, xpath, logger } = ctx;
+  const { locator, logger, xpath } = ctx;
 
   logger({
     category: "action",
@@ -71,40 +70,45 @@ export async function scrollToNextChunk(ctx: MethodHandlerContext) {
   });
 
   try {
-    await stagehandPage.page.evaluate(
-      ({ xpath }) => {
-        const elementNode = getNodeFromXpath(xpath);
-        if (!elementNode || elementNode.nodeType !== Node.ELEMENT_NODE) {
-          throw Error(`Could not locate element to scroll on.`);
-        }
+    await locator.evaluate(
+      (element) => {
+        const waitForScrollEnd = (el: HTMLElement | Element) =>
+          new Promise<void>((resolve) => {
+            let last = el.scrollTop ?? 0;
+            const check = () => {
+              const cur = el.scrollTop ?? 0;
+              if (cur === last) return resolve();
+              last = cur;
+              requestAnimationFrame(check);
+            };
+            requestAnimationFrame(check);
+          });
 
-        const element = elementNode as HTMLElement;
         const tagName = element.tagName.toLowerCase();
-        let height: number;
 
         if (tagName === "html" || tagName === "body") {
-          height = window.visualViewport.height;
-          window.scrollBy({
-            top: height,
-            left: 0,
-            behavior: "smooth",
-          });
+          const height = window.visualViewport?.height ?? window.innerHeight;
 
-          const scrollingEl =
-            document.scrollingElement || document.documentElement;
-          return window.waitForElementScrollEnd(scrollingEl as HTMLElement);
-        } else {
-          height = element.getBoundingClientRect().height;
-          element.scrollBy({
-            top: height,
-            left: 0,
-            behavior: "smooth",
-          });
+          window.scrollBy({ top: height, left: 0, behavior: "smooth" });
+
+          const scrollingRoot = (document.scrollingElement ??
+            document.documentElement) as HTMLElement;
 
-          return window.waitForElementScrollEnd(element);
+          return waitForScrollEnd(scrollingRoot);
         }
+
+        const height = (element as HTMLElement).getBoundingClientRect().height;
+
+        (element as HTMLElement).scrollBy({
+          top: height,
+          left: 0,
+          behavior: "smooth",
+        });
+
+        return waitForScrollEnd(element);
       },
-      { xpath },
+      undefined,
+      { timeout: 10_000 },
     );
   } catch (e) {
     logger({
@@ -122,7 +126,7 @@ export async function scrollToNextChunk(ctx: MethodHandlerContext) {
 }
 
 export async function scrollToPreviousChunk(ctx: MethodHandlerContext) {
-  const { stagehandPage, xpath, logger } = ctx;
+  const { locator, logger, xpath } = ctx;
 
   logger({
     category: "action",
@@ -134,39 +138,41 @@ export async function scrollToPreviousChunk(ctx: MethodHandlerContext) {
   });
 
   try {
-    await stagehandPage.page.evaluate(
-      ({ xpath }) => {
-        const elementNode = getNodeFromXpath(xpath);
-        if (!elementNode || elementNode.nodeType !== Node.ELEMENT_NODE) {
-          throw Error(`Could not locate element to scroll on.`);
-        }
+    await locator.evaluate(
+      (element) => {
+        const waitForScrollEnd = (el: HTMLElement | Element) =>
+          new Promise<void>((resolve) => {
+            let last = el.scrollTop ?? 0;
+            const check = () => {
+              const cur = el.scrollTop ?? 0;
+              if (cur === last) return resolve();
+              last = cur;
+              requestAnimationFrame(check);
+            };
+            requestAnimationFrame(check);
+          });
 
-        const element = elementNode as HTMLElement;
         const tagName = element.tagName.toLowerCase();
-        let height: number;
 
         if (tagName === "html" || tagName === "body") {
-          height = window.visualViewport.height;
-          window.scrollBy({
-            top: -height,
-            left: 0,
-            behavior: "smooth",
-          });
+          const height = window.visualViewport?.height ?? window.innerHeight;
+          window.scrollBy({ top: -height, left: 0, behavior: "smooth" });
 
-          const scrollingEl =
-            document.scrollingElement || document.documentElement;
-          return window.waitForElementScrollEnd(scrollingEl as HTMLElement);
-        } else {
-          height = element.getBoundingClientRect().height;
-          element.scrollBy({
-            top: -height,
-            left: 0,
-            behavior: "smooth",
-          });
-          return window.waitForElementScrollEnd(element);
+          const rootScrollingEl = (document.scrollingElement ??
+            document.documentElement) as HTMLElement;
+
+          return waitForScrollEnd(rootScrollingEl);
         }
+        const height = (element as HTMLElement).getBoundingClientRect().height;
+        (element as HTMLElement).scrollBy({
+          top: -height,
+          left: 0,
+          behavior: "smooth",
+        });
+        return waitForScrollEnd(element);
       },
-      { xpath },
+      undefined,
+      { timeout: 10_000 },
     );
   } catch (e) {
     logger({
@@ -215,7 +221,7 @@ export async function scrollElementIntoView(ctx: MethodHandlerContext) {
 }
 
 export async function scrollElementToPercentage(ctx: MethodHandlerContext) {
-  const { args, stagehandPage, xpath, logger } = ctx;
+  const { args, xpath, logger, locator } = ctx;
 
   logger({
     category: "action",
@@ -230,20 +236,14 @@ export async function scrollElementToPercentage(ctx: MethodHandlerContext) {
   try {
     const [yArg = "0%"] = args as string[];
 
-    await stagehandPage.page.evaluate(
-      ({ xpath, yArg }) => {
+    await locator.evaluate<void, { yArg: string }>(
+      (element, { yArg }) => {
         function parsePercent(val: string): number {
           const cleaned = val.trim().replace("%", "");
           const num = parseFloat(cleaned);
           return Number.isNaN(num) ? 0 : Math.max(0, Math.min(num, 100));
         }
 
-        const elementNode = getNodeFromXpath(xpath);
-        if (!elementNode || elementNode.nodeType !== Node.ELEMENT_NODE) {
-          throw Error(`Could not locate element to scroll on.`);
-        }
-
-        const element = elementNode as HTMLElement;
         const yPct = parsePercent(yArg);
 
         if (element.tagName.toLowerCase() === "html") {
@@ -266,7 +266,8 @@ export async function scrollElementToPercentage(ctx: MethodHandlerContext) {
           });
         }
       },
-      { xpath, yArg },
+      { yArg },
+      { timeout: 10_000 },
     );
   } catch (e) {
     logger({

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@browserbasehq/stagehand": patch
 +---
++
 +enable scrolling inside of iframes