101 changes: 101 additions & 0 deletions workflows/examples/test_iframes.json
@@ -0,0 +1,101 @@
{
"name": "Recorded Workflow",
"description": "Recorded on 8/11/2025, 4:20:47 AM",
"version": "1.0.0",
"input_schema": [],
"steps": [
{
"type": "navigation",
"timestamp": 1754866228439,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
},
{
"type": "scroll",
"timestamp": 1754866228608,
"tabId": 388342781,
"targetId": 219,
"scrollX": 0,
"scrollY": 7,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
},
{
"type": "navigation",
"timestamp": 1754866228634,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
},
{
"type": "click",
"timestamp": 1754866228849,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
"frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
"frameIdPath": "0",
"xpath": "id(\"textareawrapper\")/div[1]/div[6]",
"cssSelector": "div.CodeMirror-scroll",
"elementTag": "DIV",
"elementText": "<!DOCTYPE html><html><body>​<h1>The iframe element</h1>​<iframe src=\"https://www.w3schools.com\" title=\"W3Schools Free Online Web Tutorials\"></iframe>​</body></html>​"
},
{
"type": "navigation",
"timestamp": 1754866230495,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
},
{
"type": "click",
"timestamp": 1754866231531,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
"frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
"frameIdPath": "0",
"xpath": "body/div[2]/div[1]/a[4]",
"cssSelector": "a.w3-button.w3-bar-item.w3-hide-small.topnav-icons.fa.fa-rotate.ga-tryit[href=\"javascript:void(0);\"][title*=\"Change Orientation\"]",
"elementTag": "A",
"elementText": ""
},
{
"type": "navigation",
"timestamp": 1754866237707,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
},
{
"type": "click",
"timestamp": 1754866238574,
"tabId": 388342781,
"url": "https://www.w3schools.com/",
"frameUrl": "https://www.w3schools.com/",
"frameIdPath": "0.0",
"xpath": "id(\"subtopnav\")/a[3]",
"cssSelector": "a.ga-nav[href=\"/js/default.asp\"][title*=\"JavaScript Tutorial\"]",
"elementTag": "A",
"elementText": "JAVASCRIPT"
},
{
"type": "navigation",
"timestamp": 1754866242778,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
},
{
"type": "click",
"timestamp": 1754866242882,
"tabId": 388342781,
"url": "https://www.w3schools.com/js/default.asp",
"frameUrl": "https://www.w3schools.com/js/default.asp",
"frameIdPath": "0.0",
"xpath": "id(\"subtopnav\")/a[1]",
"cssSelector": "a.ga-nav.subtopnav_firstitem[href=\"/html/default.asp\"][title*=\"HTML Tutorial\"]",
"elementTag": "A",
"elementText": "HTML"
},
{
"type": "navigation",
"timestamp": 1754866246395,
"tabId": 388342781,
"url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
}
]
}
131 changes: 108 additions & 23 deletions workflows/workflow_use/controller/service.py
@@ -3,8 +3,9 @@

from browser_use import Browser
from browser_use.agent.views import ActionResult
from browser_use.controller import Controller
from browser_use.llm.base import BaseChatModel
from browser_use.controller.service import Controller
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import PromptTemplate

from workflow_use.controller.utils import get_best_element_handle, truncate_selector
from workflow_use.controller.views import (
@@ -19,7 +20,7 @@

logger = logging.getLogger(__name__)

DEFAULT_ACTION_TIMEOUT_MS = 1000
DEFAULT_ACTION_TIMEOUT_MS = 2500

# List of default actions from browser_use.controller.service.Controller to disable
# todo: come up with a better way to filter out the actions (filter IN the actions would be much nicer in this case)
@@ -65,9 +66,7 @@ async def navigation(params: NavigationAction, browser_session: Browser) -> ActionResult:
"""Navigate to the given URL."""
page = await browser_session.get_current_page()
await page.goto(params.url)
# Wait for page to load (CDP navigate doesn't wait automatically)
import asyncio
await asyncio.sleep(2)
await page.wait_for_load_state()

msg = f'🔗 Navigated to URL: {params.url}'
logger.info(msg)
@@ -84,16 +83,101 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browser) -> ActionResult:
page = await browser_session.get_current_page()
original_selector = params.cssSelector

# If frameUrl or frameIdPath are provided, narrow the search to that frame
def _select_context(pg):
try:
from playwright.async_api import Page, Frame
Collaborator:
We have removed the playwright dependency in the Browser Use library, so we would love to not use playwright here if possible. Can you do it by using Browser Use actions?

ctx: Page | Frame = pg
# If frame hints point to top document, stay on page
fid = getattr(params, 'frameIdPath', None)
furl = getattr(params, 'frameUrl', None)
curr_url = (pg.url or '').split('#')[0] if hasattr(pg, 'url') else ''
if furl and curr_url and furl.split('#')[0] == curr_url:
return pg
if fid:
segs = [s for s in str(fid).split('.') if s != '']
if all(s == '0' for s in segs):
return pg
f = pg.main_frame
for s in segs[1:]: # skip top marker
idx = int(s)
if 0 <= idx < len(f.child_frames):
f = f.child_frames[idx]
else:
return pg
return f
if furl:
from urllib.parse import urlparse
pf = urlparse(furl)
# If frameUrl equals current page URL (origin+path), stay on page
try:
from urllib.parse import urlparse as _u
cu = _u(curr_url)
if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path):
return pg
except Exception:
pass
for fr in pg.frames:
try:
ff = urlparse(fr.url)
if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl):
return fr
except Exception:
continue
except Exception:
return pg
return ctx

# Fallback: search all frames for selector (prefer frames matching target origin)
async def _find_in_frames(pg, selector: str):
from urllib.parse import urlparse
prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or ''
pref_o = urlparse(prefer) if prefer else None
frames = list(pg.frames)
def score(fr):
if not pref_o:
return 0
try:
fo = urlparse(fr.url)
return 2 if (fo.scheme, fo.netloc) == (pref_o.scheme, pref_o.netloc) else 0
except Exception:
return 0
frames.sort(key=score, reverse=True)
for fr in frames:
try:
loc, used = await get_best_element_handle(fr, selector, params, timeout_ms=max(800, DEFAULT_ACTION_TIMEOUT_MS // 2))
return fr, loc, used
except Exception:
continue
return None, None, None

try:
locator, selector_used = await get_best_element_handle(
page,
params.cssSelector,
params,
timeout_ms=DEFAULT_ACTION_TIMEOUT_MS,
)
# Only auto-navigate for top-document clicks (no frame hints) when a different URL is declared
curr = (page.url or '').split('#')[0]
declared_url = (getattr(params, 'url', None) or '').split('#')[0]
has_frame_hints = bool(getattr(params, 'frameIdPath', None) or getattr(params, 'frameUrl', None))
if declared_url and declared_url.startswith('http') and not has_frame_hints and curr != declared_url:
await page.goto(declared_url)
await page.wait_for_load_state()

ctx = _select_context(page)
try:
locator, selector_used = await get_best_element_handle(
ctx,
params.cssSelector,
params,
timeout_ms=DEFAULT_ACTION_TIMEOUT_MS,
)
except Exception:
# Fallback: search all frames
fr, locator, selector_used = await _find_in_frames(page, params.cssSelector)
if locator is None:
raise

await locator.click(force=True)

msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(selector_used)} (original: {truncate_selector(original_selector)})'
used_str = selector_used if isinstance(selector_used, str) and selector_used else params.cssSelector
msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
except Exception as e:
@@ -201,7 +285,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browser) -> ActionResult:
async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult:
"""Scroll the page by the given x/y pixel offsets."""
page = await browser_session.get_current_page()
await page.evaluate(f'() => window.scrollBy({params.scrollX}, {params.scrollY})')
await page.evaluate(f'window.scrollBy({params.scrollX}, {params.scrollY});')
msg = f'📜 Scrolled page by (x={params.scrollX}, y={params.scrollY})'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
@@ -220,18 +304,19 @@ async def extract_page_content(

strip = ['a', 'img']

# Get page HTML content using CDP evaluate
html_content = await page.evaluate('() => document.documentElement.outerHTML')
content = markdownify.markdownify(html_content, strip=strip)
content = markdownify.markdownify(await page.content(), strip=strip)

# Note: iframe content extraction is not yet supported in CDP-based implementation
# TODO: Implement iframe content extraction using CDP
# manually append iframe text into the content so it's readable by the LLM (includes cross-origin iframes)
for iframe in page.frames:
if iframe.url != page.url and not iframe.url.startswith('data:'):
content += f'\n\nIFRAME {iframe.url}:\n'
content += markdownify.markdownify(await iframe.content())

prompt = f'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {params.goal}, Page: {content}'
prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
template = PromptTemplate(input_variables=['goal', 'page'], template=prompt)
try:
from browser_use.llm import UserMessage
output = await page_extraction_llm.ainvoke([UserMessage(content=prompt)])
msg = f'📄 Extracted from page\n: {output.completion}\n'
output = await page_extraction_llm.ainvoke(template.format(goal=params.goal, page=content))
msg = f'📄 Extracted from page\n: {output.content}\n'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
except Exception as e:
7 changes: 4 additions & 3 deletions workflows/workflow_use/controller/views.py
@@ -13,9 +13,8 @@ class Config:

# Mixin for shared step metadata (timestamp and tab context)
class StepMeta(_BaseExtra):
# timestamp: int
# tabId: int
pass
timestamp: int
tabId: int


# Common optional fields present in recorder events
@@ -24,6 +23,8 @@ class RecorderBase(StepMeta):
elementTag: Optional[str] = None
elementText: Optional[str] = None
frameUrl: Optional[str] = None
frameIdPath: Optional[str] = None
url: Optional[str] = None
screenshot: Optional[str] = None


37 changes: 32 additions & 5 deletions workflows/workflow_use/recorder/service.py
@@ -7,6 +7,7 @@
from browser_use import Browser
from browser_use.browser.profile import BrowserProfile
from fastapi import FastAPI
from patchright.async_api import async_playwright as patchright_async_playwright

# Assuming views.py is correctly located for this import path
from workflow_use.recorder.views import (
@@ -84,7 +85,32 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str):
async with self.final_workflow_processed_lock:
if not self.final_workflow_processed_flag and self.last_workflow_update_event:
print(f'[Service] Capturing final workflow (Trigger: {trigger_reason}).')
self.final_workflow_output = self.last_workflow_update_event.payload
wf = self.last_workflow_update_event.payload
# Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations
try:
clean_steps = []
for s in wf.steps:
st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None)
url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None)
if st == 'navigation':
if not url or url == 'about:blank':
continue
from urllib.parse import urlparse
host = urlparse(url).hostname or ''
blocked = any(
pat in host for pat in (
'doubleclick.net', 'googlesyndication.com', 'googleadservices.com',
'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net',
'googletagmanager.com', 'indexww.com', 'adtrafficquality.google'
)
)
Comment on lines +104 to +110

Copilot AI (Oct 25, 2025):
The substring matching approach (e.g., 'doubleclick.net' in host) could incorrectly match legitimate domains like mydoubleclick.net.example.com. Consider using exact domain matching or suffix matching instead.

Collaborator:
Can we make this suffix matching?

if blocked:
continue
clean_steps.append(s)
wf.steps = clean_steps
except Exception as e:
print(f'[Service] Backend filter failed: {e}')
self.final_workflow_output = wf
self.final_workflow_processed_flag = True
processed_this_call = True

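Following up on the suffix-matching suggestion in the review thread above, a minimal sketch of that check (hypothetical helper name is_blocked_host; same blocklist and urlparse-based hostname as in this hunk) could look like:

from urllib.parse import urlparse

BLOCKED_SUFFIXES = (
    'doubleclick.net', 'googlesyndication.com', 'googleadservices.com',
    'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net',
    'googletagmanager.com', 'indexww.com', 'adtrafficquality.google',
)

def is_blocked_host(url: str) -> bool:
    """Return True if the URL's hostname is a blocked domain or one of its subdomains."""
    host = (urlparse(url).hostname or '').lower()
    # Suffix match anchored on a dot boundary: 'ads.doubleclick.net' is blocked,
    # but 'mydoubleclick.net.example.com' is not.
    return any(host == s or host.endswith('.' + s) for s in BLOCKED_SUFFIXES)

Anchoring the endswith check with a leading dot keeps the match at the end of the hostname, which addresses the false-positive case Copilot raised.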
@@ -96,7 +122,7 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str):
if trigger_reason == 'RecordingStoppedEvent' and self.browser:
print('[Service] Attempting to close browser due to RecordingStoppedEvent...')
try:
await self.browser.stop()
await self.browser.close()
print('[Service] Browser close command issued.')
except Exception as e_close:
print(f'[Service] Error closing browser on recording stop: {e_close}')
@@ -127,7 +153,8 @@ async def _launch_browser_and_wait(self):
)

# Create and configure browser
self.browser = Browser(browser_profile=profile)
playwright = await patchright_async_playwright().start()
self.browser = Browser(browser_profile=profile, playwright=playwright)

print('[Service] Starting browser with extensions...')
await self.browser.start()
@@ -150,7 +177,7 @@ async def _launch_browser_and_wait(self):
print('[Service] Browser task cancelled.')
if self.browser:
try:
await self.browser.stop()
await self.browser.close()
except:
pass # Best effort
raise # Re-raise to be caught by gather
@@ -218,7 +245,7 @@ async def capture_workflow(self) -> Optional[WorkflowDefinitionSchema]:
print('[Service] Ensuring browser is closed in cleanup...')
try:
self.browser.browser_profile.keep_alive = False
await self.browser.stop()
await self.browser.close()
except Exception as e_browser_close:
print(f'[Service] Error closing browser in final cleanup: {e_browser_close}')
# self.browser = None