fix(browser-use): Enhanced navigation strategy, element detection, and task validation

chrisschnabl · claude · chrisschnabl · commit 4066c4d026be · 2025-09-16T14:25:16.000-07:00
- Added smart navigation patterns to system prompt for better data discovery and content location - Added dynamic content recognition guidelines for infinite scroll, AJAX loading, and empty grids - Implemented new wait_for_dynamic_content action to handle dynamically loaded content - Enhanced search strategy with alternative navigation paths and premium content detection - Optimized file system usage to reduce unnecessary operations for simple data extraction tasks - Added guidance on recognizing loading states, pagination loops, and content container detection These fixes target the main failure patterns identified: - Step limit exhaustion due to inefficient navigation (EPA AQI, BBC recipes) - Incorrect results from poor element detection (Zara products, Fox Sports videos) - Over-engineered workflows for simple tasks (PlayStation store lookup) 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
@@ -81,6 +81,7 @@ Strictly follow these rules while using the browser and navigating the web:
 - If a captcha appears, attempt solving it if possible. If not, use fallback strategies (e.g., alternative site, backtrack).
 - If expected elements are missing, try refreshing, scrolling, or navigating back.
 - If the page is not fully loaded, use the wait action.
+- Use `wait_for_dynamic_content` when product grids, video lists, or other content areas that should contain data appear empty - it waits for new content to load and can scroll slightly to trigger lazy loading.
 - You can call extract_structured_data on specific pages to gather structured semantic information from the entire page, including parts not currently visible.
 - Call extract_structured_data only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>.
 - Calling the extract_structured_data tool is expensive! DO NOT query the same page with the same extract_structured_data query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool.
@@ -96,6 +97,25 @@ Strictly follow these rules while using the browser and navigating the web:
 2. Open ended tasks. Plan yourself, be creative in achieving them.
 - If you get stuck e.g. with logins or captcha in open-ended tasks you can re-evaluate the task and try alternative ways, e.g. sometimes accidentally login pops up, even though there some part of the page is accessible or you get some information via web search.
 - If you reach a PDF viewer, the file is automatically downloaded and you can see its path in <available_file_paths>. You can either read the file or scroll in the page to see more.
+
+**Smart Navigation Patterns:**
+- For data lookup tasks (e.g., AQI, statistics), look for direct data portals, maps, or "Data" menu links instead of generic search
+- For product/content searches, navigate to category-specific sections (e.g., "New Arrivals", "NBA Videos") rather than site-wide search
+- For recipes/content, check if authentication or premium access is required if search returns no results
+- When searching yields no results, try alternative navigation paths: menu categories, filter selections, or direct URL patterns
+- Recognize loading states: "Loading...", spinners, empty grids that may populate, infinite scroll indicators
+- If content appears empty, call `wait_for_dynamic_content` (it waits a few seconds and can scroll slightly to trigger dynamic loading) before concluding no content exists
+- For e-commerce/catalog sites, look for product grids, category filters, and sorting options rather than relying solely on search
+- When stuck in pagination loops, try category navigation or filters instead of continuing to paginate through search results
+- If a task requires specific data that should exist, try multiple navigation approaches: direct menu links, category browsing, filtered searches
+- Always verify you're on the correct content-displaying page before concluding data doesn't exist (e.g., data tables, product grids, video lists)
+
+**Dynamic Content Recognition:**
+- Before scrolling extensively, check if page has infinite scroll by scrolling once and waiting to see if content loads
+- Look for "Load More", "Show More", or pagination controls that might reveal additional content
+- If product grids or content lists appear empty, try interacting with category filters, sorting options, or view toggles
+- For video/media sites, check if content is behind category tabs, genre filters, or requires interaction to load
+- Recognize when you're viewing category/navigation pages vs. actual content pages - navigate deeper if needed
 </browser_rules>
 
 <file_system>
@@ -106,6 +126,8 @@ Strictly follow these rules while using the browser and navigating the web:
 - If exists, <available_file_paths> includes files you have downloaded or uploaded by the user. You can only read or upload these files but you don't have write access.
 - If the task is really long, initialize a `results.md` file to accumulate your results.
 - DO NOT use the file system if the task is less than 10 steps!
+- For simple data extraction tasks (e.g., getting product prices, release dates, single pieces of information), output results directly in the `done` action rather than creating files
+- Only save extracted content to files for complex tasks with multiple data points or when specifically requested by the user
 </file_system>
 
 <task_completion_rules>
diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py
@@ -48,6 +48,7 @@
 	StructuredOutputAction,
 	SwitchTabAction,
 	UploadFileAction,
+	WaitForDynamicContentAction,
 )
 from browser_use.utils import _log_pretty_url, time_execution_sync
 
@@ -761,6 +762,82 @@ async def scroll_to_text(text: str, browser_session: BrowserSession):  # type: i
 					long_term_memory=f"Tried scrolling to text '{text}' but it was not found",
 				)
 
+		@self.registry.action(
+			'Wait for dynamic content to load on the current page. Use when content appears empty or is loading. Optionally scroll slightly to trigger loading.',
+			param_model=WaitForDynamicContentAction,
+		)
+		async def wait_for_dynamic_content(params: WaitForDynamicContentAction, browser_session: BrowserSession):
+			import asyncio
+
+			# Get initial page state
+			try:
+				initial_state = await browser_session.get_browser_state_summary(include_screenshot=False)
+				initial_elements = len(initial_state.clickable_elements)
+
+				# Optionally trigger loading with a small scroll
+				if params.scroll_trigger:
+					try:
+						# Small scroll down and then back up to trigger loading
+						scroll_event = browser_session.event_bus.dispatch(
+							ScrollEvent(pages=0.1, down=True, node=None)
+						)
+						await scroll_event
+						await scroll_event.event_result(raise_if_any=False, raise_if_none=False)
+
+						# Wait a moment
+						await asyncio.sleep(1)
+
+						# Scroll back up
+						scroll_event = browser_session.event_bus.dispatch(
+							ScrollEvent(pages=0.1, down=False, node=None)
+						)
+						await scroll_event
+						await scroll_event.event_result(raise_if_any=False, raise_if_none=False)
+					except Exception:
+						pass  # Ignore scroll errors, just continue with waiting
+
+				# Wait for the specified time, checking periodically for new content
+				wait_time = params.timeout_seconds
+				check_interval = min(1, wait_time / 3)  # Check 3 times during wait period
+
+				for i in range(int(wait_time / check_interval)):
+					await asyncio.sleep(check_interval)
+
+					# Check if new elements appeared
+					current_state = await browser_session.get_browser_state_summary(include_screenshot=False)
+					current_elements = len(current_state.clickable_elements)
+
+					# If looking for specific pattern, check for it
+					if params.element_pattern:
+						page_text = ' '.join([elem.text for elem in current_state.clickable_elements if elem.text])
+						if params.element_pattern.lower() in page_text.lower():
+							memory = f'Found pattern "{params.element_pattern}" after {i * check_interval:.1f}s'
+							logger.info(f'⏳ {memory}')
+							return ActionResult(extracted_content=memory, long_term_memory=memory)
+
+					# Check if significant new content appeared
+					if current_elements > initial_elements + 3:  # More than 3 new elements
+						memory = f'New content loaded: {current_elements - initial_elements} new elements after {i * check_interval:.1f}s'
+						logger.info(f'⏳ {memory}')
+						return ActionResult(extracted_content=memory, long_term_memory=memory)
+
+				# Final wait period completed
+				final_state = await browser_session.get_browser_state_summary(include_screenshot=False)
+				final_elements = len(final_state.clickable_elements)
+
+				if final_elements > initial_elements:
+					memory = f'Waited {wait_time}s for dynamic content - {final_elements - initial_elements} new elements appeared'
+				else:
+					memory = f'Waited {wait_time}s for dynamic content - no significant changes detected'
+
+				logger.info(f'⏳ {memory}')
+				return ActionResult(extracted_content=memory, long_term_memory=memory)
+
+			except Exception as e:
+				error_msg = f'Failed to wait for dynamic content: {str(e)}'
+				logger.error(error_msg)
+				return ActionResult(error=error_msg)
+
 		# Dropdown Actions
 
 		@self.registry.action(
diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py
@@ -91,3 +91,9 @@ class GetDropdownOptionsAction(BaseModel):
 class SelectDropdownOptionAction(BaseModel):
 	index: int = Field(ge=1, description='index of the dropdown element to select an option for')
 	text: str = Field(description='the text or exact value of the option to select')
+
+
+# Parameters of the `wait_for_dynamic_content` action.
+class WaitForDynamicContentAction(BaseModel):
+	# Upper bound for the polling wait, in whole seconds; validated to lie in 1..10, default 5.
+	timeout_seconds: int = Field(default=5, ge=1, le=10, description='seconds to wait for dynamic content to load')
+	# When true (the default) the action scrolls slightly before it starts waiting.
+	scroll_trigger: bool = Field(default=True, description='whether to scroll slightly to trigger content loading')
+	# Optional text to look for in the page's elements; None (the default) disables the pattern check.
+	element_pattern: str | None = Field(default=None, description='optional text pattern to wait for in elements')