feat: Support Prohibited Domains (#3115)

maticzav · web-flow · commit 122feb91f41f · 2025-09-14T19:01:26.000-07:00
This PR adds support for a list of prohibited domains.
    
<!-- This is an auto-generated description by cubic. -->
---

## Summary by cubic
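Adds a `prohibited_domains` option to `BrowserProfile` that lets the
SecurityWatchdog block navigation to specific domains and URL patterns.
When an allowlist (`allowed_domains`) is configured it takes precedence and
the prohibited list is not consulted; internal new-tab/blank pages always
remain allowed.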

- New Features
  - BrowserProfile: new `prohibited_domains` list.
  - SecurityWatchdog: uses the allowlist if present; otherwise applies the
    prohibited list; defaults to allow when neither is set.
  - Pattern support: exact domains (case-insensitive; a root domain also
    matches its `www.` subdomain), wildcard subdomains (`*.domain.com`
    matches the domain itself and its subdomains, for http/https only), and
    full URL/prefix patterns (e.g., `https://host`, `brave://*`). Credentials
    in the URL are ignored when matching the host. `about:blank` and Chrome
    new-tab pages are always allowed.

- Refactors
  - Extracted `_is_url_match` for shared pattern matching.
  - Added tests for precedence (allowlist over prohibited list), wildcard and
    scheme rules, internal URL exceptions, and credential edge cases.

<!-- End of auto-generated description by cubic. -->
diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py
@@ -559,6 +559,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
 		default=None,
 		description='List of allowed domains for navigation e.g. ["*.google.com", "https://example.com", "chrome-extension://*"]',
 	)
+	prohibited_domains: list[str] | None = Field(
+		default=None,
+		description='List of prohibited domains for navigation e.g. ["*.google.com", "https://example.com", "chrome-extension://*"]. Allowed domains take precedence over prohibited domains.',
+	)
 	keep_alive: bool | None = Field(default=None, description='Keep browser alive after agent run.')
 
 	# --- Proxy settings ---
diff --git a/browser_use/browser/watchdogs/security_watchdog.py b/browser_use/browser/watchdogs/security_watchdog.py
@@ -127,8 +127,12 @@ def _is_url_allowed(self, url: str) -> bool:
 		Returns:
 			True if the URL is allowed, False otherwise
 		"""
+
 		# If no allowed_domains specified, allow all URLs
-		if not self.browser_session.browser_profile.allowed_domains:
+		if (
+			not self.browser_session.browser_profile.allowed_domains
+			and not self.browser_session.browser_profile.prohibited_domains
+		):
 			return True
 
 		# Always allow internal browser targets
@@ -149,48 +153,67 @@ def _is_url_allowed(self, url: str) -> bool:
 		if not host:
 			return False
 
-		# Full URL for matching (scheme + host)
-		full_url_pattern = f'{parsed.scheme}://{host}'
-
 		# Check each allowed domain pattern
-		for pattern in self.browser_session.browser_profile.allowed_domains:
-			# Handle glob patterns
-			if '*' in pattern:
-				self._log_glob_warning()
-				import fnmatch
-
-				# Check if pattern matches the host
-				if pattern.startswith('*.'):
-					# Pattern like *.example.com should match subdomains and main domain
-					domain_part = pattern[2:]  # Remove *.
-					if host == domain_part or host.endswith('.' + domain_part):
-						# Only match http/https URLs for domain-only patterns
-						if parsed.scheme in ['http', 'https']:
-							return True
-				elif pattern.endswith('/*'):
-					# Pattern like brave://* should match any brave:// URL
-					prefix = pattern[:-1]  # Remove the * at the end
-					if url.startswith(prefix):
-						return True
-				else:
-					# Use fnmatch for other glob patterns
-					if fnmatch.fnmatch(
-						full_url_pattern if '://' in pattern else host,
-						pattern,
-					):
+		if self.browser_session.browser_profile.allowed_domains:
+			for pattern in self.browser_session.browser_profile.allowed_domains:
+				if self._is_url_match(url, host, parsed.scheme, pattern):
+					return True
+
+			return False
+
+		# Check each prohibited domain pattern
+		if self.browser_session.browser_profile.prohibited_domains:
+			for pattern in self.browser_session.browser_profile.prohibited_domains:
+				if self._is_url_match(url, host, parsed.scheme, pattern):
+					return False
+
+			return True
+
+		return True
+
+	def _is_url_match(self, url: str, host: str, scheme: str, pattern: str) -> bool:
+		"""Check if a URL matches a pattern."""
+
+		# Full URL for matching (scheme + host)
+		full_url_pattern = f'{scheme}://{host}'
+
+		# Handle glob patterns
+		if '*' in pattern:
+			self._log_glob_warning()
+			import fnmatch
+
+			# Check if pattern matches the host
+			if pattern.startswith('*.'):
+				# Pattern like *.example.com should match subdomains and main domain
+				domain_part = pattern[2:]  # Remove *.
+				if host == domain_part or host.endswith('.' + domain_part):
+					# Only match http/https URLs for domain-only patterns
+					if scheme in ['http', 'https']:
 						return True
+			elif pattern.endswith('/*'):
+				# Pattern like brave://* should match any brave:// URL
+				prefix = pattern[:-1]  # Remove the * at the end
+				if url.startswith(prefix):
+					return True
 			else:
-				# Exact match
-				if '://' in pattern:
-					# Full URL pattern
-					if url.startswith(pattern):
-						return True
-				else:
-					# Domain-only pattern (case-insensitive comparison)
-					if host.lower() == pattern.lower():
-						return True
-					# If pattern is a root domain, also check www subdomain
-					if self._is_root_domain(pattern) and host.lower() == f'www.{pattern.lower()}':
-						return True
+				# Use fnmatch for other glob patterns
+				if fnmatch.fnmatch(
+					full_url_pattern if '://' in pattern else host,
+					pattern,
+				):
+					return True
+		else:
+			# Exact match
+			if '://' in pattern:
+				# Full URL pattern
+				if url.startswith(pattern):
+					return True
+			else:
+				# Domain-only pattern (case-insensitive comparison)
+				if host.lower() == pattern.lower():
+					return True
+				# If pattern is a root domain, also check www subdomain
+				if self._is_root_domain(pattern) and host.lower() == f'www.{pattern.lower()}':
+					return True
 
 		return False
diff --git a/tests/ci/test_browser_watchdog_security2.py b/tests/ci/test_browser_watchdog_security2.py
@@ -287,3 +287,149 @@ def test_is_root_domain_helper(self):
 		# Invalid domains - should return False
 		assert watchdog._is_root_domain('example') is False
 		assert watchdog._is_root_domain('') is False
+
+
+class TestUrlProhibitlistSecurity:
+	"""Tests for URL prohibitlist (blocked domains) behavior and matching semantics."""
+
+	def test_simple_prohibited_domains(self):
+		"""Domain-only patterns block exact host and www, but not other subdomains."""
+		from bubus import EventBus
+
+		from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
+
+		browser_profile = BrowserProfile(prohibited_domains=['example.com', 'test.org'], headless=True, user_data_dir=None)
+		browser_session = BrowserSession(browser_profile=browser_profile)
+		event_bus = EventBus()
+		watchdog = SecurityWatchdog(browser_session=browser_session, event_bus=event_bus)
+
+		# Block exact and www
+		assert watchdog._is_url_allowed('https://example.com') is False
+		assert watchdog._is_url_allowed('https://www.example.com') is False
+		assert watchdog._is_url_allowed('https://test.org') is False
+		assert watchdog._is_url_allowed('https://www.test.org') is False
+
+		# Allow other subdomains when only root is prohibited
+		assert watchdog._is_url_allowed('https://mail.example.com') is True
+		assert watchdog._is_url_allowed('https://api.test.org') is True
+
+		# Allow unrelated domains
+		assert watchdog._is_url_allowed('https://notexample.com') is True
+
+	def test_glob_pattern_prohibited(self):
+		"""Wildcard patterns block subdomains and main domain for http/https only."""
+		from bubus import EventBus
+
+		from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
+
+		browser_profile = BrowserProfile(prohibited_domains=['*.example.com'], headless=True, user_data_dir=None)
+		browser_session = BrowserSession(browser_profile=browser_profile)
+		event_bus = EventBus()
+		watchdog = SecurityWatchdog(browser_session=browser_session, event_bus=event_bus)
+
+		# Block subdomains and main domain
+		assert watchdog._is_url_allowed('https://example.com') is False
+		assert watchdog._is_url_allowed('https://www.example.com') is False
+		assert watchdog._is_url_allowed('https://mail.example.com') is False
+
+		# Allow other domains
+		assert watchdog._is_url_allowed('https://notexample.com') is True
+
+		# Wildcard with domain-only should not apply to non-http(s)
+		assert watchdog._is_url_allowed('chrome://abc.example.com') is True
+
+	def test_full_url_prohibited_patterns(self):
+		"""Full URL patterns block only matching scheme/host/prefix."""
+		from bubus import EventBus
+
+		from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
+
+		browser_profile = BrowserProfile(prohibited_domains=['https://wiki.org', 'brave://*'], headless=True, user_data_dir=None)
+		browser_session = BrowserSession(browser_profile=browser_profile)
+		event_bus = EventBus()
+		watchdog = SecurityWatchdog(browser_session=browser_session, event_bus=event_bus)
+
+		# Scheme-specific blocking
+		assert watchdog._is_url_allowed('http://wiki.org') is True
+		assert watchdog._is_url_allowed('https://wiki.org') is False
+		assert watchdog._is_url_allowed('https://wiki.org/path') is False
+
+		# Internal URL prefix blocking
+		assert watchdog._is_url_allowed('brave://anything/') is False
+		assert watchdog._is_url_allowed('chrome://settings') is True
+
+	def test_internal_urls_allowed_even_when_prohibited(self):
+		"""Internal new-tab/blank URLs are always allowed regardless of prohibited list."""
+		from bubus import EventBus
+
+		from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
+
+		browser_profile = BrowserProfile(prohibited_domains=['*'], headless=True, user_data_dir=None)
+		browser_session = BrowserSession(browser_profile=browser_profile)
+		event_bus = EventBus()
+		watchdog = SecurityWatchdog(browser_session=browser_session, event_bus=event_bus)
+
+		assert watchdog._is_url_allowed('about:blank') is True
+		assert watchdog._is_url_allowed('chrome://new-tab-page/') is True
+		assert watchdog._is_url_allowed('chrome://new-tab-page') is True
+		assert watchdog._is_url_allowed('chrome://newtab/') is True
+
+	def test_prohibited_ignored_when_allowlist_present(self):
+		"""When allowlist is set, prohibited list is ignored by design."""
+		from bubus import EventBus
+
+		from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
+
+		browser_profile = BrowserProfile(
+			allowed_domains=['*.example.com'],
+			prohibited_domains=['https://example.com'],
+			headless=True,
+			user_data_dir=None,
+		)
+		browser_session = BrowserSession(browser_profile=browser_profile)
+		event_bus = EventBus()
+		watchdog = SecurityWatchdog(browser_session=browser_session, event_bus=event_bus)
+
+		# Allowed by allowlist even though exact URL is in prohibited list
+		assert watchdog._is_url_allowed('https://example.com') is True
+		assert watchdog._is_url_allowed('https://www.example.com') is True
+
+		# Subdomain matched by the wildcard allowlist => allowed (prohibited list is not consulted in this mode)
+		assert watchdog._is_url_allowed('https://api.example.com') is True  # wildcard allowlist includes this
+		# A domain outside the allowlist should be blocked
+		assert watchdog._is_url_allowed('https://notexample.com') is False
+
+	def test_auth_credentials_do_not_cause_false_block(self):
+		"""Credentials injection with prohibited domain in username should not block unrelated hosts."""
+		from bubus import EventBus
+
+		from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
+
+		browser_profile = BrowserProfile(prohibited_domains=['example.com'], headless=True, user_data_dir=None)
+		browser_session = BrowserSession(browser_profile=browser_profile)
+		event_bus = EventBus()
+		watchdog = SecurityWatchdog(browser_session=browser_session, event_bus=event_bus)
+
+		# Host is malicious.com, should not be blocked just because username contains example.com
+		assert watchdog._is_url_allowed('https://example.com:password@malicious.com') is True
+		assert watchdog._is_url_allowed('https://example.com@malicious.com') is True
+		assert watchdog._is_url_allowed('https://example.com%20@malicious.com') is True
+		assert watchdog._is_url_allowed('https://example.com%3A@malicious.com') is True
+
+		# Legitimate credentials to a prohibited host should be blocked
+		assert watchdog._is_url_allowed('https://user:password@example.com') is False
+
+	def test_case_insensitive_prohibited_domains(self):
+		"""Prohibited domain matching should be case-insensitive."""
+		from bubus import EventBus
+
+		from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
+
+		browser_profile = BrowserProfile(prohibited_domains=['Example.COM'], headless=True, user_data_dir=None)
+		browser_session = BrowserSession(browser_profile=browser_profile)
+		event_bus = EventBus()
+		watchdog = SecurityWatchdog(browser_session=browser_session, event_bus=event_bus)
+
+		assert watchdog._is_url_allowed('https://example.com') is False
+		assert watchdog._is_url_allowed('https://WWW.EXAMPLE.COM') is False
+		assert watchdog._is_url_allowed('https://mail.example.com') is True