Commit 596a311

fix: Improve error handling for RobotsTxtFile.load (#1524)
### Description

- This PR adds error handling for `RobotsTxtFile.load`. This prevents crawler failures related to network errors, DNS errors for non-existent domains (e.g., `https://placeholder.com/`), or unexpected data formats received from the /robots.txt page (e.g., https://avatars.githubusercontent.com/robots.txt).
1 parent a1fd20e commit 596a311
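For context, a minimal sketch of what the new behavior means for a caller of `RobotsTxtFile.load`. This snippet is not part of the diff: it assumes crawlee's `HttpxHttpClient` and the pre-existing `is_allowed` helper on `RobotsTxtFile`, and the unreachable domain is purely illustrative.

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    http_client = HttpxHttpClient()

    # Previously a DNS failure (or an unexpected response body) raised out of
    # send_request()/read() and took the whole crawl down. With this change,
    # load() logs a warning and falls back to an allow-all robots.txt instead.
    robots = await RobotsTxtFile.load('https://budplaceholder.com/robots.txt', http_client)

    # The fallback treats every URL on that host as crawlable.
    print(robots.is_allowed('https://budplaceholder.com/any/page'))  # True


asyncio.run(main())
```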

File tree

6 files changed (+141, -8 lines):

- src/crawlee/_utils/robots.py
- tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
- tests/unit/crawlers/_parsel/test_parsel_crawler.py
- tests/unit/crawlers/_playwright/test_playwright_crawler.py
- tests/unit/server.py
- tests/unit/server_endpoints.py

src/crawlee/_utils/robots.py (17 additions & 5 deletions)

```diff
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@
     from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
```

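To make the fallback concrete: the `except` branch parses a permissive stub with Protego, the same parser the module already uses, so a failed robots.txt fetch degrades to "everything allowed" for that host instead of aborting the crawl. A small sketch of that behavior, independent of crawlee:

```python
from protego import Protego

# The allow-all stub that load() now falls back to when fetching or decoding
# robots.txt fails for any reason.
fallback = Protego.parse('User-agent: *\nAllow: /')

# Every URL is reported as fetchable for any user agent, so the crawler keeps
# going as if the host imposed no robots.txt restrictions.
print(fallback.can_fetch('https://budplaceholder.com/some/page', '*'))  # True
```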
tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py (35 additions & 1 deletion)

```diff
@@ -6,7 +6,7 @@
 import pytest
 
 from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
-from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storages import RequestQueue
 
 if TYPE_CHECKING:
@@ -167,6 +167,40 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     }
 
 
+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = BeautifulSoupCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # Email must be skipped
+    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # The budplaceholder.com does not exist.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
     crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
     skip = mock.Mock()
```

tests/unit/crawlers/_parsel/test_parsel_crawler.py (35 additions & 1 deletion)

```diff
@@ -14,7 +14,7 @@
     from yarl import URL
 
     from crawlee._request import RequestOptions
-    from crawlee.crawlers import ParselCrawlingContext
+    from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext
     from crawlee.http_clients._base import HttpClient
 
 
@@ -261,6 +261,40 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     }
 
 
+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = ParselCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # Email must be skipped
+    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # The budplaceholder.com does not exist.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
     crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
     skip = mock.Mock()
```

tests/unit/crawlers/_playwright/test_playwright_crawler.py (34 additions & 1 deletion)

```diff
@@ -48,7 +48,7 @@
     from crawlee._request import RequestOptions
     from crawlee._types import HttpMethod, HttpPayload
     from crawlee.browsers._types import BrowserType
-    from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
+    from crawlee.crawlers import BasicCrawlingContext, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
 
 
 @pytest.mark.parametrize(
@@ -671,6 +671,39 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     }
 
 
+async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
+    """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = PlaywrightCrawler(
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # Email must be skipped
+    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # The budplaceholder.com does not exist.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL) -> None:
     crawler = PlaywrightCrawler(respect_robots_txt_file=True)
     skip = mock.Mock()
```

tests/unit/server.py (10 additions & 0 deletions)

```diff
@@ -18,6 +18,7 @@
     GENERIC_RESPONSE,
     HELLO_WORLD,
     INCAPSULA,
+    PROBLEMATIC_LINKS,
     ROBOTS_TXT,
     SECONDARY_INDEX,
     START_ENQUEUE,
@@ -102,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
         'page_1': generic_response_endpoint,
         'page_2': generic_response_endpoint,
         'page_3': generic_response_endpoint,
+        'problematic_links': problematic_links_endpoint,
         'set_cookies': set_cookies,
         'set_complex_cookies': set_complex_cookies,
         'cookies': get_cookies,
@@ -287,6 +289,14 @@ async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, s
     )
 
 
+async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+    """Handle requests with a page containing problematic links."""
+    await send_html_response(
+        send,
+        PROBLEMATIC_LINKS,
+    )
+
+
 async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
     """Handle requests that should redirect to a specified full URL."""
     query_params = get_query_params(scope.get('query_string', b''))
```

tests/unit/server_endpoints.py (10 additions & 0 deletions)

```diff
@@ -35,6 +35,16 @@
 </iframe>
 </body></html>"""
 
+PROBLEMATIC_LINKS = b"""\
+<html><head>
+<title>Hello</title>
+</head>
+<body>
+<a href="https://budplaceholder.com/">Placeholder</a>
+
+<a href=https://avatars.githubusercontent.com/apify>Apify avatar/a>
+</body></html>"""
+
 GENERIC_RESPONSE = b"""\
 <html><head>
 <title>Hello</title>
```

0 commit comments