Commit 596a311

fix: Improve error handling for RobotsTxtFile.load (#1524)
### Description

- This PR adds error handling for `RobotsTxtFile.load`. This prevents crawler failures related to network errors, DNS errors for non-existent domains (e.g., `https://placeholder.com/`), or unexpected data formats received from the /robots.txt page (e.g., https://avatars.githubusercontent.com/robots.txt).
1 parent a1fd20e commit 596a311
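For context, a minimal sketch of what the new behavior means for a caller of `RobotsTxtFile.load`. This snippet is not part of the diff: it assumes crawlee's `HttpxHttpClient` and the pre-existing `is_allowed` helper on `RobotsTxtFile`, and the unreachable domain is purely illustrative.

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    http_client = HttpxHttpClient()

    # Previously a DNS failure (or an unexpected response body) raised out of
    # send_request()/read() and took the whole crawl down. With this change,
    # load() logs a warning and falls back to an allow-all robots.txt instead.
    robots = await RobotsTxtFile.load('https://budplaceholder.com/robots.txt', http_client)

    # The fallback treats every URL on that host as crawlable.
    print(robots.is_allowed('https://budplaceholder.com/any/page'))  # True


asyncio.run(main())
```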

File tree

6 files changed (+141, -8 lines):

- src/crawlee/_utils/robots.py
- tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
- tests/unit/crawlers/_parsel/test_parsel_crawler.py
- tests/unit/crawlers/_playwright/test_playwright_crawler.py
- tests/unit/server.py
- tests/unit/server_endpoints.py

src/crawlee/_utils/robots.py (17 additions & 5 deletions)

```diff
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@
     from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
```

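To make the fallback concrete: the `except` branch parses a permissive stub with Protego, the same parser the module already uses, so a failed robots.txt fetch degrades to "everything allowed" for that host instead of aborting the crawl. A small sketch of that behavior, independent of crawlee:

```python
from protego import Protego

# The allow-all stub that load() now falls back to when fetching or decoding
# robots.txt fails for any reason.
fallback = Protego.parse('User-agent: *\nAllow: /')

# Every URL is reported as fetchable for any user agent, so the crawler keeps
# going as if the host imposed no robots.txt restrictions.
print(fallback.can_fetch('https://budplaceholder.com/some/page', '*'))  # True
```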
tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py (35 additions & 1 deletion)

```diff
@@ -6,7 +6,7 @@
 import pytest
 
 from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
-from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storages import RequestQueue
 
 if TYPE_CHECKING:
@@ -167,6 +167,40 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     }
 
 
+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = BeautifulSoupCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # Email must be skipped
+    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # The budplaceholder.com does not exist.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
     crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
     skip = mock.Mock()
```

tests/unit/crawlers/_parsel/test_parsel_crawler.py (35 additions & 1 deletion)

```diff
@@ -14,7 +14,7 @@
     from yarl import URL
 
     from crawlee._request import RequestOptions
-    from crawlee.crawlers import ParselCrawlingContext
+    from crawlee.crawlers import BasicCrawlingContext, ParselCrawlingContext
     from crawlee.http_clients._base import HttpClient
 
 
@@ -261,6 +261,40 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     }
 
 
+async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
+    """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = ParselCrawler(
+        http_client=http_client,
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # Email must be skipped
+    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # The budplaceholder.com does not exist.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
     crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
     skip = mock.Mock()
```

tests/unit/crawlers/_playwright/test_playwright_crawler.py (34 additions & 1 deletion)

```diff
@@ -48,7 +48,7 @@
     from crawlee._request import RequestOptions
     from crawlee._types import HttpMethod, HttpPayload
     from crawlee.browsers._types import BrowserType
-    from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
+    from crawlee.crawlers import BasicCrawlingContext, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
 
 
 @pytest.mark.parametrize(
@@ -671,6 +671,39 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     }
 
 
+async def test_respect_robots_txt_with_problematic_links(server_url: URL) -> None:
+    """Test checks the crawler behavior with links that may cause problems when attempting to retrieve robots.txt."""
+    visit = mock.Mock()
+    fail = mock.Mock()
+    crawler = PlaywrightCrawler(
+        respect_robots_txt_file=True,
+        max_request_retries=0,
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        visit(context.request.url)
+        await context.enqueue_links(strategy='all')
+
+    @crawler.failed_request_handler
+    async def error_handler(context: BasicCrawlingContext, _error: Exception) -> None:
+        fail(context.request.url)
+
+    await crawler.run([str(server_url / 'problematic_links')])
+
+    visited = {call[0][0] for call in visit.call_args_list}
+    failed = {call[0][0] for call in fail.call_args_list}
+
+    # Email must be skipped
+    # https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
+    assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
+
+    # The budplaceholder.com does not exist.
+    assert failed == {
+        'https://budplaceholder.com/',
+    }
+
+
 async def test_on_skipped_request(server_url: URL) -> None:
     crawler = PlaywrightCrawler(respect_robots_txt_file=True)
     skip = mock.Mock()
```

tests/unit/server.py (10 additions & 0 deletions)

```diff
@@ -18,6 +18,7 @@
     GENERIC_RESPONSE,
     HELLO_WORLD,
     INCAPSULA,
+    PROBLEMATIC_LINKS,
     ROBOTS_TXT,
     SECONDARY_INDEX,
     START_ENQUEUE,
@@ -102,6 +103,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
         'page_1': generic_response_endpoint,
         'page_2': generic_response_endpoint,
         'page_3': generic_response_endpoint,
+        'problematic_links': problematic_links_endpoint,
         'set_cookies': set_cookies,
         'set_complex_cookies': set_complex_cookies,
         'cookies': get_cookies,
@@ -287,6 +289,14 @@ async def generic_response_endpoint(_scope: dict[str, Any], _receive: Receive, s
     )
 
 
+async def problematic_links_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+    """Handle requests with a page containing problematic links."""
+    await send_html_response(
+        send,
+        PROBLEMATIC_LINKS,
+    )
+
+
 async def redirect_to_url(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
     """Handle requests that should redirect to a specified full URL."""
     query_params = get_query_params(scope.get('query_string', b''))
```

tests/unit/server_endpoints.py (10 additions & 0 deletions)

```diff
@@ -35,6 +35,16 @@
 </iframe>
 </body></html>"""
 
+PROBLEMATIC_LINKS = b"""\
+<html><head>
+<title>Hello</title>
+</head>
+<body>
+<a href="https://budplaceholder.com/">Placeholder</a>
+
+<a href=https://avatars.githubusercontent.com/apify>Apify avatar/a>
+</body></html>"""
+
 GENERIC_RESPONSE = b"""\
 <html><head>
 <title>Hello</title>
```

0 commit comments