Skip to content

Commit 296f199

Browse files
committed
Handle the case when error_handler returns a Request
1 parent 131f1f0 commit 296f199

File tree

2 files changed

+46
-3
lines changed

2 files changed

+46
-3
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,8 +1135,17 @@ async def _handle_request_retries(
11351135
except Exception as e:
11361136
raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
11371137
else:
1138-
if new_request is not None:
1139-
request = new_request
1138+
if new_request is not None and new_request != request:
1139+
await request_manager.add_request(new_request)
1140+
await wait_for(
1141+
lambda: request_manager.mark_request_as_handled(request),
1142+
timeout=self._internal_timeout,
1143+
timeout_message='Marking request as handled timed out after '
1144+
f'{self._internal_timeout.total_seconds()} seconds',
1145+
logger=self._logger,
1146+
max_retries=3,
1147+
)
1148+
return
11401149

11411150
await request_manager.reclaim_request(request)
11421151
else:

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import pytest
1919

2020
from crawlee import ConcurrencySettings, Glob, service_locator
21-
from crawlee._request import Request
21+
from crawlee._request import Request, RequestState
2222
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
2323
from crawlee._utils.robots import RobotsTxtFile
2424
from crawlee.configuration import Configuration
@@ -1768,3 +1768,37 @@ async def handler(_: BasicCrawlingContext) -> None:
17681768

17691769
# Wait for crawler to finish
17701770
await crawler_task
1771+
1772+
1773+
async def test_new_request_error_handler() -> None:
1774+
"""Test that error in new_request_handler is handled properly."""
1775+
queue = await RequestQueue.open()
1776+
crawler = BasicCrawler(
1777+
request_manager=queue,
1778+
)
1779+
1780+
request = Request.from_url('https://a.placeholder.com')
1781+
1782+
@crawler.router.default_handler
1783+
async def handler(context: BasicCrawlingContext) -> None:
1784+
if '|test' in context.request.unique_key:
1785+
return
1786+
raise ValueError('This error should not be handled by error handler')
1787+
1788+
@crawler.error_handler
1789+
async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request | None:
1790+
return Request.from_url(
1791+
context.request.url,
1792+
unique_key=f'{context.request.unique_key}|test',
1793+
)
1794+
1795+
await crawler.run([request])
1796+
1797+
check_original_request = await queue.get_request(request.unique_key)
1798+
check_error_request = await queue.get_request(f'{request.unique_key}|test')
1799+
1800+
assert check_original_request is not None
1801+
assert check_original_request.state == RequestState.ERROR_HANDLER
1802+
1803+
assert check_error_request is not None
1804+
assert check_error_request.state == RequestState.REQUEST_HANDLER

0 commit comments

Comments (0)