Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1135,8 +1135,17 @@ async def _handle_request_retries(
except Exception as e:
raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
else:
if new_request is not None:
request = new_request
if new_request is not None and new_request != request:
await request_manager.add_request(new_request)
await wait_for(
lambda: request_manager.mark_request_as_handled(request),
timeout=self._internal_timeout,
timeout_message='Marking request as handled timed out after '
f'{self._internal_timeout.total_seconds()} seconds',
logger=self._logger,
max_retries=3,
)
Copy link
Collaborator

@Pijukatel Pijukatel Dec 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now this is repeated six times (in some variants) in BasicCrawler. I would consider creating a utility function for it.

async def some_very_good_name(self, request: Request)->None:
            """Mark *request* as handled, waiting at most the crawler's internal timeout.

            Delegates to the request manager's ``mark_request_as_handled`` and wraps the
            call in ``wait_for`` with ``max_retries=3`` — presumably each attempt is
            re-tried on timeout (confirm against ``wait_for``'s contract).
            """
            request_manager = await self.get_request_manager()
            await wait_for(
                lambda: request_manager.mark_request_as_handled(request),
                timeout=self._internal_timeout,
                timeout_message='Marking request as handled timed out after '
                f'{self._internal_timeout.total_seconds()} seconds',
                logger=self._logger,
                max_retries=3,
            )

(No need to worry about `request_manager = await self.get_request_manager()`; it will resolve to `self._request_manager`.)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO we can just use a private method: `async def _mark_request_as_handled(self, request: Request) -> None`.

return

await request_manager.reclaim_request(request)
else:
Expand Down
36 changes: 35 additions & 1 deletion tests/unit/crawlers/_basic/test_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pytest

from crawlee import ConcurrencySettings, Glob, service_locator
from crawlee._request import Request
from crawlee._request import Request, RequestState
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
from crawlee._utils.robots import RobotsTxtFile
from crawlee.configuration import Configuration
Expand Down Expand Up @@ -1768,3 +1768,37 @@ async def handler(_: BasicCrawlingContext) -> None:

# Wait for crawler to finish
await crawler_task


async def test_new_request_error_handler() -> None:
    """Verify that a replacement request returned by the error handler is enqueued and handled.

    The default handler fails for the original request; the error handler then emits a
    new request (same URL, suffixed unique key) instead of retrying the original. After
    the run, the original must end in ERROR_HANDLER state and the replacement in
    REQUEST_HANDLER state.
    """
    queue = await RequestQueue.open()
    crawler = BasicCrawler(request_manager=queue)

    original = Request.from_url('https://a.placeholder.com')

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Only the replacement request (marked by the '|test' suffix) succeeds.
        if '|test' in context.request.unique_key:
            return
        raise ValueError('This error should not be handled by error handler')

    @crawler.error_handler
    async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request | None:
        # Produce a distinct request so the crawler enqueues it rather than retrying.
        return Request.from_url(
            context.request.url,
            unique_key=f'{context.request.unique_key}|test',
        )

    await crawler.run([original])

    stored_original = await queue.get_request(original.unique_key)
    stored_replacement = await queue.get_request(f'{original.unique_key}|test')

    assert stored_original is not None
    assert stored_original.state == RequestState.ERROR_HANDLER

    assert stored_replacement is not None
    assert stored_replacement.state == RequestState.REQUEST_HANDLER
Loading