
Commit 002b3f8

chore: Prevent request manager change after crawler restart (#1432)
### Description

- Prevent `request_manager` change in `BasicCrawler` after crawler restart.

### Testing

- Added a unit test.
1 parent 6d3ccff commit 002b3f8
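
For context, the bug this commit fixes can be sketched as follows: when a crawler was constructed with its own storage client and then restarted with `purge_request_queue=True`, the reopened request queue silently fell back to the globally registered storage client. A minimal reproduction sketch, assuming the import paths `crawlee.crawlers` and `crawlee.storage_clients` used elsewhere in the test suite:

```python
import asyncio

from crawlee import service_locator
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient


async def main() -> None:
    # Global default storage differs from the crawler's own storage client.
    service_locator.set_storage_client(FileSystemStorageClient())
    crawler = BasicCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        context.log.info(context.request.url)

    # purge_request_queue=True drops the queue and reopens it between runs.
    # Before this fix, the reopened queue resolved its storage client from
    # the global defaults (FileSystemStorageClient here) instead of keeping
    # the crawler's MemoryStorageClient.
    for _ in (1, 2):
        await crawler.run(['https://a.placeholder.com'], purge_request_queue=True)


asyncio.run(main())
```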

File tree: 2 files changed, +27 −1 lines


src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -659,7 +659,10 @@ async def run(
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open()
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
 
         if requests is not None:
             await self.add_requests(requests)
```
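
The key point of the change: `RequestQueue.open()` called without arguments resolves its storage client and configuration from the global service locator, whereas the crawler tracks its own. A hedged sketch of the difference, assuming `RequestQueue.open()` accepts the `name`, `storage_client`, and `configuration` keyword arguments shown in the diff above:

```python
import asyncio

from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import RequestQueue


async def open_queues() -> None:
    # Falls back to whatever storage client is registered globally.
    default_rq = await RequestQueue.open()

    # Pinned to an explicit backend, independent of the global registration;
    # this mirrors what the fixed code does with the crawler's own
    # service locator values.
    pinned_rq = await RequestQueue.open(
        name='pinned',
        storage_client=MemoryStorageClient(),
        configuration=service_locator.get_configuration(),
    )


asyncio.run(open_queues())
```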

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -1620,3 +1620,26 @@ async def handler(context: BasicCrawlingContext) -> None:
     )
 
     await crawler.run(['https://start.placeholder.com'])
+
+
+async def test_crawler_purge_request_queue_uses_same_storage_client() -> None:
+    """Make sure that purge on start does not replace the storage client in the underlying storage manager."""
+
+    # Set a global storage client that differs from the one the crawler uses.
+    service_locator.set_storage_client(FileSystemStorageClient())
+    unrelated_rq = await RequestQueue.open()
+    unrelated_request = Request.from_url('https://x.placeholder.com')
+    await unrelated_rq.add_request(unrelated_request)
+
+    crawler = BasicCrawler(storage_client=MemoryStorageClient())
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        context.log.info(context.request.url)
+
+    for _ in (1, 2):
+        await crawler.run(requests=[Request.from_url('https://a.placeholder.com')], purge_request_queue=True)
+        assert crawler.statistics.state.requests_finished == 1
+
+    # Crawler should not fall back to the default storage after the purge.
+    assert await unrelated_rq.fetch_next_request() == unrelated_request
```
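
The final assertion is what ties the test together: if the purge had swapped the crawler back onto the global file-system storage, the second run would have consumed `unrelated_request` from `unrelated_rq`. For reference, the names the test relies on presumably come from imports like these (a sketch; the actual import block at the top of `test_basic_crawler.py` is not part of this diff):

```python
from crawlee import Request, service_locator
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient
from crawlee.storages import RequestQueue
```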
