From 7dd0f66a89e1e3ce3518e3aa3be52673fd8ef578 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Thu, 2 Oct 2025 15:38:11 +0200
Subject: [PATCH 1/8] WIP, fix failing tests

---
 src/crawlee/crawlers/_basic/_basic_crawler.py | 15 ++++-
 src/crawlee/statistics/_statistics.py         |  5 +-
 .../crawlers/_basic/test_basic_crawler.py     | 56 +++++++++++++++++++
 tests/unit/test_configuration.py              |  8 +--
 4 files changed, 75 insertions(+), 9 deletions(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index cdf7e527e2..0a97e4ae19 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -11,7 +11,7 @@
 from asyncio import CancelledError
 from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
-from datetime import timedelta
+from datetime import datetime, timedelta, timezone
 from functools import partial
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, cast
@@ -56,7 +56,7 @@
     SessionError,
     UserDefinedErrorHandlerError,
 )
-from crawlee.events._types import Event, EventCrawlerStatusData
+from crawlee.events._types import Event, EventCrawlerStatusData, EventPersistStateData
 from crawlee.http_clients import ImpitHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
@@ -440,6 +440,7 @@ def __init__(
         self._statistics = statistics or cast(
             'Statistics[TStatisticsState]',
             Statistics.with_default_state(
+                persistence_enabled=True,
                 periodic_message_logger=self._logger,
                 statistics_log_format=self._statistics_log_format,
                 log_message='Current request statistics:',
@@ -744,6 +745,16 @@ async def _run_crawler(self) -> None:
 
         await self._autoscaled_pool.run()
 
+        # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
+        if not self.statistics.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
+        run_duration = datetime.now(timezone.utc) - self.statistics.state.crawler_last_started_at
+        self._statistics.state.crawler_runtime = self.statistics.state.crawler_runtime + run_duration
+        self._service_locator.get_event_manager().emit(
+            event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False)
+        )
+        await asyncio.sleep(10)
+
     async def add_requests(
         self,
         requests: Sequence[str | Request],

diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py
index 2386986001..cc7e430348 100644
--- a/src/crawlee/statistics/_statistics.py
+++ b/src/crawlee/statistics/_statistics.py
@@ -250,8 +250,7 @@ def calculate(self) -> FinalStatistics:
         if self._instance_start is None:
             raise RuntimeError('The Statistics object is not initialized')
 
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +261,7 @@
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],

diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 09b951ca35..d1a5f98488 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import concurrent
 import json
 import logging
 import os
@@ -1643,3 +1644,58 @@ async def handler(context: BasicCrawlingContext) -> None:
 
     # Crawler should not fall back to the default storage after the purge
     assert await unrelated_rq.fetch_next_request() == unrelated_request
+
+
+async def _run_crawler(requests: list[str], storage_dir: str) -> StatisticsState:
+    """Run crawler and return its statistics state.
+
+    Must be defined like this to be picklable for ProcessPoolExecutor."""
+    service_locator.set_configuration(
+        Configuration(
+            crawlee_storage_dir=storage_dir,  # type: ignore[call-arg]
+            purge_on_start=False,
+        )
+    )
+
+    async def request_handler(context: BasicCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+    crawler = BasicCrawler(
+        request_handler=request_handler,
+        concurrency_settings=ConcurrencySettings(max_concurrency=1, desired_concurrency=1),
+    )
+
+    await crawler.run(requests)
+    return crawler.statistics.state
+
+
+def _process_run_crawler(requests: list[str], storage_dir: str) -> StatisticsState:
+    return asyncio.run(_run_crawler(requests=requests, storage_dir=storage_dir))
+
+
+async def test_crawler_statistics_persistence(tmp_path: Path) -> None:
+    """Test that crawler statistics persist and are loaded correctly.
+
+    This test simulates starting the crawler process twice, and checks that the statistics include the first run."""
+
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        # Crawl 2 requests in the first run and automatically persist the state.
+        first_run_state = executor.submit(
+            _process_run_crawler,
+            requests=['https://a.placeholder.com', 'https://b.placeholder.com'],
+            storage_dir=str(tmp_path),
+        ).result()
+        assert first_run_state.requests_finished == 2
+
+        # Crawl 1 additional request in the second run, but use the previously automatically persisted state.
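+        # The second run loads the state persisted by the first run from the same storage_dir,
+        # so the request counters continue where the first run ended.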
+        second_run_state = executor.submit(
+            _process_run_crawler, requests=['https://c.placeholder.com'], storage_dir=str(tmp_path)
+        ).result()
+        assert second_run_state.requests_finished == 3
+
+        assert first_run_state.crawler_started_at == second_run_state.crawler_started_at
+        assert first_run_state.crawler_finished_at
+        assert second_run_state.crawler_finished_at
+
+        assert first_run_state.crawler_finished_at < second_run_state.crawler_finished_at
+        assert first_run_state.crawler_runtime < second_run_state.crawler_runtime

diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py
index f89401e5be..f1bb244170 100644
--- a/tests/unit/test_configuration.py
+++ b/tests/unit/test_configuration.py
@@ -41,10 +41,10 @@ async def test_storage_not_persisted_when_disabled(tmp_path: Path, server_url: URL) -> None:
     )
     storage_client = MemoryStorageClient()
 
-    crawler = HttpCrawler(
-        configuration=configuration,
-        storage_client=storage_client,
-    )
+    service_locator.set_configuration(configuration)
+    service_locator.set_storage_client(storage_client)
+
+    crawler = HttpCrawler()
 
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:

From f3b981284ce9dc123e71392cd0e5425a14227321 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Thu, 2 Oct 2025 15:53:34 +0200
Subject: [PATCH 2/8] Add a comment to remember where I left off

---
 src/crawlee/crawlers/_basic/_basic_crawler.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 0a97e4ae19..408125458b 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -440,7 +440,7 @@ def __init__(
         self._statistics = statistics or cast(
             'Statistics[TStatisticsState]',
             Statistics.with_default_state(
-                persistence_enabled=True,
+                persistence_enabled=True,  # TODO: Why does changing this to True break unrelated tests?
                 periodic_message_logger=self._logger,
                 statistics_log_format=self._statistics_log_format,
                 log_message='Current request statistics:',
@@ -753,7 +753,6 @@ async def _run_crawler(self) -> None:
         self._service_locator.get_event_manager().emit(
             event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False)
         )
-        await asyncio.sleep(10)
 
     async def add_requests(
         self,

From 3a28d42f046bff51a311b53a029422d85ad48d8a Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Wed, 15 Oct 2025 14:14:50 +0200
Subject: [PATCH 3/8] Start `_crawler_state_rec_task` only after context
 managers are entered (to allow persistent state init)

---
 src/crawlee/crawlers/_basic/_basic_crawler.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 408125458b..65d7dfa887 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -440,7 +440,7 @@ def __init__(
         self._statistics = statistics or cast(
             'Statistics[TStatisticsState]',
             Statistics.with_default_state(
-                persistence_enabled=True,  # TODO: Why does changing this to True break unrelated tests?
+                persistence_enabled=True,
                 periodic_message_logger=self._logger,
                 statistics_log_format=self._statistics_log_format,
                 log_message='Current request statistics:',
@@ -722,8 +722,6 @@ def sigint_handler() -> None:
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()
 
-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -743,6 +741,7 @@
         for context in contexts_to_enter:
             await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
 
+        self._crawler_state_rec_task.start()
         await self._autoscaled_pool.run()
 
         # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed

From ebc350d81d12aab672ac08651d10e9daec0735e9 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Wed, 15 Oct 2025 16:25:10 +0200
Subject: [PATCH 4/8] Test Windows-related issue

---
 src/crawlee/crawlers/_basic/_basic_crawler.py    | 6 ++++--
 tests/unit/crawlers/_basic/test_basic_crawler.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 65d7dfa887..b3d2c46db8 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -690,7 +690,6 @@ def sigint_handler() -> None:
             except CancelledError:
                 pass
             finally:
-                await self._crawler_state_rec_task.stop()
                 if threading.current_thread() is threading.main_thread():
                     with suppress(NotImplementedError):
                         asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -742,7 +741,10 @@ async def _run_crawler(self) -> None:
             await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
 
         self._crawler_state_rec_task.start()
-        await self._autoscaled_pool.run()
+        try:
+            await self._autoscaled_pool.run()
+        finally:
+            await self._crawler_state_rec_task.stop()
 
         # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed

diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index d1a5f98488..9d61ecc896 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -5,6 +5,7 @@
 import concurrent
 import json
 import logging
+import multiprocessing
 import os
 import sys
 import time
@@ -1678,7 +1679,7 @@ async def test_crawler_statistics_persistence(tmp_path: Path) -> None:
 
     This test simulates starting the crawler process twice, and checks that the statistics include the first run."""
 
-    with concurrent.futures.ProcessPoolExecutor() as executor:
+    with concurrent.futures.ProcessPoolExecutor(mp_context=multiprocessing.get_context('fork')) as executor:
         # Crawl 2 requests in the first run and automatically persist the state.
         first_run_state = executor.submit(
             _process_run_crawler,
             requests=['https://a.placeholder.com', 'https://b.placeholder.com'],
             storage_dir=str(tmp_path),
         ).result()
         assert first_run_state.requests_finished == 2

From 52daca0a69c31a185dd15feac97fd1e2fee370a2 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Thu, 16 Oct 2025 11:26:08 +0200
Subject: [PATCH 5/8] Fix the test on Windows

---
 tests/unit/crawlers/_basic/test_basic_crawler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 9d61ecc896..b2b75e50f7 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -5,7 +5,6 @@
 import concurrent
 import json
 import logging
-import multiprocessing
 import os
 import sys
 import time
@@ -1679,7 +1678,7 @@ async def test_crawler_statistics_persistence(tmp_path: Path) -> None:
 
     This test simulates starting the crawler process twice, and checks that the statistics include the first run."""
 
-    with concurrent.futures.ProcessPoolExecutor(mp_context=multiprocessing.get_context('fork')) as executor:
+    with concurrent.futures.ProcessPoolExecutor() as executor:
         # Crawl 2 requests in the first run and automatically persist the state.
         first_run_state = executor.submit(
             _process_run_crawler,
             requests=['https://a.placeholder.com', 'https://b.placeholder.com'],
             storage_dir=str(tmp_path),
         ).result()
         assert first_run_state.requests_finished == 2
 
+    # Do not reuse the executor, to simulate a fresh process and avoid modified class attributes.
+    with concurrent.futures.ProcessPoolExecutor() as executor:
         # Crawl 1 additional request in the second run, but use the previously automatically persisted state.
         second_run_state = executor.submit(
             _process_run_crawler, requests=['https://c.placeholder.com'], storage_dir=str(tmp_path)
         ).result()

From 675dadf19297aa667658f4692dd7e6811cc677b5 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Fri, 17 Oct 2025 16:54:24 +0200
Subject: [PATCH 6/8] Address review comments

TODO: Figure out the reason for the stats difference in
request_total_finished_duration
---
 src/crawlee/_utils/recurring_task.py          | 15 +++++++++++++++
 src/crawlee/crawlers/_basic/_basic_crawler.py | 15 +++------------
 src/crawlee/statistics/_statistics.py         |  7 +++++--
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/crawlee/_utils/recurring_task.py b/src/crawlee/_utils/recurring_task.py
index 8ffc71d144..d0c20249e9 100644
--- a/src/crawlee/_utils/recurring_task.py
+++ b/src/crawlee/_utils/recurring_task.py
@@ -7,6 +7,9 @@
 if TYPE_CHECKING:
     from collections.abc import Callable
     from datetime import timedelta
+    from types import TracebackType
+
+    from typing_extensions import Self
 
 logger = getLogger(__name__)
@@ -26,6 +29,18 @@ def __init__(self, func: Callable, delay: timedelta) -> None:
         self.delay = delay
         self.task: asyncio.Task | None = None
 
+    async def __aenter__(self) -> Self:
+        self.start()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        await self.stop()
+
     async def _wrapper(self) -> None:
         """Continuously execute the provided function with the specified delay.

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index b3d2c46db8..92c32c7e04 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -11,7 +11,7 @@
 from asyncio import CancelledError
 from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
-from datetime import datetime, timedelta, timezone
+from datetime import timedelta
 from functools import partial
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generic, Literal, cast
@@ -740,20 +740,11 @@ async def _run_crawler(self) -> None:
         for context in contexts_to_enter:
             await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
 
-        self._crawler_state_rec_task.start()
-        try:
+        async with self._crawler_state_rec_task:
             await self._autoscaled_pool.run()
-        finally:
-            await self._crawler_state_rec_task.stop()
 
         # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
-        if not self.statistics.state.crawler_last_started_at:
-            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
-        run_duration = datetime.now(timezone.utc) - self.statistics.state.crawler_last_started_at
-        self._statistics.state.crawler_runtime = self.statistics.state.crawler_runtime + run_duration
-        self._service_locator.get_event_manager().emit(
-            event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False)
-        )
+        event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
 
     async def add_requests(
         self,

diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py
index 687a6b824d..f55d00a782 100644
--- a/src/crawlee/statistics/_statistics.py
+++ b/src/crawlee/statistics/_statistics.py
@@ -96,7 +96,7 @@ def __init__(
 
         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_kvs_factory=persist_state_kvs_factory,
@@ -187,7 +187,10 @@
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
-        self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
+        self.state.crawler_runtime += self.state.crawler_finished_at - self.state.crawler_last_started_at
 
         await self._state.teardown()

From 37f64732caab04afd0a978ff9e891573863dc167 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Mon, 20 Oct 2025 11:05:54 +0200
Subject: [PATCH 7/8] Add `_crawler_state_rec_task` to the other context
 managers

---
 src/crawlee/crawlers/_basic/_basic_crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 92c32c7e04..36402c626c 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -731,6 +731,7 @@ async def _run_crawler(self) -> None:
             self._statistics,
             self._session_pool if self._use_session_pool else None,
             self._http_client,
+            self._crawler_state_rec_task,
             *self._additional_context_managers,
         )
         if cm and getattr(cm, 'active', False) is False
@@ -740,8 +741,7 @@
         for context in contexts_to_enter:
             await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
 
-        async with self._crawler_state_rec_task:
-            await self._autoscaled_pool.run()
+        await self._autoscaled_pool.run()
 
         # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
         event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))

From 00e65ec39486ae9fdbe256deebaf760f80cf946c Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Thu, 23 Oct 2025 11:08:35 +0200
Subject: [PATCH 8/8] Persist crawler statistics to the crawler's KVS

---
 src/crawlee/crawlers/_basic/_basic_crawler.py | 26 +++++++-----
 src/crawlee/statistics/_statistics.py         |  2 +
 tests/unit/test_configuration.py              | 40 +++++++++++++++----
 3 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 36402c626c..28544ad607 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -437,15 +437,23 @@ def __init__(
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-        self._statistics = statistics or cast(
-            'Statistics[TStatisticsState]',
-            Statistics.with_default_state(
-                persistence_enabled=True,
-                periodic_message_logger=self._logger,
-                statistics_log_format=self._statistics_log_format,
-                log_message='Current request statistics:',
-            ),
-        )
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
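+                    # Persist the statistics into the crawler's own key-value store via the factory above.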
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []

diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py
index f55d00a782..68b4ff6551 100644
--- a/src/crawlee/statistics/_statistics.py
+++ b/src/crawlee/statistics/_statistics.py
@@ -130,6 +130,7 @@ def with_default_state(
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
        periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -141,6 +142,7 @@
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,

diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py
index f1bb244170..4be4309247 100644
--- a/tests/unit/test_configuration.py
+++ b/tests/unit/test_configuration.py
@@ -8,6 +8,7 @@
 from crawlee import service_locator
 from crawlee.configuration import Configuration
 from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+from crawlee.statistics import Statistics
 from crawlee.storage_clients import MemoryStorageClient
 from crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient
 
@@ -35,16 +36,41 @@ def test_global_configuration_works_reversed() -> None:
     )
 
 
-async def test_storage_not_persisted_when_disabled(tmp_path: Path, server_url: URL) -> None:
+async def test_storage_not_persisted_when_non_persistable_storage_used(tmp_path: Path, server_url: URL) -> None:
+    """Make the crawler use MemoryStorageClient, which can't persist state."""
+    service_locator.set_configuration(
+        Configuration(
+            crawlee_storage_dir=str(tmp_path),  # type: ignore[call-arg]
+        )
+    )
+    crawler = HttpCrawler(storage_client=MemoryStorageClient())
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        await context.push_data({'url': context.request.url})
+
+    await crawler.run([str(server_url)])
+
+    # Verify that no files were created in the storage directory.
+    content = list(tmp_path.iterdir())
+    assert content == [], 'Expected the storage directory to be empty, but it is not.'
+
+
+async def test_storage_persisted_with_explicit_statistics_with_persistable_storage(
+    tmp_path: Path, server_url: URL
+) -> None:
+    """Make the crawler use MemoryStorageClient, which can't persist state, but pass it explicit
+    statistics that use the global FileSystemStorageClient(), which can persist state."""
+
     configuration = Configuration(
         crawlee_storage_dir=str(tmp_path),  # type: ignore[call-arg]
     )
-    storage_client = MemoryStorageClient()
-    service_locator.set_configuration(configuration)
-    service_locator.set_storage_client(storage_client)
+    service_locator.set_configuration(configuration)
+    service_locator.set_storage_client(FileSystemStorageClient())
 
-    crawler = HttpCrawler()
+    crawler = HttpCrawler(
+        storage_client=MemoryStorageClient(), statistics=Statistics.with_default_state(persistence_enabled=True)
+    )
 
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:
@@ -52,9 +78,9 @@ async def default_handler(context: HttpCrawlingContext) -> None:
 
     await crawler.run([str(server_url)])
 
-    # Verify that no files were created in the storage directory.
+    # Verify that files were created in the storage directory.
     content = list(tmp_path.iterdir())
-    assert content == [], 'Expected the storage directory to be empty, but it is not.'
+    assert content != [], 'Expected the storage directory to contain files, but it does not.'
 
 
 async def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None:
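---

For reviewers: a minimal sketch of the behavior this series enables, outside the test
harness. It assumes the public crawlee API used in the tests above (the URL is
illustrative); with purge_on_start disabled, running the same script twice resumes the
persisted statistics rather than starting from zero.

    import asyncio

    from crawlee import service_locator
    from crawlee.configuration import Configuration
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    async def main() -> None:
        # Keep previously persisted storage so the statistics of earlier runs are loaded.
        service_locator.set_configuration(Configuration(purge_on_start=False))

        crawler = BasicCrawler()

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

        await crawler.run(['https://a.placeholder.com'])
        # After PATCH 8/8, the statistics live in the crawler's key-value store,
        # so requests_finished accumulates across runs of this script.
        print(crawler.statistics.state.requests_finished)

    asyncio.run(main())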