From ac9c95f099b8b20af7a1134ec6db239d7978534e Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 30 Sep 2025 13:02:44 +0200 Subject: [PATCH 1/3] Draft of minimizing side effects Explicit kvs to RecoverableState --- src/crawlee/_utils/recoverable_state.py | 23 ++++++++++--- .../_file_system/_request_queue_client.py | 26 ++++++++++---- src/crawlee/storages/_key_value_store.py | 10 ++---- tests/unit/conftest.py | 3 ++ .../_file_system/test_fs_rq_client.py | 9 +++-- tests/unit/storages/test_key_value_store.py | 34 ++++++++++++++++++- 6 files changed, 85 insertions(+), 20 deletions(-) diff --git a/src/crawlee/_utils/recoverable_state.py b/src/crawlee/_utils/recoverable_state.py index fc115eb403..701dac1a86 100644 --- a/src/crawlee/_utils/recoverable_state.py +++ b/src/crawlee/_utils/recoverable_state.py @@ -4,6 +4,7 @@ from pydantic import BaseModel +from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.events._types import Event, EventPersistStateData if TYPE_CHECKING: @@ -37,6 +38,7 @@ def __init__( persistence_enabled: Literal[True, False, 'explicit_only'] = False, persist_state_kvs_name: str | None = None, persist_state_kvs_id: str | None = None, + persist_state_kvs: KeyValueStore | None = None, logger: logging.Logger, ) -> None: """Initialize a new recoverable state object. @@ -51,8 +53,20 @@ def __init__( If neither a name nor and id are supplied, the default store will be used. persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence. If neither a name nor and id are supplied, the default store will be used. + persist_state_kvs: KeyValueStore to use for persistence. If not provided, a system-wide KeyValueStore will + be used, based on service locator configuration. logger: A logger instance for logging operations related to state persistence """ + raise_if_too_many_kwargs(persist_state_kvs_name=persist_state_kvs_name, + persist_state_kvs_id=persist_state_kvs_id, + key_value_store=persist_state_kvs) + if not persist_state_kvs: + logger.debug( + 'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore ' + 'based on service_locator configuration, potentially calling service_locator.set_storage_client in the ' + 'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid ' + 'global side effects.') + self._default_state = default_state self._state_type: type[TStateModel] = self._default_state.__class__ self._state: TStateModel | None = None @@ -60,7 +74,7 @@ def __init__( self._persist_state_key = persist_state_key self._persist_state_kvs_name = persist_state_kvs_name self._persist_state_kvs_id = persist_state_kvs_id - self._key_value_store: 'KeyValueStore | None' = None # noqa: UP037 + self._key_value_store: KeyValueStore | None = persist_state_kvs self._log = logger async def initialize(self) -> TStateModel: @@ -79,9 +93,10 @@ async def initialize(self) -> TStateModel: # Import here to avoid circular imports. from crawlee.storages._key_value_store import KeyValueStore # noqa: PLC0415 - self._key_value_store = await KeyValueStore.open( - name=self._persist_state_kvs_name, id=self._persist_state_kvs_id - ) + if not self._key_value_store: + self._key_value_store = await KeyValueStore.open( + name=self._persist_state_kvs_name, id=self._persist_state_kvs_id + ) await self._load_saved_state() diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 426d78d3e1..d6b0d69e80 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -92,6 +92,7 @@ def __init__( metadata: RequestQueueMetadata, path_to_rq: Path, lock: asyncio.Lock, + recoverable_state: RecoverableState[RequestQueueState], ) -> None: """Initialize a new instance. @@ -114,12 +115,7 @@ def __init__( self._is_empty_cache: bool | None = None """Cache for is_empty result: None means unknown, True/False is cached state.""" - self._state = RecoverableState[RequestQueueState]( - default_state=RequestQueueState(), - persist_state_key=f'__RQ_STATE_{self._metadata.id}', - persistence_enabled=True, - logger=logger, - ) + self._state = recoverable_state """Recoverable state to maintain request ordering, in-progress status, and handled status.""" @override @@ -136,6 +132,19 @@ def path_to_metadata(self) -> Path: """The full path to the request queue metadata file.""" return self.path_to_rq / METADATA_FILENAME + @classmethod + async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState: + from crawlee.storage_clients import FileSystemStorageClient + from crawlee.storages import KeyValueStore + kvs = await KeyValueStore.open(storage_client=FileSystemStorageClient(),configuration=configuration) + return RecoverableState[RequestQueueState]( + default_state=RequestQueueState(), + persist_state_key=f'__RQ_STATE_{id}', + persist_state_kvs=kvs, + persistence_enabled=True, + logger=logger, + ) + @classmethod async def open( cls, @@ -194,6 +203,9 @@ async def open( metadata=metadata, path_to_rq=rq_base_path / rq_dir, lock=asyncio.Lock(), + recoverable_state=await cls._create_recoverable_state(id=id, + configuration=configuration), + ) await client._state.initialize() await client._discover_existing_requests() @@ -230,6 +242,7 @@ async def open( metadata=metadata, path_to_rq=path_to_rq, lock=asyncio.Lock(), + recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration), ) await client._state.initialize() @@ -254,6 +267,7 @@ async def open( metadata=metadata, path_to_rq=path_to_rq, lock=asyncio.Lock(), + recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration), ) await client._state.initialize() await client._update_metadata() diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 3260e4f91e..cf68e483a1 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -278,13 +278,9 @@ async def get_auto_saved_value( if key in cache: return cache[key].current_value.root - cache[key] = recoverable_state = RecoverableState( - default_state=AutosavedValue(default_value), - persistence_enabled=True, - persist_state_kvs_id=self.id, - persist_state_key=key, - logger=logger, - ) + cache[key] = recoverable_state = RecoverableState(default_state=AutosavedValue(default_value), + persist_state_key=key, persistence_enabled=True, + persist_state_kvs=self, logger=logger) await recoverable_state.initialize() diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index cc2d3e8769..f3ca761dc8 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -69,6 +69,9 @@ def _prepare_test_env() -> None: # Verify that the test environment was set up correctly. assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) + # Clear global cache of autosaved values + KeyValueStore._autosaved_values = {} + return _prepare_test_env diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index dc2937a259..e3c54e438f 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -6,9 +6,9 @@ import pytest -from crawlee import Request +from crawlee import Request, service_locator from crawlee.configuration import Configuration -from crawlee.storage_clients import FileSystemStorageClient +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient if TYPE_CHECKING: from collections.abc import AsyncGenerator @@ -77,6 +77,11 @@ async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) assert 'url' in request_data assert request_data['url'].startswith('https://example.com/') +async def test_opening_rq_does_not_have_side_effect_on_service_locator( + rq_client: FileSystemRequestQueueClient # noqa: ARG001 +) -> None: + service_locator.set_storage_client(MemoryStorageClient()) + async def test_drop_removes_directory(rq_client: FileSystemRequestQueueClient) -> None: """Test that drop removes the entire RQ directory from disk.""" diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 21cdf6ad1b..1a3eb381df 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -10,13 +10,15 @@ from crawlee import service_locator from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, SqlStorageClient, StorageClient from crawlee.storages import KeyValueStore from crawlee.storages._storage_instance_manager import StorageInstanceManager if TYPE_CHECKING: from collections.abc import AsyncGenerator + from pathlib import Path + - from crawlee.storage_clients import StorageClient @pytest.fixture @@ -1063,3 +1065,33 @@ async def test_name_default_not_allowed(storage_client: StorageClient) -> None: f'it is reserved for default alias.', ): await KeyValueStore.open(name=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client) + +@pytest.mark.parametrize('tested_storage_client', [ + pytest.param(MemoryStorageClient(), id='tested=MemoryStorageClient'), + pytest.param(FileSystemStorageClient(), id='tested=FileSystemStorageClient'), + pytest.param(SqlStorageClient(), id='tested=SqlStorageClient'), + ]) +@pytest.mark.parametrize('global_storage_client', [ + pytest.param(MemoryStorageClient(), id='global=MemoryStorageClient'), + pytest.param(FileSystemStorageClient(), id='global=FileSystemStorageClient'), + pytest.param(SqlStorageClient(), id='global=SqlStorageClient'), + ]) +async def test_get_auto_saved_value_various_global_clients(tmp_path: Path, tested_storage_client: StorageClient, + global_storage_client:StorageClient) -> None: + """Ensure that persistence is working for all clients regardless of what is set in service locator.""" + service_locator.set_configuration(Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + )) + service_locator.set_storage_client(global_storage_client) + + kvs = await KeyValueStore.open(storage_client=tested_storage_client) + values_kvs = {'key': 'some_value'} + test_key = 'test_key' + + autosaved_value_kvs1 = await kvs.get_auto_saved_value(test_key) + assert autosaved_value_kvs1 == {} + autosaved_value_kvs1.update(values_kvs) + await kvs.persist_autosaved_values() + + assert await kvs.get_value(test_key) == autosaved_value_kvs1 From 537ed1ab3724be9110c3c9094a283fcd7f20bbce Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 30 Sep 2025 14:23:19 +0200 Subject: [PATCH 2/3] Use factory method istead of the explicit kvs --- src/crawlee/_utils/recoverable_state.py | 43 +++++++++------ src/crawlee/statistics/_statistics.py | 7 ++- .../_file_system/_request_queue_client.py | 18 ++++--- src/crawlee/storages/_key_value_store.py | 13 +++-- tests/unit/conftest.py | 4 +- .../test_adaptive_playwright_crawler.py | 1 - .../_file_system/test_fs_rq_client.py | 3 +- tests/unit/storages/test_key_value_store.py | 52 +++++++++++-------- 8 files changed, 88 insertions(+), 53 deletions(-) diff --git a/src/crawlee/_utils/recoverable_state.py b/src/crawlee/_utils/recoverable_state.py index 701dac1a86..9916083a72 100644 --- a/src/crawlee/_utils/recoverable_state.py +++ b/src/crawlee/_utils/recoverable_state.py @@ -9,8 +9,9 @@ if TYPE_CHECKING: import logging + from collections.abc import Callable, Coroutine - from crawlee.storages._key_value_store import KeyValueStore + from crawlee.storages import KeyValueStore TStateModel = TypeVar('TStateModel', bound=BaseModel) @@ -38,7 +39,7 @@ def __init__( persistence_enabled: Literal[True, False, 'explicit_only'] = False, persist_state_kvs_name: str | None = None, persist_state_kvs_id: str | None = None, - persist_state_kvs: KeyValueStore | None = None, + persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None, logger: logging.Logger, ) -> None: """Initialize a new recoverable state object. @@ -53,28 +54,40 @@ def __init__( If neither a name nor and id are supplied, the default store will be used. persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence. If neither a name nor and id are supplied, the default store will be used. - persist_state_kvs: KeyValueStore to use for persistence. If not provided, a system-wide KeyValueStore will - be used, based on service locator configuration. + persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If + not provided, a system-wide KeyValueStore will be used, based on service locator configuration. logger: A logger instance for logging operations related to state persistence """ - raise_if_too_many_kwargs(persist_state_kvs_name=persist_state_kvs_name, - persist_state_kvs_id=persist_state_kvs_id, - key_value_store=persist_state_kvs) - if not persist_state_kvs: + raise_if_too_many_kwargs( + persist_state_kvs_name=persist_state_kvs_name, + persist_state_kvs_id=persist_state_kvs_id, + persist_state_kvs_factory=persist_state_kvs_factory, + ) + if not persist_state_kvs_factory: logger.debug( 'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore ' 'based on service_locator configuration, potentially calling service_locator.set_storage_client in the ' 'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid ' - 'global side effects.') + 'global side effects.' + ) self._default_state = default_state self._state_type: type[TStateModel] = self._default_state.__class__ self._state: TStateModel | None = None self._persistence_enabled = persistence_enabled self._persist_state_key = persist_state_key - self._persist_state_kvs_name = persist_state_kvs_name - self._persist_state_kvs_id = persist_state_kvs_id - self._key_value_store: KeyValueStore | None = persist_state_kvs + if persist_state_kvs_factory is None: + + async def kvs_factory() -> KeyValueStore: + from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import + + return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id) + + self._persist_state_kvs_factory = kvs_factory + else: + self._persist_state_kvs_factory = persist_state_kvs_factory + + self._key_value_store: KeyValueStore | None = None self._log = logger async def initialize(self) -> TStateModel: @@ -91,12 +104,8 @@ async def initialize(self) -> TStateModel: return self.current_value # Import here to avoid circular imports. - from crawlee.storages._key_value_store import KeyValueStore # noqa: PLC0415 - if not self._key_value_store: - self._key_value_store = await KeyValueStore.open( - name=self._persist_state_kvs_name, id=self._persist_state_kvs_id - ) + self._key_value_store = await self._persist_state_kvs_factory() await self._load_saved_state() diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 2386986001..3e95932c2a 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -17,8 +17,11 @@ from crawlee.statistics._error_tracker import ErrorTracker if TYPE_CHECKING: + from collections.abc import Callable, Coroutine from types import TracebackType + from crawlee.storages import KeyValueStore + TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState) logger = getLogger(__name__) @@ -70,6 +73,7 @@ def __init__( persistence_enabled: bool | Literal['explicit_only'] = False, persist_state_kvs_name: str | None = None, persist_state_key: str | None = None, + persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None, log_message: str = 'Statistics', periodic_message_logger: Logger | None = None, log_interval: timedelta = timedelta(minutes=1), @@ -95,6 +99,7 @@ def __init__( persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}', persistence_enabled=persistence_enabled, persist_state_kvs_name=persist_state_kvs_name, + persist_state_kvs_factory=persist_state_kvs_factory, logger=logger, ) @@ -110,8 +115,8 @@ def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statist """Create near copy of the `Statistics` with replaced `state_model`.""" new_statistics: Statistics[TNewStatisticsState] = Statistics( persistence_enabled=self._state._persistence_enabled, # noqa: SLF001 - persist_state_kvs_name=self._state._persist_state_kvs_name, # noqa: SLF001 persist_state_key=self._state._persist_state_key, # noqa: SLF001 + persist_state_kvs_factory=self._state._persist_state_kvs_factory, # noqa: SLF001 log_message=self._log_message, periodic_message_logger=self._periodic_message_logger, state_model=state_model, diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index d6b0d69e80..b831a4cfb4 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -31,6 +31,7 @@ from collections.abc import Sequence from crawlee.configuration import Configuration + from crawlee.storages import KeyValueStore logger = getLogger(__name__) @@ -134,13 +135,16 @@ def path_to_metadata(self) -> Path: @classmethod async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState: - from crawlee.storage_clients import FileSystemStorageClient - from crawlee.storages import KeyValueStore - kvs = await KeyValueStore.open(storage_client=FileSystemStorageClient(),configuration=configuration) + async def kvs_factory() -> KeyValueStore: + from crawlee.storage_clients import FileSystemStorageClient # noqa: PLC0415 avoid circular import + from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import + + return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration) + return RecoverableState[RequestQueueState]( default_state=RequestQueueState(), persist_state_key=f'__RQ_STATE_{id}', - persist_state_kvs=kvs, + persist_state_kvs_factory=kvs_factory, persistence_enabled=True, logger=logger, ) @@ -203,9 +207,9 @@ async def open( metadata=metadata, path_to_rq=rq_base_path / rq_dir, lock=asyncio.Lock(), - recoverable_state=await cls._create_recoverable_state(id=id, - configuration=configuration), - + recoverable_state=await cls._create_recoverable_state( + id=id, configuration=configuration + ), ) await client._state.initialize() await client._discover_existing_requests() diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index cf68e483a1..96bac8f7b3 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -278,9 +278,16 @@ async def get_auto_saved_value( if key in cache: return cache[key].current_value.root - cache[key] = recoverable_state = RecoverableState(default_state=AutosavedValue(default_value), - persist_state_key=key, persistence_enabled=True, - persist_state_kvs=self, logger=logger) + async def kvs_factory() -> KeyValueStore: + return self + + cache[key] = recoverable_state = RecoverableState( + default_state=AutosavedValue(default_value), + persist_state_key=key, + persistence_enabled=True, + persist_state_kvs_factory=kvs_factory, + logger=logger, + ) await recoverable_state.initialize() diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f3ca761dc8..9956167931 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -17,6 +17,7 @@ from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, ImpitHttpClient from crawlee.proxy_configuration import ProxyInfo +from crawlee.statistics import Statistics from crawlee.storages import KeyValueStore from tests.unit.server import TestServer, app, serve_in_thread @@ -69,8 +70,9 @@ def _prepare_test_env() -> None: # Verify that the test environment was set up correctly. assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) - # Clear global cache of autosaved values + # Reset global class variables to ensure test isolation. KeyValueStore._autosaved_values = {} + Statistics._Statistics__next_id = 0 # type:ignore[attr-defined] # Mangled attribute return _prepare_test_env diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 5fdb621718..5a9070e9ca 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -493,7 +493,6 @@ async def test_adaptive_playwright_crawler_statistics_in_init() -> None: assert type(crawler._statistics.state) is AdaptivePlaywrightCrawlerStatisticState assert crawler._statistics._state._persistence_enabled == persistence_enabled - assert crawler._statistics._state._persist_state_kvs_name == persist_state_kvs_name assert crawler._statistics._state._persist_state_key == persist_state_key assert crawler._statistics._log_message == log_message diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index e3c54e438f..63d6678aad 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -77,8 +77,9 @@ async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) assert 'url' in request_data assert request_data['url'].startswith('https://example.com/') + async def test_opening_rq_does_not_have_side_effect_on_service_locator( - rq_client: FileSystemRequestQueueClient # noqa: ARG001 + rq_client: FileSystemRequestQueueClient, # noqa: ARG001 ) -> None: service_locator.set_storage_client(MemoryStorageClient()) diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 1a3eb381df..f02d5a72cf 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -19,8 +19,6 @@ from pathlib import Path - - @pytest.fixture async def kvs( storage_client: StorageClient, @@ -1066,32 +1064,42 @@ async def test_name_default_not_allowed(storage_client: StorageClient) -> None: ): await KeyValueStore.open(name=StorageInstanceManager._DEFAULT_STORAGE_ALIAS, storage_client=storage_client) -@pytest.mark.parametrize('tested_storage_client', [ - pytest.param(MemoryStorageClient(), id='tested=MemoryStorageClient'), - pytest.param(FileSystemStorageClient(), id='tested=FileSystemStorageClient'), - pytest.param(SqlStorageClient(), id='tested=SqlStorageClient'), - ]) -@pytest.mark.parametrize('global_storage_client', [ - pytest.param(MemoryStorageClient(), id='global=MemoryStorageClient'), - pytest.param(FileSystemStorageClient(), id='global=FileSystemStorageClient'), - pytest.param(SqlStorageClient(), id='global=SqlStorageClient'), - ]) -async def test_get_auto_saved_value_various_global_clients(tmp_path: Path, tested_storage_client: StorageClient, - global_storage_client:StorageClient) -> None: + +@pytest.mark.parametrize( + 'tested_storage_client', + [ + pytest.param(MemoryStorageClient(), id='tested=MemoryStorageClient'), + pytest.param(FileSystemStorageClient(), id='tested=FileSystemStorageClient'), + pytest.param(SqlStorageClient(), id='tested=SqlStorageClient'), + ], +) +@pytest.mark.parametrize( + 'global_storage_client', + [ + pytest.param(MemoryStorageClient(), id='global=MemoryStorageClient'), + pytest.param(FileSystemStorageClient(), id='global=FileSystemStorageClient'), + pytest.param(SqlStorageClient(), id='global=SqlStorageClient'), + ], +) +async def test_get_auto_saved_value_various_global_clients( + tmp_path: Path, tested_storage_client: StorageClient, global_storage_client: StorageClient +) -> None: """Ensure that persistence is working for all clients regardless of what is set in service locator.""" - service_locator.set_configuration(Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - purge_on_start=True, - )) + service_locator.set_configuration( + Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + ) service_locator.set_storage_client(global_storage_client) kvs = await KeyValueStore.open(storage_client=tested_storage_client) values_kvs = {'key': 'some_value'} test_key = 'test_key' - autosaved_value_kvs1 = await kvs.get_auto_saved_value(test_key) - assert autosaved_value_kvs1 == {} - autosaved_value_kvs1.update(values_kvs) + autosaved_value_kvs = await kvs.get_auto_saved_value(test_key) + assert autosaved_value_kvs == {} + autosaved_value_kvs.update(values_kvs) await kvs.persist_autosaved_values() - assert await kvs.get_value(test_key) == autosaved_value_kvs1 + assert await kvs.get_value(test_key) == autosaved_value_kvs From ab62d6c752d626c86544ead6ff484ab1a261c6b8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 2 Oct 2025 09:51:00 +0200 Subject: [PATCH 3/3] Make test more readable --- .../storage_clients/_file_system/test_fs_rq_client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 63d6678aad..6dc8c837d5 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -78,9 +78,11 @@ async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) assert request_data['url'].startswith('https://example.com/') -async def test_opening_rq_does_not_have_side_effect_on_service_locator( - rq_client: FileSystemRequestQueueClient, # noqa: ARG001 -) -> None: +async def test_opening_rq_does_not_have_side_effect_on_service_locator(configuration: Configuration) -> None: + """Opening request queue client should cause setting storage client in the global service locator.""" + await FileSystemStorageClient().create_rq_client(name='test_request_queue', configuration=configuration) + + # Set some specific storage client in the service locator. There should be no `ServiceConflictError`. service_locator.set_storage_client(MemoryStorageClient())