From f28570713b2d4344e8d798f28e0177256adec9df Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sat, 10 May 2025 10:52:13 +0200 Subject: [PATCH 01/43] refactor!: Introduce new storage client system --- .../code_examples/google/cloud_run_example.py | 15 +- .../code_examples/google/google_example.py | 15 +- .../export_entire_dataset_to_file_csv.py | 2 +- .../export_entire_dataset_to_file_json.py | 2 +- docs/examples/code_examples/parsel_crawler.py | 2 +- .../cleaning_purge_explicitly_example.py | 21 - .../storages/rq_basic_example.py | 2 +- .../rq_with_crawler_explicit_example.py | 4 +- docs/guides/request_loaders.mdx | 4 +- docs/guides/storages.mdx | 7 - pyproject.toml | 6 +- src/crawlee/_autoscaling/autoscaled_pool.py | 3 +- src/crawlee/_cli.py | 2 +- src/crawlee/_service_locator.py | 14 +- src/crawlee/_types.py | 12 +- src/crawlee/_utils/file.py | 77 +- src/crawlee/_utils/globs.py | 2 +- src/crawlee/configuration.py | 18 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 91 +- .../_browserforge_adapter.py | 6 +- .../hooks/post_gen_project.py | 6 +- src/crawlee/request_loaders/_request_list.py | 12 +- .../request_loaders/_request_loader.py | 14 +- .../request_loaders/_request_manager.py | 10 +- .../_request_manager_tandem.py | 20 +- src/crawlee/statistics/_error_snapshotter.py | 55 +- src/crawlee/storage_clients/__init__.py | 7 +- src/crawlee/storage_clients/_base/__init__.py | 9 - .../storage_clients/_base/_dataset_client.py | 219 ++--- .../_base/_dataset_collection_client.py | 59 -- .../_base/_key_value_store_client.py | 140 ++- .../_key_value_store_collection_client.py | 59 -- .../_base/_request_queue_client.py | 173 ++-- .../_base/_request_queue_collection_client.py | 59 -- .../storage_clients/_base/_storage_client.py | 67 +- src/crawlee/storage_clients/_base/_types.py | 22 - .../storage_clients/_file_system/__init__.py | 11 + .../_file_system/_dataset_client.py | 488 +++++++++++ .../_file_system/_key_value_store_client.py | 457 ++++++++++ .../_file_system/_request_queue_client.py | 784 +++++++++++++++++ .../_file_system/_storage_client.py | 62 ++ .../storage_clients/_file_system/_utils.py | 49 ++ .../storage_clients/_file_system/py.typed | 0 .../storage_clients/_memory/__init__.py | 20 +- .../_memory/_creation_management.py | 429 --------- .../_memory/_dataset_client.py | 490 ++++------- .../_memory/_dataset_collection_client.py | 62 -- .../_memory/_key_value_store_client.py | 500 +++-------- .../_key_value_store_collection_client.py | 62 -- .../_memory/_memory_storage_client.py | 358 -------- .../_memory/_request_queue_client.py | 736 ++++++---------- .../_request_queue_collection_client.py | 62 -- .../_memory/_storage_client.py | 62 ++ src/crawlee/storage_clients/models.py | 127 ++- src/crawlee/storages/_base.py | 19 +- src/crawlee/storages/_creation_management.py | 231 ----- src/crawlee/storages/_dataset.py | 633 ++++++-------- src/crawlee/storages/_key_value_store.py | 252 +++--- src/crawlee/storages/_request_queue.py | 713 ++++----------- src/crawlee/storages/_types.py | 167 ++++ tests/e2e/project_template/utils.py | 20 +- .../unit/_autoscaling/test_autoscaled_pool.py | 2 + tests/unit/_utils/test_file.py | 17 +- tests/unit/conftest.py | 36 +- .../test_adaptive_playwright_crawler.py | 16 +- .../crawlers/_basic/test_basic_crawler.py | 134 +-- .../unit/crawlers/_http/test_http_crawler.py | 3 +- .../_file_system/test_fs_dataset_client.py | 317 +++++++ .../_file_system/test_fs_kvs_client.py | 360 ++++++++ .../_file_system/test_fs_rq_client.py | 454 ++++++++++ 
.../_memory/test_creation_management.py | 59 -- .../_memory/test_dataset_client.py | 148 ---- .../_memory/test_dataset_collection_client.py | 45 - .../_memory/test_key_value_store_client.py | 443 ---------- .../test_key_value_store_collection_client.py | 42 - .../_memory/test_memory_dataset_client.py | 279 ++++++ .../_memory/test_memory_kvs_client.py | 243 ++++++ .../_memory/test_memory_rq_client.py | 442 ++++++++++ .../_memory/test_memory_storage_client.py | 288 ------- .../_memory/test_memory_storage_e2e.py | 130 --- .../_memory/test_request_queue_client.py | 249 ------ .../test_request_queue_collection_client.py | 42 - tests/unit/storages/test_dataset.py | 656 +++++++++++--- tests/unit/storages/test_key_value_store.py | 580 +++++++++---- .../storages/test_request_manager_tandem.py | 2 +- tests/unit/storages/test_request_queue.py | 814 ++++++++++++------ tests/unit/test_configuration.py | 24 +- tests/unit/test_service_locator.py | 12 +- website/generate_module_shortcuts.py | 3 +- 89 files changed, 7472 insertions(+), 6398 deletions(-) delete mode 100644 docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py delete mode 100644 src/crawlee/storage_clients/_base/_dataset_collection_client.py delete mode 100644 src/crawlee/storage_clients/_base/_key_value_store_collection_client.py delete mode 100644 src/crawlee/storage_clients/_base/_request_queue_collection_client.py delete mode 100644 src/crawlee/storage_clients/_base/_types.py create mode 100644 src/crawlee/storage_clients/_file_system/__init__.py create mode 100644 src/crawlee/storage_clients/_file_system/_dataset_client.py create mode 100644 src/crawlee/storage_clients/_file_system/_key_value_store_client.py create mode 100644 src/crawlee/storage_clients/_file_system/_request_queue_client.py create mode 100644 src/crawlee/storage_clients/_file_system/_storage_client.py create mode 100644 src/crawlee/storage_clients/_file_system/_utils.py create mode 100644 src/crawlee/storage_clients/_file_system/py.typed delete mode 100644 src/crawlee/storage_clients/_memory/_creation_management.py delete mode 100644 src/crawlee/storage_clients/_memory/_dataset_collection_client.py delete mode 100644 src/crawlee/storage_clients/_memory/_key_value_store_collection_client.py delete mode 100644 src/crawlee/storage_clients/_memory/_memory_storage_client.py delete mode 100644 src/crawlee/storage_clients/_memory/_request_queue_collection_client.py create mode 100644 src/crawlee/storage_clients/_memory/_storage_client.py delete mode 100644 src/crawlee/storages/_creation_management.py create mode 100644 src/crawlee/storages/_types.py create mode 100644 tests/unit/storage_clients/_file_system/test_fs_dataset_client.py create mode 100644 tests/unit/storage_clients/_file_system/test_fs_kvs_client.py create mode 100644 tests/unit/storage_clients/_file_system/test_fs_rq_client.py delete mode 100644 tests/unit/storage_clients/_memory/test_creation_management.py delete mode 100644 tests/unit/storage_clients/_memory/test_dataset_client.py delete mode 100644 tests/unit/storage_clients/_memory/test_dataset_collection_client.py delete mode 100644 tests/unit/storage_clients/_memory/test_key_value_store_client.py delete mode 100644 tests/unit/storage_clients/_memory/test_key_value_store_collection_client.py create mode 100644 tests/unit/storage_clients/_memory/test_memory_dataset_client.py create mode 100644 tests/unit/storage_clients/_memory/test_memory_kvs_client.py create mode 100644 tests/unit/storage_clients/_memory/test_memory_rq_client.py delete 
mode 100644 tests/unit/storage_clients/_memory/test_memory_storage_client.py delete mode 100644 tests/unit/storage_clients/_memory/test_memory_storage_e2e.py delete mode 100644 tests/unit/storage_clients/_memory/test_request_queue_client.py delete mode 100644 tests/unit/storage_clients/_memory/test_request_queue_collection_client.py diff --git a/docs/deployment/code_examples/google/cloud_run_example.py b/docs/deployment/code_examples/google/cloud_run_example.py index c01a4f3821..88db52bc75 100644 --- a/docs/deployment/code_examples/google/cloud_run_example.py +++ b/docs/deployment/code_examples/google/cloud_run_example.py @@ -5,24 +5,23 @@ import uvicorn from litestar import Litestar, get -from crawlee import service_locator from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext - -# highlight-start -# Disable writing storage data to the file system -configuration = service_locator.get_configuration() -configuration.persist_storage = False -configuration.write_metadata = False -# highlight-end +from crawlee.storage_clients import MemoryStorageClient @get('/') async def main() -> str: """The crawler entry point that will be called when the HTTP endpoint is accessed.""" + # highlight-start + # Disable writing storage data to the file system + storage_client = MemoryStorageClient() + # highlight-end + crawler = PlaywrightCrawler( headless=True, max_requests_per_crawl=10, browser_type='firefox', + storage_client=storage_client, ) @crawler.router.default_handler diff --git a/docs/deployment/code_examples/google/google_example.py b/docs/deployment/code_examples/google/google_example.py index f7180aa417..e31af2c3ab 100644 --- a/docs/deployment/code_examples/google/google_example.py +++ b/docs/deployment/code_examples/google/google_example.py @@ -6,22 +6,21 @@ import functions_framework from flask import Request, Response -from crawlee import service_locator from crawlee.crawlers import ( BeautifulSoupCrawler, BeautifulSoupCrawlingContext, ) - -# highlight-start -# Disable writing storage data to the file system -configuration = service_locator.get_configuration() -configuration.persist_storage = False -configuration.write_metadata = False -# highlight-end +from crawlee.storage_clients import MemoryStorageClient async def main() -> str: + # highlight-start + # Disable writing storage data to the file system + storage_client = MemoryStorageClient() + # highlight-end + crawler = BeautifulSoupCrawler( + storage_client=storage_client, max_request_retries=1, request_handler_timeout=timedelta(seconds=30), max_requests_per_crawl=10, diff --git a/docs/examples/code_examples/export_entire_dataset_to_file_csv.py b/docs/examples/code_examples/export_entire_dataset_to_file_csv.py index 115474fc61..f86a469c03 100644 --- a/docs/examples/code_examples/export_entire_dataset_to_file_csv.py +++ b/docs/examples/code_examples/export_entire_dataset_to_file_csv.py @@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run(['https://crawlee.dev']) # Export the entire dataset to a CSV file. 
- await crawler.export_data_csv(path='results.csv') + await crawler.export_data(path='results.csv') if __name__ == '__main__': diff --git a/docs/examples/code_examples/export_entire_dataset_to_file_json.py b/docs/examples/code_examples/export_entire_dataset_to_file_json.py index 5c871fb228..81fe07afa4 100644 --- a/docs/examples/code_examples/export_entire_dataset_to_file_json.py +++ b/docs/examples/code_examples/export_entire_dataset_to_file_json.py @@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await crawler.run(['https://crawlee.dev']) # Export the entire dataset to a JSON file. - await crawler.export_data_json(path='results.json') + await crawler.export_data(path='results.json') if __name__ == '__main__': diff --git a/docs/examples/code_examples/parsel_crawler.py b/docs/examples/code_examples/parsel_crawler.py index 61ddb7484e..9807d7ca3b 100644 --- a/docs/examples/code_examples/parsel_crawler.py +++ b/docs/examples/code_examples/parsel_crawler.py @@ -40,7 +40,7 @@ async def some_hook(context: BasicCrawlingContext) -> None: await crawler.run(['https://github.com']) # Export the entire dataset to a JSON file. - await crawler.export_data_json(path='results.json') + await crawler.export_data(path='results.json') if __name__ == '__main__': diff --git a/docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py b/docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py deleted file mode 100644 index 15435da7bf..0000000000 --- a/docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py +++ /dev/null @@ -1,21 +0,0 @@ -import asyncio - -from crawlee.crawlers import HttpCrawler -from crawlee.storage_clients import MemoryStorageClient - - -async def main() -> None: - storage_client = MemoryStorageClient.from_config() - - # Call the purge_on_start method to explicitly purge the storage. - # highlight-next-line - await storage_client.purge_on_start() - - # Pass the storage client to the crawler. - crawler = HttpCrawler(storage_client=storage_client) - - # ... - - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/guides/code_examples/storages/rq_basic_example.py b/docs/guides/code_examples/storages/rq_basic_example.py index 9e983bb9fe..388c184fc6 100644 --- a/docs/guides/code_examples/storages/rq_basic_example.py +++ b/docs/guides/code_examples/storages/rq_basic_example.py @@ -12,7 +12,7 @@ async def main() -> None: await request_queue.add_request('https://apify.com/') # Add multiple requests as a batch. - await request_queue.add_requests_batched( + await request_queue.add_requests( ['https://crawlee.dev/', 'https://crawlee.dev/python/'] ) diff --git a/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py b/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py index 21bedad0b9..bfece2eca5 100644 --- a/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py +++ b/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py @@ -10,9 +10,7 @@ async def main() -> None: request_queue = await RequestQueue.open(name='my-request-queue') # Interact with the request queue directly, e.g. add a batch of requests. - await request_queue.add_requests_batched( - ['https://apify.com/', 'https://crawlee.dev/'] - ) + await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/']) # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request # list as request manager to it. It will be managed by the crawler. 
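The example changes above capture the user-facing shape of the new storage client system: a storage client instance is passed to the crawler through the `storage_client` keyword, batches of requests go through `add_requests`, and results are exported with the single `export_data` helper. A minimal sketch of how these pieces fit together, assuming `HttpCrawler` and `HttpCrawlingContext` as used in the existing docs examples:

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # Keep all storage data in memory; if no client is passed,
    # the new FileSystemStorageClient is used and persists to ./storage.
    storage_client = MemoryStorageClient()

    crawler = HttpCrawler(
        storage_client=storage_client,
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # Store one item per crawled page.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # The unified export helper infers the output format from the file extension.
    await crawler.export_data(path='results.json')


if __name__ == '__main__':
    asyncio.run(main())
```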
diff --git a/docs/guides/request_loaders.mdx b/docs/guides/request_loaders.mdx index 73fe374a62..8816f2a388 100644 --- a/docs/guides/request_loaders.mdx +++ b/docs/guides/request_loaders.mdx @@ -52,12 +52,12 @@ class BaseStorage { class RequestLoader { <> + + handled_count + + total_count + fetch_next_request() + mark_request_as_handled() + is_empty() + is_finished() - + get_handled_count() - + get_total_count() + to_tandem() } diff --git a/docs/guides/storages.mdx b/docs/guides/storages.mdx index 3be168b683..37815bde59 100644 --- a/docs/guides/storages.mdx +++ b/docs/guides/storages.mdx @@ -24,7 +24,6 @@ import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/stora import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py'; import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py'; -import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py'; Crawlee offers multiple storage types for managing and persisting your crawling data. Request-oriented storages, such as the `RequestQueue`, help you store and deduplicate URLs, while result-oriented storages, like `Dataset` and `KeyValueStore`, focus on storing and retrieving scraping results. This guide helps you choose the storage type that suits your needs. @@ -210,12 +209,6 @@ Default storages are purged before the crawler starts, unless explicitly configu If you do not explicitly interact with storages in your code, the purging will occur automatically when the `BasicCrawler.run` method is invoked. -If you need to purge storages earlier, you can call `MemoryStorageClient.purge_on_start` directly if you are using the default storage client. This method triggers the purging process for the underlying storage implementation you are currently using. - - - {CleaningPurgeExplicitlyExample} - - ## Conclusion This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests and store and retrieve scraping results using the `RequestQueue`, `Dataset`, and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run and how to purge them explicitly. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
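With the explicit `MemoryStorageClient.purge_on_start` example removed, storage cleanup is expressed on the storages themselves. A hedged sketch of explicit cleanup under the new interface, assuming the `purge()` and `drop()` methods implied by the client docstrings later in this patch:

```python
import asyncio

from crawlee.storages import Dataset, KeyValueStore, RequestQueue


async def main() -> None:
    dataset = await Dataset.open()
    kvs = await KeyValueStore.open()
    request_queue = await RequestQueue.open(name='my-request-queue')

    # Remove all stored items while keeping the storages themselves.
    await dataset.purge()
    await kvs.purge()

    # Or remove a storage entirely, including its metadata.
    await request_queue.drop()


if __name__ == '__main__':
    asyncio.run(main())
```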
diff --git a/pyproject.toml b/pyproject.toml index 5987557e8c..0e26f6a572 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -144,9 +144,9 @@ ignore = [ "ISC001", # This rule may cause conflicts when used with the formatter "FIX", # flake8-fixme "PLR0911", # Too many return statements + "PLR0912", # Too many branches "PLR0913", # Too many arguments in function definition "PLR0915", # Too many statements - "PTH", # flake8-use-pathlib "PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime "PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None` "S102", # Use of `exec` detected @@ -168,6 +168,7 @@ indent-style = "space" "F401", # Unused imports ] "**/{tests}/*" = [ + "ASYNC230", # Async functions should not open files with blocking methods like `open` "D", # Everything from the pydocstyle "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py "PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable @@ -205,9 +206,6 @@ builtins-ignorelist = ["id"] [tool.ruff.lint.isort] known-first-party = ["crawlee"] -[tool.ruff.lint.pylint] -max-branches = 18 - [tool.pytest.ini_options] addopts = "-ra" asyncio_default_fixture_loop_scope = "function" diff --git a/src/crawlee/_autoscaling/autoscaled_pool.py b/src/crawlee/_autoscaling/autoscaled_pool.py index 5a9aa1fcff..7a751d1783 100644 --- a/src/crawlee/_autoscaling/autoscaled_pool.py +++ b/src/crawlee/_autoscaling/autoscaled_pool.py @@ -142,7 +142,8 @@ async def run(self) -> None: logger.info('Waiting for remaining tasks to finish') - for task in run.worker_tasks: + tasks_to_wait = list(run.worker_tasks) + for task in tasks_to_wait: if not task.done(): with suppress(BaseException): await task diff --git a/src/crawlee/_cli.py b/src/crawlee/_cli.py index 60d8d1a138..d7eadde35c 100644 --- a/src/crawlee/_cli.py +++ b/src/crawlee/_cli.py @@ -22,7 +22,7 @@ cli = typer.Typer(no_args_is_help=True) template_directory = importlib.resources.files('crawlee') / 'project_template' -with open(str(template_directory / 'cookiecutter.json')) as f: +with (template_directory / 'cookiecutter.json').open() as f: cookiecutter_json = json.load(f) crawler_choices = cookiecutter_json['crawler_type'] diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index 31bc36c63c..2cb8f8302a 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -3,8 +3,8 @@ from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.errors import ServiceConflictError -from crawlee.events import EventManager -from crawlee.storage_clients import StorageClient +from crawlee.events import EventManager, LocalEventManager +from crawlee.storage_clients import FileSystemStorageClient, StorageClient @docs_group('Classes') @@ -49,8 +49,6 @@ def set_configuration(self, configuration: Configuration) -> None: def get_event_manager(self) -> EventManager: """Get the event manager.""" if self._event_manager is None: - from crawlee.events import LocalEventManager - self._event_manager = ( LocalEventManager().from_config(config=self._configuration) if self._configuration @@ -77,13 +75,7 @@ def set_event_manager(self, event_manager: EventManager) -> None: def get_storage_client(self) -> StorageClient: """Get the storage client.""" if self._storage_client is None: - from crawlee.storage_clients import MemoryStorageClient - - self._storage_client = ( - 
MemoryStorageClient.from_config(config=self._configuration) - if self._configuration - else MemoryStorageClient.from_config() - ) + self._storage_client = FileSystemStorageClient() self._storage_client_was_retrieved = True return self._storage_client diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 3cb84111fe..289b705ee2 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -23,7 +23,7 @@ from crawlee.sessions import Session from crawlee.storage_clients.models import DatasetItemsListPage from crawlee.storages import KeyValueStore - from crawlee.storages._dataset import ExportToKwargs, GetDataKwargs + from crawlee.storages._types import ExportToKwargs, GetDataKwargs # Workaround for https://github.com/pydantic/pydantic/issues/9445 J = TypeVar('J', bound='JsonSerializable') @@ -190,7 +190,7 @@ class PushDataKwargs(TypedDict): class PushDataFunctionCall(PushDataKwargs): - data: JsonSerializable + data: list[dict[str, Any]] | dict[str, Any] dataset_id: str | None dataset_name: str | None @@ -271,16 +271,12 @@ async def add_requests( async def push_data( self, - data: JsonSerializable, + data: list[dict[str, Any]] | dict[str, Any], dataset_id: str | None = None, dataset_name: str | None = None, **kwargs: Unpack[PushDataKwargs], ) -> None: """Track a call to the `push_data` context helper.""" - from crawlee.storages._dataset import Dataset - - await Dataset.check_and_serialize(data) - self.push_data_calls.append( PushDataFunctionCall( data=data, @@ -520,7 +516,7 @@ class PushDataFunction(Protocol): def __call__( self, - data: JsonSerializable, + data: list[dict[str, Any]] | dict[str, Any], dataset_id: str | None = None, dataset_name: str | None = None, **kwargs: Unpack[PushDataKwargs], diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index 022d0604ef..4de6804490 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -2,18 +2,26 @@ import asyncio import contextlib -import io +import csv import json import mimetypes import os import re import shutil from enum import Enum +from logging import getLogger from typing import TYPE_CHECKING if TYPE_CHECKING: + from collections.abc import AsyncIterator from pathlib import Path - from typing import Any + from typing import Any, TextIO + + from typing_extensions import Unpack + + from crawlee.storages._types import ExportDataCsvKwargs, ExportDataJsonKwargs + +logger = getLogger(__name__) class ContentType(Enum): @@ -83,28 +91,67 @@ def determine_file_extension(content_type: str) -> str | None: return ext[1:] if ext is not None else ext -def is_file_or_bytes(value: Any) -> bool: - """Determine if the input value is a file-like object or bytes. - - This function checks whether the provided value is an instance of bytes, bytearray, or io.IOBase (file-like). - The method is simplified for common use cases and may not cover all edge cases. +async def json_dumps(obj: Any) -> str: + """Serialize an object to a JSON-formatted string with specific settings. Args: - value: The value to be checked. + obj: The object to serialize. Returns: - True if the value is either a file-like object or bytes, False otherwise. + A string containing the JSON representation of the input object. """ - return isinstance(value, (bytes, bytearray, io.IOBase)) + return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str) -async def json_dumps(obj: Any) -> str: - """Serialize an object to a JSON-formatted string with specific settings. 
+def infer_mime_type(value: Any) -> str: + """Infer the MIME content type from the value. Args: - obj: The object to serialize. + value: The value to infer the content type from. Returns: - A string containing the JSON representation of the input object. + The inferred MIME content type. """ - return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str) + # If the value is bytes (or bytearray), return binary content type. + if isinstance(value, (bytes, bytearray)): + return 'application/octet-stream' + + # If the value is a dict or list, assume JSON. + if isinstance(value, (dict, list)): + return 'application/json; charset=utf-8' + + # If the value is a string, assume plain text. + if isinstance(value, str): + return 'text/plain; charset=utf-8' + + # Default fallback. + return 'application/octet-stream' + + +async def export_json_to_stream( + iterator: AsyncIterator[dict], + dst: TextIO, + **kwargs: Unpack[ExportDataJsonKwargs], +) -> None: + items = [item async for item in iterator] + json.dump(items, dst, **kwargs) + + +async def export_csv_to_stream( + iterator: AsyncIterator[dict], + dst: TextIO, + **kwargs: Unpack[ExportDataCsvKwargs], +) -> None: + writer = csv.writer(dst, **kwargs) + write_header = True + + # Iterate over the dataset and write to CSV. + async for item in iterator: + if not item: + continue + + if write_header: + writer.writerow(item.keys()) + write_header = False + + writer.writerow(item.values()) diff --git a/src/crawlee/_utils/globs.py b/src/crawlee/_utils/globs.py index d497631d07..f7e1a57927 100644 --- a/src/crawlee/_utils/globs.py +++ b/src/crawlee/_utils/globs.py @@ -73,7 +73,7 @@ def _translate( return rf'(?s:{res})\Z' -def _fnmatch_translate(pat: str, star: str, question_mark: str) -> list[str]: # noqa: PLR0912 +def _fnmatch_translate(pat: str, star: str, question_mark: str) -> list[str]: """Copy of fnmatch._translate from Python 3.13.""" res = list[str]() add = res.append diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index de22118816..e3ef39f486 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -118,21 +118,7 @@ class Configuration(BaseSettings): ) ), ] = True - """Whether to purge the storage on the start. This option is utilized by the `MemoryStorageClient`.""" - - write_metadata: Annotated[bool, Field(alias='crawlee_write_metadata')] = True - """Whether to write the storage metadata. This option is utilized by the `MemoryStorageClient`.""" - - persist_storage: Annotated[ - bool, - Field( - validation_alias=AliasChoices( - 'apify_persist_storage', - 'crawlee_persist_storage', - ) - ), - ] = True - """Whether to persist the storage. This option is utilized by the `MemoryStorageClient`.""" + """Whether to purge the storage on the start. This option is utilized by the storage clients.""" persist_state_interval: Annotated[ timedelta_ms, @@ -239,7 +225,7 @@ class Configuration(BaseSettings): ), ), ] = './storage' - """The path to the storage directory. This option is utilized by the `MemoryStorageClient`.""" + """The path to the storage directory. 
This option is utilized by the storage clients.""" headless: Annotated[ bool, diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 192d34091f..97f9ea6546 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -38,6 +38,7 @@ SkippedReason, ) from crawlee._utils.docs import docs_group +from crawlee._utils.file import export_csv_to_stream, export_json_to_stream from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for @@ -75,6 +76,7 @@ ExtractLinksFunction, HttpMethod, JsonSerializable, + PushDataKwargs, ) from crawlee.configuration import Configuration from crawlee.events import EventManager @@ -85,7 +87,7 @@ from crawlee.statistics import FinalStatistics from crawlee.storage_clients import StorageClient from crawlee.storage_clients.models import DatasetItemsListPage - from crawlee.storages._dataset import ExportDataCsvKwargs, ExportDataJsonKwargs, GetDataKwargs, PushDataKwargs + from crawlee.storages._types import GetDataKwargs TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) @@ -685,6 +687,7 @@ async def add_requests( self, requests: Sequence[str | Request], *, + forefront: bool = False, batch_size: int = 1000, wait_time_between_batches: timedelta = timedelta(0), wait_for_all_requests_to_be_added: bool = False, @@ -694,6 +697,7 @@ async def add_requests( Args: requests: A list of requests to add to the queue. + forefront: If True, add requests to the forefront of the queue. batch_size: The number of requests to add in one batch. wait_time_between_batches: Time to wait between adding batches. wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning. @@ -718,17 +722,21 @@ async def add_requests( request_manager = await self.get_request_manager() - await request_manager.add_requests_batched( + await request_manager.add_requests( requests=allowed_requests, + forefront=forefront, batch_size=batch_size, wait_time_between_batches=wait_time_between_batches, wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added, wait_for_all_requests_to_be_added_timeout=wait_for_all_requests_to_be_added_timeout, ) - async def _use_state(self, default_value: dict[str, JsonSerializable] | None = None) -> dict[str, JsonSerializable]: - store = await self.get_key_value_store() - return await store.get_auto_saved_value(self._CRAWLEE_STATE_KEY, default_value) + async def _use_state( + self, + default_value: dict[str, JsonSerializable] | None = None, + ) -> dict[str, JsonSerializable]: + kvs = await self.get_key_value_store() + return await kvs.get_auto_saved_value(self._CRAWLEE_STATE_KEY, default_value) async def _save_crawler_state(self) -> None: store = await self.get_key_value_store() @@ -762,81 +770,32 @@ async def export_data( dataset_id: str | None = None, dataset_name: str | None = None, ) -> None: - """Export data from a `Dataset`. + """Export all items from a Dataset to a JSON or CSV file. - This helper method simplifies the process of exporting data from a `Dataset`. It opens the specified - one and then exports the data based on the provided parameters. If you need to pass options - specific to the output format, use the `export_data_csv` or `export_data_json` method instead. 
+ This method simplifies the process of exporting data collected during crawling. It automatically + determines the export format based on the file extension (`.json` or `.csv`) and handles + the conversion of `Dataset` items to the appropriate format. Args: - path: The destination path. - dataset_id: The ID of the `Dataset`. - dataset_name: The name of the `Dataset`. + path: The destination file path. Must end with '.json' or '.csv'. + dataset_id: The ID of the Dataset to export from. If None, uses `dataset_name` parameter instead. + dataset_name: The name of the Dataset to export from. If None, uses `dataset_id` parameter instead. """ dataset = await self.get_dataset(id=dataset_id, name=dataset_name) path = path if isinstance(path, Path) else Path(path) - destination = path.open('w', newline='') + dst = path.open('w', newline='') if path.suffix == '.csv': - await dataset.write_to_csv(destination) + await export_csv_to_stream(dataset.iterate_items(), dst) elif path.suffix == '.json': - await dataset.write_to_json(destination) + await export_json_to_stream(dataset.iterate_items(), dst) else: raise ValueError(f'Unsupported file extension: {path.suffix}') - async def export_data_csv( - self, - path: str | Path, - *, - dataset_id: str | None = None, - dataset_name: str | None = None, - **kwargs: Unpack[ExportDataCsvKwargs], - ) -> None: - """Export data from a `Dataset` to a CSV file. - - This helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens - the specified one and then exports the data based on the provided parameters. - - Args: - path: The destination path. - content_type: The output format. - dataset_id: The ID of the `Dataset`. - dataset_name: The name of the `Dataset`. - kwargs: Extra configurations for dumping/writing in csv format. - """ - dataset = await self.get_dataset(id=dataset_id, name=dataset_name) - path = path if isinstance(path, Path) else Path(path) - - return await dataset.write_to_csv(path.open('w', newline=''), **kwargs) - async def export_data_json( - self, - path: str | Path, - *, - dataset_id: str | None = None, - dataset_name: str | None = None, - **kwargs: Unpack[ExportDataJsonKwargs], - ) -> None: - """Export data from a `Dataset` to a JSON file. - - This helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the - specified one and then exports the data based on the provided parameters. - - Args: - path: The destination path - dataset_id: The ID of the `Dataset`. - dataset_name: The name of the `Dataset`. - kwargs: Extra configurations for dumping/writing in json format.
- """ - dataset = await self.get_dataset(id=dataset_id, name=dataset_name) - path = path if isinstance(path, Path) else Path(path) - - return await dataset.write_to_json(path.open('w', newline=''), **kwargs) - async def _push_data( self, - data: JsonSerializable, + data: list[dict[str, Any]] | dict[str, Any], dataset_id: str | None = None, dataset_name: str | None = None, **kwargs: Unpack[PushDataKwargs], @@ -1211,7 +1170,7 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth: requests.append(dst_request) - await request_manager.add_requests_batched(requests) + await request_manager.add_requests(requests) for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) diff --git a/src/crawlee/fingerprint_suite/_browserforge_adapter.py b/src/crawlee/fingerprint_suite/_browserforge_adapter.py index d64ddd59f0..11f9f82d79 100644 --- a/src/crawlee/fingerprint_suite/_browserforge_adapter.py +++ b/src/crawlee/fingerprint_suite/_browserforge_adapter.py @@ -1,10 +1,10 @@ from __future__ import annotations -import os.path from collections.abc import Iterable from copy import deepcopy from functools import reduce from operator import or_ +from pathlib import Path from typing import TYPE_CHECKING, Any, Literal from browserforge.bayesian_network import extract_json @@ -253,9 +253,9 @@ def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, def get_available_header_network() -> dict: """Get header network that contains possible header values.""" - if os.path.isfile(DATA_DIR / 'header-network.zip'): + if Path(DATA_DIR / 'header-network.zip').is_file(): return extract_json(DATA_DIR / 'header-network.zip') - if os.path.isfile(DATA_DIR / 'header-network-definition.zip'): + if Path(DATA_DIR / 'header-network-definition.zip').is_file(): return extract_json(DATA_DIR / 'header-network-definition.zip') raise FileNotFoundError('Missing header-network file.') diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py index e076ff9308..c0495a724d 100644 --- a/src/crawlee/project_template/hooks/post_gen_project.py +++ b/src/crawlee/project_template/hooks/post_gen_project.py @@ -2,7 +2,6 @@ import subprocess from pathlib import Path - # % if cookiecutter.package_manager in ['poetry', 'uv'] Path('requirements.txt').unlink() @@ -32,8 +31,9 @@ # Install requirements and generate requirements.txt as an impromptu lockfile subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt']) -with open('requirements.txt', 'w') as requirements_txt: - subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt) +Path('requirements.txt').write_text( + subprocess.check_output([str(path / 'pip'), 'freeze']).decode() +) # % if cookiecutter.crawler_type == 'playwright' subprocess.check_call([str(path / 'playwright'), 'install']) diff --git a/src/crawlee/request_loaders/_request_list.py b/src/crawlee/request_loaders/_request_list.py index 5964b106d0..3f545e1615 100644 --- a/src/crawlee/request_loaders/_request_list.py +++ b/src/crawlee/request_loaders/_request_list.py @@ -55,7 +55,13 @@ def name(self) -> str | None: return self._name @override - async def get_total_count(self) -> int: + @property + async def handled_count(self) -> int: + return self._handled_count + + @override + @property + async def total_count(self) -> int: return self._assumed_total_count @override @@ -87,10 
+93,6 @@ async def mark_request_as_handled(self, request: Request) -> None: self._handled_count += 1 self._in_progress.remove(request.id) - @override - async def get_handled_count(self) -> int: - return self._handled_count - async def _ensure_next_request(self) -> None: if self._requests_lock is None: self._requests_lock = asyncio.Lock() diff --git a/src/crawlee/request_loaders/_request_loader.py b/src/crawlee/request_loaders/_request_loader.py index e358306a45..0a2e96e02f 100644 --- a/src/crawlee/request_loaders/_request_loader.py +++ b/src/crawlee/request_loaders/_request_loader.py @@ -25,9 +25,15 @@ class RequestLoader(ABC): - Managing state information such as the total and handled request counts. """ + @property @abstractmethod - async def get_total_count(self) -> int: - """Return an offline approximation of the total number of requests in the source (i.e. pending + handled).""" + async def handled_count(self) -> int: + """The number of requests that have been handled.""" + + @property + @abstractmethod + async def total_count(self) -> int: + """The total number of requests in the loader.""" @abstractmethod async def is_empty(self) -> bool: @@ -45,10 +51,6 @@ async def fetch_next_request(self) -> Request | None: async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: """Mark a request as handled after a successful processing (or after giving up retrying).""" - @abstractmethod - async def get_handled_count(self) -> int: - """Return the number of handled requests.""" - async def to_tandem(self, request_manager: RequestManager | None = None) -> RequestManagerTandem: """Combine the loader with a request manager to support adding and reclaiming requests. diff --git a/src/crawlee/request_loaders/_request_manager.py b/src/crawlee/request_loaders/_request_manager.py index f63f962cb9..5a8427c2cb 100644 --- a/src/crawlee/request_loaders/_request_manager.py +++ b/src/crawlee/request_loaders/_request_manager.py @@ -6,12 +6,12 @@ from crawlee._utils.docs import docs_group from crawlee.request_loaders._request_loader import RequestLoader +from crawlee.storage_clients.models import ProcessedRequest if TYPE_CHECKING: from collections.abc import Sequence from crawlee._request import Request - from crawlee.storage_clients.models import ProcessedRequest @docs_group('Abstract classes') @@ -40,10 +40,11 @@ async def add_request( Information about the request addition to the manager. """ - async def add_requests_batched( + async def add_requests( self, requests: Sequence[str | Request], *, + forefront: bool = False, batch_size: int = 1000, # noqa: ARG002 wait_time_between_batches: timedelta = timedelta(seconds=1), # noqa: ARG002 wait_for_all_requests_to_be_added: bool = False, # noqa: ARG002 @@ -53,14 +54,17 @@ async def add_requests_batched( Args: requests: Requests to enqueue. + forefront: If True, add requests to the beginning of the queue. batch_size: The number of requests to add in one batch. wait_time_between_batches: Time to wait between adding batches. wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning. wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added. """ # Default and dumb implementation. 
+ processed_requests = list[ProcessedRequest]() for request in requests: - await self.add_request(request) + processed_request = await self.add_request(request, forefront=forefront) + processed_requests.append(processed_request) @abstractmethod async def reclaim_request(self, request: Request, *, forefront: bool = False) -> ProcessedRequest | None: diff --git a/src/crawlee/request_loaders/_request_manager_tandem.py b/src/crawlee/request_loaders/_request_manager_tandem.py index 9f0b8cefe8..35cc59e102 100644 --- a/src/crawlee/request_loaders/_request_manager_tandem.py +++ b/src/crawlee/request_loaders/_request_manager_tandem.py @@ -33,8 +33,14 @@ def __init__(self, request_loader: RequestLoader, request_manager: RequestManage self._read_write_manager = request_manager @override - async def get_total_count(self) -> int: - return (await self._read_only_loader.get_total_count()) + (await self._read_write_manager.get_total_count()) + @property + async def handled_count(self) -> int: + return await self._read_write_manager.handled_count + + @override + @property + async def total_count(self) -> int: + return (await self._read_only_loader.total_count) + (await self._read_write_manager.total_count) @override async def is_empty(self) -> bool: @@ -49,17 +55,19 @@ async def add_request(self, request: str | Request, *, forefront: bool = False) return await self._read_write_manager.add_request(request, forefront=forefront) @override - async def add_requests_batched( + async def add_requests( self, requests: Sequence[str | Request], *, + forefront: bool = False, batch_size: int = 1000, wait_time_between_batches: timedelta = timedelta(seconds=1), wait_for_all_requests_to_be_added: bool = False, wait_for_all_requests_to_be_added_timeout: timedelta | None = None, ) -> None: - return await self._read_write_manager.add_requests_batched( + return await self._read_write_manager.add_requests( requests, + forefront=forefront, batch_size=batch_size, wait_time_between_batches=wait_time_between_batches, wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added, @@ -97,10 +105,6 @@ async def reclaim_request(self, request: Request, *, forefront: bool = False) -> async def mark_request_as_handled(self, request: Request) -> None: await self._read_write_manager.mark_request_as_handled(request) - @override - async def get_handled_count(self) -> int: - return await self._read_write_manager.get_handled_count() - @override async def drop(self) -> None: await self._read_write_manager.drop() diff --git a/src/crawlee/statistics/_error_snapshotter.py b/src/crawlee/statistics/_error_snapshotter.py index 21dbd33d48..0d15973e2f 100644 --- a/src/crawlee/statistics/_error_snapshotter.py +++ b/src/crawlee/statistics/_error_snapshotter.py @@ -1,6 +1,5 @@ from __future__ import annotations -import asyncio import hashlib import re import string @@ -23,41 +22,29 @@ class ErrorSnapshotter: def __init__(self, *, snapshot_kvs_name: str | None = None) -> None: self._kvs_name = snapshot_kvs_name - async def capture_snapshot(self, error_message: str, file_and_line: str, context: BasicCrawlingContext) -> None: - """Capture error snapshot and save it to key value store. - - It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because - it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler` - returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with - an exception. 
- - Args: - error_message: Used in filename of the snapshot. - file_and_line: Used in filename of the snapshot. - context: Context that is used to get the snapshot. - """ - if snapshot := await context.get_snapshot(): - snapshot_base_name = self._get_snapshot_base_name(error_message, file_and_line) - snapshot_save_tasks = [] - if snapshot.html: - snapshot_save_tasks.append( - asyncio.create_task(self._save_html(snapshot.html, base_name=snapshot_base_name)) - ) - if snapshot.screenshot: - snapshot_save_tasks.append( - asyncio.create_task(self._save_screenshot(snapshot.screenshot, base_name=snapshot_base_name)) - ) - await asyncio.gather(*snapshot_save_tasks) - - async def _save_html(self, html: str, base_name: str) -> None: - file_name = f'{base_name}.html' + async def capture_snapshot( + self, + error_message: str, + file_and_line: str, + context: BasicCrawlingContext, + ) -> None: + """Capture error snapshot and save it to key value store.""" + snapshot = await context.get_snapshot() + if not snapshot: + return + + base = self._get_snapshot_base_name(error_message, file_and_line) kvs = await KeyValueStore.open(name=self._kvs_name) - await kvs.set_value(file_name, html, content_type='text/html') - async def _save_screenshot(self, screenshot: bytes, base_name: str) -> None: - file_name = f'{base_name}.jpg' - kvs = await KeyValueStore.open(name=self._kvs_name) - await kvs.set_value(file_name, screenshot, content_type='image/jpeg') + # Save HTML snapshot if present + if snapshot.html: + key_html = f'{base}.html' + await kvs.set_value(key_html, snapshot.html, content_type='text/html') + + # Save screenshot snapshot if present + if snapshot.screenshot: + key_jpg = f'{base}.jpg' + await kvs.set_value(key_jpg, snapshot.screenshot, content_type='image/jpeg') def _sanitize_filename(self, filename: str) -> str: return re.sub(f'[^{re.escape(self.ALLOWED_CHARACTERS)}]', '', filename[: self.MAX_FILENAME_LENGTH]) diff --git a/src/crawlee/storage_clients/__init__.py b/src/crawlee/storage_clients/__init__.py index 66d352d7a7..ce8c713ca9 100644 --- a/src/crawlee/storage_clients/__init__.py +++ b/src/crawlee/storage_clients/__init__.py @@ -1,4 +1,9 @@ from ._base import StorageClient +from ._file_system import FileSystemStorageClient from ._memory import MemoryStorageClient -__all__ = ['MemoryStorageClient', 'StorageClient'] +__all__ = [ + 'FileSystemStorageClient', + 'MemoryStorageClient', + 'StorageClient', +] diff --git a/src/crawlee/storage_clients/_base/__init__.py b/src/crawlee/storage_clients/_base/__init__.py index 5194da8768..73298560da 100644 --- a/src/crawlee/storage_clients/_base/__init__.py +++ b/src/crawlee/storage_clients/_base/__init__.py @@ -1,20 +1,11 @@ from ._dataset_client import DatasetClient -from ._dataset_collection_client import DatasetCollectionClient from ._key_value_store_client import KeyValueStoreClient -from ._key_value_store_collection_client import KeyValueStoreCollectionClient from ._request_queue_client import RequestQueueClient -from ._request_queue_collection_client import RequestQueueCollectionClient from ._storage_client import StorageClient -from ._types import ResourceClient, ResourceCollectionClient __all__ = [ 'DatasetClient', - 'DatasetCollectionClient', 'KeyValueStoreClient', - 'KeyValueStoreCollectionClient', 'RequestQueueClient', - 'RequestQueueCollectionClient', - 'ResourceClient', - 'ResourceCollectionClient', 'StorageClient', ] diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index 
d8495b2dd0..854e32dfce 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -7,58 +7,83 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from contextlib import AbstractAsyncContextManager + from typing import Any - from httpx import Response - - from crawlee._types import JsonSerializable + from crawlee.configuration import Configuration from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @docs_group('Abstract classes') class DatasetClient(ABC): - """An abstract class for dataset resource clients. + """An abstract class for dataset storage clients. + + Dataset clients provide an interface for accessing and manipulating dataset storage. They handle + operations like adding and getting dataset items across different storage backends. - These clients are specific to the type of resource they manage and operate under a designated storage - client, like a memory storage client. + Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`, + `RequestQueue`), and can operate with various storage systems including memory, file system, + databases, and cloud storage solutions. + + This abstract class defines the interface that all specific dataset clients must implement. """ - _LIST_ITEMS_LIMIT = 999_999_999_999 - """This is what API returns in the x-apify-pagination-limit header when no limit query parameter is used.""" + @property + @abstractmethod + def metadata(self) -> DatasetMetadata: + """The metadata of the dataset.""" + @classmethod @abstractmethod - async def get(self) -> DatasetMetadata | None: - """Get metadata about the dataset being managed by this client. + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> DatasetClient: + """Open existing or create a new dataset client. + + If a dataset with the given name or ID already exists, the appropriate dataset client is returned. + Otherwise, a new dataset is created and client for it is returned. + + The backend method for the `Dataset.open` call. + + Args: + id: The ID of the dataset. If not provided, an ID may be generated. + name: The name of the dataset. If not provided a default name may be used. + configuration: The configuration object. Returns: - An object containing the dataset's details, or None if the dataset does not exist. + A dataset client instance. """ @abstractmethod - async def update( - self, - *, - name: str | None = None, - ) -> DatasetMetadata: - """Update the dataset metadata. + async def drop(self) -> None: + """Drop the whole dataset and remove all its items. - Args: - name: New new name for the dataset. + The backend method for the `Dataset.drop` call. + """ - Returns: - An object reflecting the updated dataset metadata. + @abstractmethod + async def purge(self) -> None: + """Purge all items from the dataset. + + The backend method for the `Dataset.purge` call. """ @abstractmethod - async def delete(self) -> None: - """Permanently delete the dataset managed by this client.""" + async def push_data(self, data: list[Any] | dict[str, Any]) -> None: + """Push data to the dataset. + + The backend method for the `Dataset.push_data` call. 
+ """ @abstractmethod - async def list_items( + async def get_data( self, *, - offset: int | None = 0, - limit: int | None = _LIST_ITEMS_LIMIT, + offset: int = 0, + limit: int | None = 999_999_999_999, clean: bool = False, desc: bool = False, fields: list[str] | None = None, @@ -69,27 +94,9 @@ async def list_items( flatten: list[str] | None = None, view: str | None = None, ) -> DatasetItemsListPage: - """Retrieve a paginated list of items from a dataset based on various filtering parameters. - - This method provides the flexibility to filter, sort, and modify the appearance of dataset items - when listed. Each parameter modifies the result set according to its purpose. The method also - supports pagination through 'offset' and 'limit' parameters. + """Get data from the dataset with various filtering options. - Args: - offset: The number of initial items to skip. - limit: The maximum number of items to return. - clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'. - desc: If True, items are returned in descending order, i.e., newest first. - fields: Specifies a subset of fields to include in each item. - omit: Specifies a subset of fields to exclude from each item. - unwind: Specifies a field that should be unwound. If it's an array, each element becomes a separate record. - skip_empty: If True, omits items that are empty after other filters have been applied. - skip_hidden: If True, omits fields starting with the '#' character. - flatten: A list of fields to flatten in each item. - view: The specific view of the dataset to use when retrieving items. - - Returns: - An object with filtered, sorted, and paginated dataset items plus pagination details. + The backend method for the `Dataset.get_data` call. """ @abstractmethod @@ -106,126 +113,12 @@ async def iterate_items( skip_empty: bool = False, skip_hidden: bool = False, ) -> AsyncIterator[dict]: - """Iterate over items in the dataset according to specified filters and sorting. - - This method allows for asynchronously iterating through dataset items while applying various filters such as - skipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit` - parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and - `skip_hidden` parameters. + """Iterate over the dataset items with filtering options. - Args: - offset: The number of initial items to skip. - limit: The maximum number of items to iterate over. None means no limit. - clean: If True, removes empty items and hidden fields, equivalent to 'skip_hidden' and 'skip_empty'. - desc: If set to True, items are returned in descending order, i.e., newest first. - fields: Specifies a subset of fields to include in each item. - omit: Specifies a subset of fields to exclude from each item. - unwind: Specifies a field that should be unwound into separate items. - skip_empty: If set to True, omits items that are empty after other filters have been applied. - skip_hidden: If set to True, omits fields starting with the '#' character from the output. - - Yields: - An asynchronous iterator of dictionary objects, each representing a dataset item after applying - the specified filters and transformations. + The backend method for the `Dataset.iterate_items` call. """ # This syntax is to make mypy properly work with abstract AsyncIterator. 
# https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators raise NotImplementedError if False: # type: ignore[unreachable] yield 0 - - @abstractmethod - async def get_items_as_bytes( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - flatten: list[str] | None = None, - ) -> bytes: - """Retrieve dataset items as bytes. - - Args: - item_format: Output format (e.g., 'json', 'csv'); default is 'json'. - offset: Number of items to skip; default is 0. - limit: Max number of items to return; no default limit. - desc: If True, results are returned in descending order. - clean: If True, filters out empty items and hidden fields. - bom: Include or exclude UTF-8 BOM; default behavior varies by format. - delimiter: Delimiter character for CSV; default is ','. - fields: List of fields to include in the results. - omit: List of fields to omit from the results. - unwind: Unwinds a field into separate records. - skip_empty: If True, skips empty items in the output. - skip_header_row: If True, skips the header row in CSV. - skip_hidden: If True, skips hidden fields in the output. - xml_root: Root element name for XML output; default is 'items'. - xml_row: Element name for each item in XML output; default is 'item'. - flatten: List of fields to flatten. - - Returns: - The dataset items as raw bytes. - """ - - @abstractmethod - async def stream_items( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - ) -> AbstractAsyncContextManager[Response | None]: - """Retrieve dataset items as a streaming response. - - Args: - item_format: Output format, options include json, jsonl, csv, html, xlsx, xml, rss; default is json. - offset: Number of items to skip at the start; default is 0. - limit: Maximum number of items to return; no default limit. - desc: If True, reverses the order of results. - clean: If True, filters out empty items and hidden fields. - bom: Include or exclude UTF-8 BOM; varies by format. - delimiter: Delimiter for CSV files; default is ','. - fields: List of fields to include in the output. - omit: List of fields to omit from the output. - unwind: Unwinds a field into separate records. - skip_empty: If True, empty items are omitted. - skip_header_row: If True, skips the header row in CSV. - skip_hidden: If True, hides fields starting with the # character. - xml_root: Custom root element name for XML output; default is 'items'. - xml_row: Custom element name for each item in XML; default is 'item'. - - Yields: - The dataset items in a streaming response. - """ - - @abstractmethod - async def push_items(self, items: JsonSerializable) -> None: - """Push items to the dataset. - - Args: - items: The items which to push in the dataset. They must be JSON serializable. 
- """ diff --git a/src/crawlee/storage_clients/_base/_dataset_collection_client.py b/src/crawlee/storage_clients/_base/_dataset_collection_client.py deleted file mode 100644 index 8530655c8c..0000000000 --- a/src/crawlee/storage_clients/_base/_dataset_collection_client.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - -from crawlee._utils.docs import docs_group - -if TYPE_CHECKING: - from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata - - -@docs_group('Abstract classes') -class DatasetCollectionClient(ABC): - """An abstract class for dataset collection clients. - - This collection client handles operations that involve multiple instances of a given resource type. - """ - - @abstractmethod - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> DatasetMetadata: - """Retrieve an existing dataset by its name or ID, or create a new one if it does not exist. - - Args: - id: Optional ID of the dataset to retrieve or create. If provided, the method will attempt - to find a dataset with the ID. - name: Optional name of the dataset resource to retrieve or create. If provided, the method will - attempt to find a dataset with this name. - schema: Optional schema for the dataset resource to be created. - - Returns: - Metadata object containing the information of the retrieved or created dataset. - """ - - @abstractmethod - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> DatasetListPage: - """List the available datasets. - - Args: - unnamed: Whether to list only the unnamed datasets. - limit: Maximum number of datasets to return. - offset: Number of datasets to skip from the beginning of the list. - desc: Whether to sort the datasets in descending order. - - Returns: - The list of available datasets matching the specified filters. - """ diff --git a/src/crawlee/storage_clients/_base/_key_value_store_client.py b/src/crawlee/storage_clients/_base/_key_value_store_client.py index 6a5d141be6..013830932b 100644 --- a/src/crawlee/storage_clients/_base/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_base/_key_value_store_client.py @@ -6,126 +6,112 @@ from crawlee._utils.docs import docs_group if TYPE_CHECKING: - from contextlib import AbstractAsyncContextManager + from collections.abc import AsyncIterator - from httpx import Response - - from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord + from crawlee.configuration import Configuration + from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata @docs_group('Abstract classes') class KeyValueStoreClient(ABC): - """An abstract class for key-value store resource clients. + """An abstract class for key-value store (KVS) storage clients. + + Key-value stores clients provide an interface for accessing and manipulating KVS storage. They handle + operations like getting, setting, deleting KVS values across different storage backends. + + Storage clients are specific to the type of storage they manage (`Dataset`, `KeyValueStore`, + `RequestQueue`), and can operate with various storage systems including memory, file system, + databases, and cloud storage solutions. 
- These clients are specific to the type of resource they manage and operate under a designated storage - client, like a memory storage client. + This abstract class defines the interface that all specific KVS clients must implement. """ + @property @abstractmethod - async def get(self) -> KeyValueStoreMetadata | None: - """Get metadata about the key-value store being managed by this client. - - Returns: - An object containing the key-value store's details, or None if the key-value store does not exist. - """ + def metadata(self) -> KeyValueStoreMetadata: + """The metadata of the key-value store.""" + @classmethod @abstractmethod - async def update( - self, + async def open( + cls, *, - name: str | None = None, - ) -> KeyValueStoreMetadata: - """Update the key-value store metadata. + id: str | None, + name: str | None, + configuration: Configuration, + ) -> KeyValueStoreClient: + """Open existing or create a new key-value store client. + + If a key-value store with the given name or ID already exists, the appropriate + key-value store client is returned. Otherwise, a new key-value store is created + and a client for it is returned. + + The backend method for the `KeyValueStoreClient.open` call. Args: - name: New new name for the key-value store. + id: The ID of the key-value store. If not provided, an ID may be generated. + name: The name of the key-value store. If not provided a default name may be used. + configuration: The configuration object. Returns: - An object reflecting the updated key-value store metadata. + A key-value store client instance. """ @abstractmethod - async def delete(self) -> None: - """Permanently delete the key-value store managed by this client.""" + async def drop(self) -> None: + """Drop the whole key-value store and remove all its values. - @abstractmethod - async def list_keys( - self, - *, - limit: int = 1000, - exclusive_start_key: str | None = None, - ) -> KeyValueStoreListKeysPage: - """List the keys in the key-value store. - - Args: - limit: Number of keys to be returned. Maximum value is 1000. - exclusive_start_key: All keys up to this one (including) are skipped from the result. - - Returns: - The list of keys in the key-value store matching the given arguments. + The backend method for the `KeyValueStore.drop` call. """ @abstractmethod - async def get_record(self, key: str) -> KeyValueStoreRecord | None: - """Retrieve the given record from the key-value store. + async def purge(self) -> None: + """Purge all items from the key-value store. - Args: - key: Key of the record to retrieve. - - Returns: - The requested record, or None, if the record does not exist + The backend method for the `KeyValueStore.purge` call. """ @abstractmethod - async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord[bytes] | None: - """Retrieve the given record from the key-value store, without parsing it. - - Args: - key: Key of the record to retrieve. + async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: + """Retrieve the given record from the key-value store. - Returns: - The requested record, or None, if the record does not exist + The backend method for the `KeyValueStore.get_value` call. """ @abstractmethod - async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]: - """Retrieve the given record from the key-value store, as a stream. - - Args: - key: Key of the record to retrieve. 
+ async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: + """Set a value in the key-value store by its key. - Returns: - The requested record as a context-managed streaming Response, or None, if the record does not exist + The backend method for the `KeyValueStore.set_value` call. """ @abstractmethod - async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None: - """Set a value to the given record in the key-value store. + async def delete_value(self, *, key: str) -> None: + """Delete a value from the key-value store by its key. - Args: - key: The key of the record to save the value to. - value: The value to save into the record. - content_type: The content type of the saved value. + The backend method for the `KeyValueStore.delete_value` call. """ @abstractmethod - async def delete_record(self, key: str) -> None: - """Delete the specified record from the key-value store. + async def iterate_keys( + self, + *, + exclusive_start_key: str | None = None, + limit: int | None = None, + ) -> AsyncIterator[KeyValueStoreRecordMetadata]: + """Iterate over all the existing keys in the key-value store. - Args: - key: The key of the record which to delete. + The backend method for the `KeyValueStore.iterate_keys` call. """ + # This syntax is to make mypy properly work with abstract AsyncIterator. + # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators + raise NotImplementedError + if False: # type: ignore[unreachable] + yield 0 @abstractmethod - async def get_public_url(self, key: str) -> str: + async def get_public_url(self, *, key: str) -> str: """Get the public URL for the given key. - Args: - key: Key of the record for which URL is required. - - Returns: - The public URL for the given key. - - Raises: - ValueError: If the key does not exist. + The backend method for the `KeyValueStore.get_public_url` call. """ diff --git a/src/crawlee/storage_clients/_base/_key_value_store_collection_client.py b/src/crawlee/storage_clients/_base/_key_value_store_collection_client.py deleted file mode 100644 index b447cf49b1..0000000000 --- a/src/crawlee/storage_clients/_base/_key_value_store_collection_client.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - -from crawlee._utils.docs import docs_group - -if TYPE_CHECKING: - from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata - - -@docs_group('Abstract classes') -class KeyValueStoreCollectionClient(ABC): - """An abstract class for key-value store collection clients. - - This collection client handles operations that involve multiple instances of a given resource type. - """ - - @abstractmethod - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> KeyValueStoreMetadata: - """Retrieve an existing key-value store by its name or ID, or create a new one if it does not exist. - - Args: - id: Optional ID of the key-value store to retrieve or create. If provided, the method will attempt - to find a key-value store with the ID. - name: Optional name of the key-value store resource to retrieve or create. If provided, the method will - attempt to find a key-value store with this name. - schema: Optional schema for the key-value store resource to be created. - - Returns: - Metadata object containing the information of the retrieved or created key-value store. 
- """ - - @abstractmethod - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> KeyValueStoreListPage: - """List the available key-value stores. - - Args: - unnamed: Whether to list only the unnamed key-value stores. - limit: Maximum number of key-value stores to return. - offset: Number of key-value stores to skip from the beginning of the list. - desc: Whether to sort the key-value stores in descending order. - - Returns: - The list of available key-value stores matching the specified filters. - """ diff --git a/src/crawlee/storage_clients/_base/_request_queue_client.py b/src/crawlee/storage_clients/_base/_request_queue_client.py index 06b180801a..b1e6ba389f 100644 --- a/src/crawlee/storage_clients/_base/_request_queue_client.py +++ b/src/crawlee/storage_clients/_base/_request_queue_client.py @@ -8,13 +8,11 @@ if TYPE_CHECKING: from collections.abc import Sequence + from crawlee.configuration import Configuration from crawlee.storage_clients.models import ( - BatchRequestsOperationResponse, + AddRequestsResponse, ProcessedRequest, - ProlongRequestLockResponse, Request, - RequestQueueHead, - RequestQueueHeadWithLocks, RequestQueueMetadata, ) @@ -27,91 +25,70 @@ class RequestQueueClient(ABC): client, like a memory storage client. """ + @property @abstractmethod - async def get(self) -> RequestQueueMetadata | None: - """Get metadata about the request queue being managed by this client. - - Returns: - An object containing the request queue's details, or None if the request queue does not exist. - """ + def metadata(self) -> RequestQueueMetadata: + """The metadata of the request queue.""" + @classmethod @abstractmethod - async def update( - self, + async def open( + cls, *, - name: str | None = None, - ) -> RequestQueueMetadata: - """Update the request queue metadata. - - Args: - name: New new name for the request queue. - - Returns: - An object reflecting the updated request queue metadata. - """ - - @abstractmethod - async def delete(self) -> None: - """Permanently delete the request queue managed by this client.""" - - @abstractmethod - async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: - """Retrieve a given number of requests from the beginning of the queue. + id: str | None, + name: str | None, + configuration: Configuration, + ) -> RequestQueueClient: + """Open a request queue client. Args: - limit: How many requests to retrieve. + id: ID of the queue to open. If not provided, a new queue will be created with a random ID. + name: Name of the queue to open. If not provided, the queue will be unnamed. + configuration: The configuration object. Returns: - The desired number of requests from the beginning of the queue. + A request queue client. """ @abstractmethod - async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks: - """Fetch and lock a specified number of requests from the start of the queue. - - Retrieve and locks the first few requests of a queue for the specified duration. This prevents the requests - from being fetched by another client until the lock expires. - - Args: - lock_secs: Duration for which the requests are locked, in seconds. - limit: Maximum number of requests to retrieve and lock. + async def drop(self) -> None: + """Drop the whole request queue and remove all its values. - Returns: - The desired number of locked requests from the beginning of the queue. 
+ The backend method for the `RequestQueue.drop` call. """ @abstractmethod - async def add_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - """Add a request to the queue. - - Args: - request: The request to add to the queue. - forefront: Whether to add the request to the head or the end of the queue. + async def purge(self) -> None: + """Purge all items from the request queue. - Returns: - Request queue operation information. + The backend method for the `RequestQueue.purge` call. """ @abstractmethod - async def batch_add_requests( + async def add_batch_of_requests( self, requests: Sequence[Request], *, forefront: bool = False, - ) -> BatchRequestsOperationResponse: - """Add a batch of requests to the queue. + ) -> AddRequestsResponse: + """Add batch of requests to the queue. + + This method adds a batch of requests to the queue. Each request is processed based on its uniqueness + (determined by `unique_key`). Duplicates will be identified but not re-added to the queue. Args: - requests: The requests to add to the queue. - forefront: Whether to add the requests to the head or the end of the queue. + requests: The collection of requests to add to the queue. + forefront: Whether to put the added requests at the beginning (True) or the end (False) of the queue. + When True, the requests will be processed sooner than previously added requests. + batch_size: The maximum number of requests to add in a single batch. + wait_time_between_batches: The time to wait between adding batches of requests. + wait_for_all_requests_to_be_added: If True, the method will wait until all requests are added + to the queue before returning. + wait_for_all_requests_to_be_added_timeout: The maximum time to wait for all requests to be added. Returns: - Request queue batch operation information. + A response object containing information about which requests were successfully + processed and which failed (if any). """ @abstractmethod @@ -126,64 +103,58 @@ async def get_request(self, request_id: str) -> Request | None: """ @abstractmethod - async def update_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - """Update a request in the queue. + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. - Args: - request: The updated request. - forefront: Whether to put the updated request in the beginning or the end of the queue. + Once you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled` + to mark the request as handled in the queue. If there was some error in processing the request, call + `RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer + in another call to the `fetch_next_request` method. + + Note that the `None` return value does not mean the queue processing finished, it means there are currently + no pending requests. To check whether all requests in queue were finished, use `RequestQueue.is_finished` + instead. Returns: - The updated request + The request or `None` if there are no more pending requests. """ @abstractmethod - async def delete_request(self, request_id: str) -> None: - """Delete a request from the queue. - - Args: - request_id: ID of the request to delete. - """ + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as handled after successful processing. 
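# Editor's note (hedged sketch, not part of the patch): the deduplication contract described
# for `add_batch_of_requests` - requests are keyed by `unique_key`, so adding the same URL
# twice reports it as already present instead of enqueueing a duplicate. Assumes
# `Request.from_url` and the file-system client added later in this patch.
import asyncio

from crawlee import Request
from crawlee.configuration import Configuration
from crawlee.storage_clients._file_system import FileSystemRequestQueueClient


async def main() -> None:
    rq = await FileSystemRequestQueueClient.open(id=None, name=None, configuration=Configuration())

    first = await rq.add_batch_of_requests([Request.from_url('https://example.com')])
    second = await rq.add_batch_of_requests([Request.from_url('https://example.com')])

    print(first.processed_requests[0].was_already_present)   # expected: False
    print(second.processed_requests[0].was_already_present)  # expected: True
    print(second.processed_requests[0].was_already_handled)  # expected: False, not processed yet


asyncio.run(main())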
- @abstractmethod - async def prolong_request_lock( - self, - request_id: str, - *, - forefront: bool = False, - lock_secs: int, - ) -> ProlongRequestLockResponse: - """Prolong the lock on a specific request in the queue. + Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method. Args: - request_id: The identifier of the request whose lock is to be prolonged. - forefront: Whether to put the request in the beginning or the end of the queue after lock expires. - lock_secs: The additional amount of time, in seconds, that the request will remain locked. + request: The request to mark as handled. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. """ @abstractmethod - async def delete_request_lock( + async def reclaim_request( self, - request_id: str, + request: Request, *, forefront: bool = False, - ) -> None: - """Delete the lock on a specific request in the queue. + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. + + The request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`. Args: - request_id: ID of the request to delete the lock. - forefront: Whether to put the request in the beginning or the end of the queue after the lock is deleted. + request: The request to return to the queue. + forefront: Whether to add the request to the head or the end of the queue. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. """ @abstractmethod - async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsOperationResponse: - """Delete given requests from the queue. + async def is_empty(self) -> bool: + """Check if the request queue is empty. - Args: - requests: The requests to delete from the queue. + Returns: + True if the request queue is empty, False otherwise. """ diff --git a/src/crawlee/storage_clients/_base/_request_queue_collection_client.py b/src/crawlee/storage_clients/_base/_request_queue_collection_client.py deleted file mode 100644 index 7de876c344..0000000000 --- a/src/crawlee/storage_clients/_base/_request_queue_collection_client.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - -from crawlee._utils.docs import docs_group - -if TYPE_CHECKING: - from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata - - -@docs_group('Abstract classes') -class RequestQueueCollectionClient(ABC): - """An abstract class for request queue collection clients. - - This collection client handles operations that involve multiple instances of a given resource type. - """ - - @abstractmethod - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> RequestQueueMetadata: - """Retrieve an existing request queue by its name or ID, or create a new one if it does not exist. - - Args: - id: Optional ID of the request queue to retrieve or create. If provided, the method will attempt - to find a request queue with the ID. - name: Optional name of the request queue resource to retrieve or create. If provided, the method will - attempt to find a request queue with this name. - schema: Optional schema for the request queue resource to be created. - - Returns: - Metadata object containing the information of the retrieved or created request queue. 
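# Editor's note (minimal sketch, not part of the patch): the consume-and-acknowledge loop
# spelled out in the docstrings above - fetch a request, mark it handled on success, reclaim
# it on failure so another consumer can retry it. The processing function is a placeholder;
# the concrete client and default configuration are assumptions.
import asyncio

from crawlee import Request
from crawlee.configuration import Configuration
from crawlee.storage_clients._file_system import FileSystemRequestQueueClient


async def process(request: Request) -> None:
    print(f'Processing {request.url} ...')  # placeholder for real crawling work


async def consume() -> None:
    rq = await FileSystemRequestQueueClient.open(id=None, name=None, configuration=Configuration())

    while not await rq.is_empty():
        request = await rq.fetch_next_request()
        if request is None:
            # No pending requests right now; remaining ones may be in progress elsewhere.
            break
        try:
            await process(request)
        except Exception:
            await rq.reclaim_request(request, forefront=True)
        else:
            await rq.mark_request_as_handled(request)


asyncio.run(consume())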
- """ - - @abstractmethod - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> RequestQueueListPage: - """List the available request queues. - - Args: - unnamed: Whether to list only the unnamed request queues. - limit: Maximum number of request queues to return. - offset: Number of request queues to skip from the beginning of the list. - desc: Whether to sort the request queues in descending order. - - Returns: - The list of available request queues matching the specified filters. - """ diff --git a/src/crawlee/storage_clients/_base/_storage_client.py b/src/crawlee/storage_clients/_base/_storage_client.py index 4f022cf30a..36f9cb7567 100644 --- a/src/crawlee/storage_clients/_base/_storage_client.py +++ b/src/crawlee/storage_clients/_base/_storage_client.py @@ -1,61 +1,48 @@ -# Inspiration: https://github.com/apify/crawlee/blob/v3.8.2/packages/types/src/storages.ts#L314:L328 - from __future__ import annotations from abc import ABC, abstractmethod from typing import TYPE_CHECKING -from crawlee._utils.docs import docs_group - if TYPE_CHECKING: + from crawlee.configuration import Configuration + from ._dataset_client import DatasetClient - from ._dataset_collection_client import DatasetCollectionClient from ._key_value_store_client import KeyValueStoreClient - from ._key_value_store_collection_client import KeyValueStoreCollectionClient from ._request_queue_client import RequestQueueClient - from ._request_queue_collection_client import RequestQueueCollectionClient -@docs_group('Abstract classes') class StorageClient(ABC): - """Defines an abstract base for storage clients. - - It offers interfaces to get subclients for interacting with storage resources like datasets, key-value stores, - and request queues. - """ + """Base class for storage clients.""" @abstractmethod - def dataset(self, id: str) -> DatasetClient: - """Get a subclient for a specific dataset by its ID.""" + async def open_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> DatasetClient: + """Open a dataset client.""" @abstractmethod - def datasets(self) -> DatasetCollectionClient: - """Get a subclient for dataset collection operations.""" + async def open_key_value_store_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> KeyValueStoreClient: + """Open a key-value store client.""" @abstractmethod - def key_value_store(self, id: str) -> KeyValueStoreClient: - """Get a subclient for a specific key-value store by its ID.""" - - @abstractmethod - def key_value_stores(self) -> KeyValueStoreCollectionClient: - """Get a subclient for key-value store collection operations.""" - - @abstractmethod - def request_queue(self, id: str) -> RequestQueueClient: - """Get a subclient for a specific request queue by its ID.""" - - @abstractmethod - def request_queues(self) -> RequestQueueCollectionClient: - """Get a subclient for request queue collection operations.""" - - @abstractmethod - async def purge_on_start(self) -> None: - """Perform a purge of the default storages. - - This method ensures that the purge is executed only once during the lifetime of the instance. - It is primarily used to clean up residual data from previous runs to maintain a clean state. - If the storage client does not support purging, leave it empty. 
- """ + async def open_request_queue_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> RequestQueueClient: + """Open a request queue client.""" def get_rate_limit_errors(self) -> dict[int, int]: """Return statistics about rate limit errors encountered by the HTTP client in storage client.""" diff --git a/src/crawlee/storage_clients/_base/_types.py b/src/crawlee/storage_clients/_base/_types.py deleted file mode 100644 index a5cf1325f5..0000000000 --- a/src/crawlee/storage_clients/_base/_types.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import annotations - -from typing import Union - -from ._dataset_client import DatasetClient -from ._dataset_collection_client import DatasetCollectionClient -from ._key_value_store_client import KeyValueStoreClient -from ._key_value_store_collection_client import KeyValueStoreCollectionClient -from ._request_queue_client import RequestQueueClient -from ._request_queue_collection_client import RequestQueueCollectionClient - -ResourceClient = Union[ - DatasetClient, - KeyValueStoreClient, - RequestQueueClient, -] - -ResourceCollectionClient = Union[ - DatasetCollectionClient, - KeyValueStoreCollectionClient, - RequestQueueCollectionClient, -] diff --git a/src/crawlee/storage_clients/_file_system/__init__.py b/src/crawlee/storage_clients/_file_system/__init__.py new file mode 100644 index 0000000000..2169896d86 --- /dev/null +++ b/src/crawlee/storage_clients/_file_system/__init__.py @@ -0,0 +1,11 @@ +from ._dataset_client import FileSystemDatasetClient +from ._key_value_store_client import FileSystemKeyValueStoreClient +from ._request_queue_client import FileSystemRequestQueueClient +from ._storage_client import FileSystemStorageClient + +__all__ = [ + 'FileSystemDatasetClient', + 'FileSystemKeyValueStoreClient', + 'FileSystemRequestQueueClient', + 'FileSystemStorageClient', +] diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py new file mode 100644 index 0000000000..d9d9c1fda3 --- /dev/null +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -0,0 +1,488 @@ +from __future__ import annotations + +import asyncio +import json +import shutil +from datetime import datetime, timezone +from logging import getLogger +from pathlib import Path +from typing import TYPE_CHECKING + +from pydantic import ValidationError +from typing_extensions import override + +from crawlee._utils.crypto import crypto_random_object_id +from crawlee.storage_clients._base import DatasetClient +from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata + +from ._utils import METADATA_FILENAME, atomic_write_text, json_dumps + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + from typing import Any + + from crawlee.configuration import Configuration + +logger = getLogger(__name__) + + +class FileSystemDatasetClient(DatasetClient): + """File system implementation of the dataset client. + + This client persists dataset items to the file system as individual JSON files within a structured + directory hierarchy following the pattern: + + ``` + {STORAGE_DIR}/datasets/{DATASET_ID}/{ITEM_ID}.json + ``` + + Each item is stored as a separate file, which allows for durability and the ability to + recover after process termination. Dataset operations like filtering, sorting, and pagination are + implemented by processing the stored files according to the requested parameters. 
+ + This implementation is ideal for long-running crawlers where data persistence is important, + and for development environments where you want to easily inspect the collected data between runs. + """ + + _STORAGE_SUBDIR = 'datasets' + """The name of the subdirectory where datasets are stored.""" + + _STORAGE_SUBSUBDIR_DEFAULT = 'default' + """The name of the subdirectory for the default dataset.""" + + _ITEM_FILENAME_DIGITS = 9 + """Number of digits used for the dataset item file names (e.g., 000000019.json).""" + + def __init__( + self, + *, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + item_count: int, + storage_dir: Path, + ) -> None: + """Initialize a new instance. + + Preferably use the `FileSystemDatasetClient.open` class method to create a new instance. + """ + self._metadata = DatasetMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + item_count=item_count, + ) + + self._storage_dir = storage_dir + + # Internal attributes + self._lock = asyncio.Lock() + """A lock to ensure that only one operation is performed at a time.""" + + @override + @property + def metadata(self) -> DatasetMetadata: + return self._metadata + + @property + def path_to_dataset(self) -> Path: + """The full path to the dataset directory.""" + if self.metadata.name is None: + return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT + + return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name + + @property + def path_to_metadata(self) -> Path: + """The full path to the dataset metadata file.""" + return self.path_to_dataset / METADATA_FILENAME + + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> FileSystemDatasetClient: + storage_dir = Path(configuration.storage_dir) + dataset_base_path = storage_dir / cls._STORAGE_SUBDIR + + if not dataset_base_path.exists(): + await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True) + + # Get a new instance by ID. + if id: + found = False + for dataset_dir in dataset_base_path.iterdir(): + if not dataset_dir.is_dir(): + continue + + metadata_path = dataset_dir / METADATA_FILENAME + if not metadata_path.exists(): + continue + + try: + file = await asyncio.to_thread(metadata_path.open) + try: + file_content = json.load(file) + metadata = DatasetMetadata(**file_content) + if metadata.id == id: + client = cls( + id=metadata.id, + name=metadata.name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + item_count=metadata.item_count, + storage_dir=storage_dir, + ) + await client._update_metadata(update_accessed_at=True) + found = True + break + finally: + await asyncio.to_thread(file.close) + except (json.JSONDecodeError, ValidationError): + continue + + if not found: + raise ValueError(f'Dataset with ID "{id}" not found') + + # Get a new instance by name. + else: + dataset_path = ( + dataset_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else dataset_base_path / name + ) + metadata_path = dataset_path / METADATA_FILENAME + + # If the dataset directory exists, reconstruct the client from the metadata file. + if dataset_path.exists(): + # If metadata file is missing, raise an error. 
+ if not metadata_path.exists(): + raise ValueError(f'Metadata file not found for dataset "{name}"') + + file = await asyncio.to_thread(open, metadata_path) + try: + file_content = json.load(file) + finally: + await asyncio.to_thread(file.close) + try: + metadata = DatasetMetadata(**file_content) + except ValidationError as exc: + raise ValueError(f'Invalid metadata file for dataset "{name}"') from exc + + client = cls( + id=metadata.id, + name=name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + item_count=metadata.item_count, + storage_dir=storage_dir, + ) + + await client._update_metadata(update_accessed_at=True) + + # Otherwise, create a new dataset client. + else: + now = datetime.now(timezone.utc) + client = cls( + id=crypto_random_object_id(), + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + item_count=0, + storage_dir=storage_dir, + ) + await client._update_metadata() + + return client + + @override + async def drop(self) -> None: + async with self._lock: + if self.path_to_dataset.exists(): + await asyncio.to_thread(shutil.rmtree, self.path_to_dataset) + + @override + async def purge(self) -> None: + async with self._lock: + for file_path in await self._get_sorted_data_files(): + await asyncio.to_thread(file_path.unlink, missing_ok=True) + + await self._update_metadata( + update_accessed_at=True, + update_modified_at=True, + new_item_count=0, + ) + + @override + async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: + async with self._lock: + new_item_count = self._metadata.item_count + if isinstance(data, list): + for item in data: + new_item_count += 1 + await self._push_item(item, new_item_count) + else: + new_item_count += 1 + await self._push_item(data, new_item_count) + + # now update metadata under the same lock + await self._update_metadata( + update_accessed_at=True, + update_modified_at=True, + new_item_count=new_item_count, + ) + + @override + async def get_data( + self, + *, + offset: int = 0, + limit: int | None = 999_999_999_999, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + flatten: list[str] | None = None, + view: str | None = None, + ) -> DatasetItemsListPage: + # Check for unsupported arguments and log a warning if found. + unsupported_args = { + 'clean': clean, + 'fields': fields, + 'omit': omit, + 'unwind': unwind, + 'skip_hidden': skip_hidden, + 'flatten': flatten, + 'view': view, + } + unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} + + if unsupported: + logger.warning( + f'The arguments {list(unsupported.keys())} of get_data are not supported by the ' + f'{self.__class__.__name__} client.' + ) + + # If the dataset directory does not exist, log a warning and return an empty page. + if not self.path_to_dataset.exists(): + logger.warning(f'Dataset directory not found: {self.path_to_dataset}') + return DatasetItemsListPage( + count=0, + offset=offset, + limit=limit or 0, + total=0, + desc=desc, + items=[], + ) + + # Get the list of sorted data files. 
+ async with self._lock: + try: + data_files = await self._get_sorted_data_files() + except FileNotFoundError: + # directory was dropped mid-check + return DatasetItemsListPage(count=0, offset=offset, limit=limit or 0, total=0, desc=desc, items=[]) + + total = len(data_files) + + # Reverse the order if descending order is requested. + if desc: + data_files.reverse() + + # Apply offset and limit slicing. + selected_files = data_files[offset:] + if limit is not None: + selected_files = selected_files[:limit] + + # Read and parse each data file. + items = [] + for file_path in selected_files: + try: + file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') + except FileNotFoundError: + logger.warning(f'File disappeared during iterate_items(): {file_path}, skipping') + continue + + try: + item = json.loads(file_content) + except json.JSONDecodeError: + logger.exception(f'Corrupt JSON in {file_path}, skipping') + continue + + # Skip empty items if requested. + if skip_empty and not item: + continue + + items.append(item) + + async with self._lock: + await self._update_metadata(update_accessed_at=True) + + # Return a paginated list page of dataset items. + return DatasetItemsListPage( + count=len(items), + offset=offset, + limit=limit or total - offset, + total=total, + desc=desc, + items=items, + ) + + @override + async def iterate_items( + self, + *, + offset: int = 0, + limit: int | None = None, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + ) -> AsyncIterator[dict]: + # Check for unsupported arguments and log a warning if found. + unsupported_args = { + 'clean': clean, + 'fields': fields, + 'omit': omit, + 'unwind': unwind, + 'skip_hidden': skip_hidden, + } + unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} + + if unsupported: + logger.warning( + f'The arguments {list(unsupported.keys())} of iterate are not supported ' + f'by the {self.__class__.__name__} client.' + ) + + # If the dataset directory does not exist, log a warning and return immediately. + if not self.path_to_dataset.exists(): + logger.warning(f'Dataset directory not found: {self.path_to_dataset}') + return + + # Get the list of sorted data files. + async with self._lock: + try: + data_files = await self._get_sorted_data_files() + except FileNotFoundError: + return + + # Reverse the order if descending order is requested. + if desc: + data_files.reverse() + + # Apply offset and limit slicing. + selected_files = data_files[offset:] + if limit is not None: + selected_files = selected_files[:limit] + + # Iterate over each data file, reading and yielding its parsed content. + for file_path in selected_files: + try: + file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') + except FileNotFoundError: + logger.warning(f'File disappeared during iterate_items(): {file_path}, skipping') + continue + + try: + item = json.loads(file_content) + except json.JSONDecodeError: + logger.exception(f'Corrupt JSON in {file_path}, skipping') + continue + + # Skip empty items if requested. 
+ if skip_empty and not item: + continue + + yield item + + async with self._lock: + await self._update_metadata(update_accessed_at=True) + + async def _update_metadata( + self, + *, + new_item_count: int | None = None, + update_accessed_at: bool = False, + update_modified_at: bool = False, + ) -> None: + """Update the dataset metadata file with current information. + + Args: + new_item_count: If provided, update the item count to this value. + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. + """ + now = datetime.now(timezone.utc) + + if update_accessed_at: + self._metadata.accessed_at = now + if update_modified_at: + self._metadata.modified_at = now + if new_item_count is not None: + self._metadata.item_count = new_item_count + + # Ensure the parent directory for the metadata file exists. + await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) + + # Dump the serialized metadata to the file. + data = await json_dumps(self._metadata.model_dump()) + await atomic_write_text(self.path_to_metadata, data) + + async def _push_item(self, item: dict[str, Any], item_id: int) -> None: + """Push a single item to the dataset. + + This method writes the item as a JSON file with a zero-padded numeric filename + that reflects its position in the dataset sequence. + + Args: + item: The data item to add to the dataset. + item_id: The sequential ID to use for this item's filename. + """ + # Generate the filename for the new item using zero-padded numbering. + filename = f'{str(item_id).zfill(self._ITEM_FILENAME_DIGITS)}.json' + file_path = self.path_to_dataset / filename + + # Ensure the dataset directory exists. + await asyncio.to_thread(self.path_to_dataset.mkdir, parents=True, exist_ok=True) + + # Dump the serialized item to the file. + data = await json_dumps(item) + await atomic_write_text(file_path, data) + + async def _get_sorted_data_files(self) -> list[Path]: + """Retrieve and return a sorted list of data files in the dataset directory. + + The files are sorted numerically based on the filename (without extension), + which corresponds to the order items were added to the dataset. + + Returns: + A list of `Path` objects pointing to data files, sorted by numeric filename. + """ + # Retrieve and sort all JSON files in the dataset directory numerically. + files = await asyncio.to_thread( + sorted, + self.path_to_dataset.glob('*.json'), + key=lambda f: int(f.stem) if f.stem.isdigit() else 0, + ) + + # Remove the metadata file from the list if present. 
+ if self.path_to_metadata in files: + files.remove(self.path_to_metadata) + + return files diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py new file mode 100644 index 0000000000..d4e7334928 --- /dev/null +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import asyncio +import json +import shutil +import urllib.parse +from datetime import datetime, timezone +from logging import getLogger +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from pydantic import ValidationError +from typing_extensions import override + +from crawlee._utils.crypto import crypto_random_object_id +from crawlee._utils.file import infer_mime_type +from crawlee.storage_clients._base import KeyValueStoreClient +from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata + +from ._utils import METADATA_FILENAME, atomic_write_bytes, atomic_write_text, json_dumps + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from crawlee.configuration import Configuration + + +logger = getLogger(__name__) + + +class FileSystemKeyValueStoreClient(KeyValueStoreClient): + """File system implementation of the key-value store client. + + This client persists data to the file system, making it suitable for scenarios where data needs to + survive process restarts. Keys are mapped to file paths in a directory structure following the pattern: + + ``` + {STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY} + ``` + + Binary data is stored as-is, while JSON and text data are stored in human-readable format. + The implementation automatically handles serialization based on the content type and + maintains metadata about each record. + + This implementation is ideal for long-running crawlers where persistence is important and + for development environments where you want to easily inspect the stored data between runs. + """ + + _STORAGE_SUBDIR = 'key_value_stores' + """The name of the subdirectory where key-value stores are stored.""" + + _STORAGE_SUBSUBDIR_DEFAULT = 'default' + """The name of the subdirectory for the default key-value store.""" + + def __init__( + self, + *, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + storage_dir: Path, + ) -> None: + """Initialize a new instance. + + Preferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance. 
+ """ + self._metadata = KeyValueStoreMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + ) + + self._storage_dir = storage_dir + + # Internal attributes + self._lock = asyncio.Lock() + """A lock to ensure that only one operation is performed at a time.""" + + @override + @property + def metadata(self) -> KeyValueStoreMetadata: + return self._metadata + + @property + def path_to_kvs(self) -> Path: + """The full path to the key-value store directory.""" + if self.metadata.name is None: + return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT + + return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name + + @property + def path_to_metadata(self) -> Path: + """The full path to the key-value store metadata file.""" + return self.path_to_kvs / METADATA_FILENAME + + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> FileSystemKeyValueStoreClient: + storage_dir = Path(configuration.storage_dir) + kvs_base_path = storage_dir / cls._STORAGE_SUBDIR + + if not kvs_base_path.exists(): + await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True) + + # Get a new instance by ID. + if id: + found = False + for kvs_dir in kvs_base_path.iterdir(): + if not kvs_dir.is_dir(): + continue + + metadata_path = kvs_dir / METADATA_FILENAME + if not metadata_path.exists(): + continue + + try: + file = await asyncio.to_thread(metadata_path.open) + try: + file_content = json.load(file) + metadata = KeyValueStoreMetadata(**file_content) + if metadata.id == id: + client = cls( + id=metadata.id, + name=metadata.name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + storage_dir=storage_dir, + ) + await client._update_metadata(update_accessed_at=True) + found = True + break + finally: + await asyncio.to_thread(file.close) + except (json.JSONDecodeError, ValidationError): + continue + + if not found: + raise ValueError(f'Key-value store with ID "{id}" not found.') + + # Get a new instance by name. + else: + kvs_path = kvs_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else kvs_base_path / name + metadata_path = kvs_path / METADATA_FILENAME + + # If the key-value store directory exists, reconstruct the client from the metadata file. + if kvs_path.exists(): + # If metadata file is missing, raise an error. + if not metadata_path.exists(): + raise ValueError(f'Metadata file not found for key-value store "{name}"') + + file = await asyncio.to_thread(open, metadata_path) + try: + file_content = json.load(file) + finally: + await asyncio.to_thread(file.close) + try: + metadata = KeyValueStoreMetadata(**file_content) + except ValidationError as exc: + raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc + + client = cls( + id=metadata.id, + name=name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + storage_dir=storage_dir, + ) + + await client._update_metadata(update_accessed_at=True) + + # Otherwise, create a new key-value store client. + else: + now = datetime.now(timezone.utc) + client = cls( + id=crypto_random_object_id(), + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + storage_dir=storage_dir, + ) + await client._update_metadata() + + return client + + @override + async def drop(self) -> None: + # If the client directory exists, remove it recursively. 
+ if self.path_to_kvs.exists(): + async with self._lock: + await asyncio.to_thread(shutil.rmtree, self.path_to_kvs) + + @override + async def purge(self) -> None: + async with self._lock: + for file_path in self.path_to_kvs.glob('*'): + if file_path.name == METADATA_FILENAME: + continue + await asyncio.to_thread(file_path.unlink) + + await self._update_metadata( + update_accessed_at=True, + update_modified_at=True, + ) + + @override + async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: + # Update the metadata to record access + async with self._lock: + await self._update_metadata(update_accessed_at=True) + + record_path = self.path_to_kvs / self._encode_key(key) + + if not record_path.exists(): + return None + + # Found a file for this key, now look for its metadata + record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + if not record_metadata_filepath.exists(): + logger.warning(f'Found value file for key "{key}" but no metadata file.') + return None + + # Read the metadata file + async with self._lock: + try: + file = await asyncio.to_thread(open, record_metadata_filepath) + except FileNotFoundError: + logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value') + return None + + try: + metadata_content = json.load(file) + except json.JSONDecodeError: + logger.warning(f'Invalid metadata file for key "{key}"') + return None + finally: + await asyncio.to_thread(file.close) + + try: + metadata = KeyValueStoreRecordMetadata(**metadata_content) + except ValidationError: + logger.warning(f'Invalid metadata schema for key "{key}"') + return None + + # Read the actual value + try: + value_bytes = await asyncio.to_thread(record_path.read_bytes) + except FileNotFoundError: + logger.warning(f'Value file disappeared for key "{key}"') + return None + + # Handle None values + if metadata.content_type == 'application/x-none': + value = None + # Handle JSON values + elif 'application/json' in metadata.content_type: + try: + value = json.loads(value_bytes.decode('utf-8')) + except (json.JSONDecodeError, UnicodeDecodeError): + logger.warning(f'Failed to decode JSON value for key "{key}"') + return None + # Handle text values + elif metadata.content_type.startswith('text/'): + try: + value = value_bytes.decode('utf-8') + except UnicodeDecodeError: + logger.warning(f'Failed to decode text value for key "{key}"') + return None + # Handle binary values + else: + value = value_bytes + + # Calculate the size of the value in bytes + size = len(value_bytes) + + return KeyValueStoreRecord( + key=metadata.key, + value=value, + content_type=metadata.content_type, + size=size, + ) + + @override + async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: + # Special handling for None values + if value is None: + content_type = 'application/x-none' # Special content type to identify None values + value_bytes = b'' + else: + content_type = content_type or infer_mime_type(value) + + # Serialize the value to bytes. + if 'application/json' in content_type: + value_bytes = (await json_dumps(value)).encode('utf-8') + elif isinstance(value, str): + value_bytes = value.encode('utf-8') + elif isinstance(value, (bytes, bytearray)): + value_bytes = value + else: + # Fallback: attempt to convert to string and encode. 
+ value_bytes = str(value).encode('utf-8') + + record_path = self.path_to_kvs / self._encode_key(key) + + # Prepare the metadata + size = len(value_bytes) + record_metadata = KeyValueStoreRecordMetadata(key=key, content_type=content_type, size=size) + record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + record_metadata_content = await json_dumps(record_metadata.model_dump()) + + async with self._lock: + # Ensure the key-value store directory exists. + await asyncio.to_thread(self.path_to_kvs.mkdir, parents=True, exist_ok=True) + + # Write the value to the file. + await atomic_write_bytes(record_path, value_bytes) + + # Write the record metadata to the file. + await atomic_write_text(record_metadata_filepath, record_metadata_content) + + # Update the KVS metadata to record the access and modification. + await self._update_metadata(update_accessed_at=True, update_modified_at=True) + + @override + async def delete_value(self, *, key: str) -> None: + record_path = self.path_to_kvs / self._encode_key(key) + metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + deleted = False + + async with self._lock: + # Delete the value file and its metadata if found + if record_path.exists(): + await asyncio.to_thread(record_path.unlink) + + # Delete the metadata file if it exists + if metadata_path.exists(): + await asyncio.to_thread(metadata_path.unlink) + else: + logger.warning(f'Found value file for key "{key}" but no metadata file when trying to delete it.') + + deleted = True + + # If we deleted something, update the KVS metadata + if deleted: + await self._update_metadata(update_accessed_at=True, update_modified_at=True) + + @override + async def iterate_keys( + self, + *, + exclusive_start_key: str | None = None, + limit: int | None = None, + ) -> AsyncIterator[KeyValueStoreRecordMetadata]: + # Check if the KVS directory exists + if not self.path_to_kvs.exists(): + return + + # List and sort all files *inside* a brief lock, then release it immediately: + async with self._lock: + files = sorted(await asyncio.to_thread(list, self.path_to_kvs.glob('*'))) + + count = 0 + + for file_path in files: + # Skip the main metadata file + if file_path.name == METADATA_FILENAME: + continue + + # Only process metadata files for records + if not file_path.name.endswith(f'.{METADATA_FILENAME}'): + continue + + # Extract the base key name from the metadata filename + key_name = self._decode_key(file_path.name[: -len(f'.{METADATA_FILENAME}')]) + + # Apply exclusive_start_key filter if provided + if exclusive_start_key is not None and key_name <= exclusive_start_key: + continue + + # Try to read and parse the metadata file + try: + metadata_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') + except FileNotFoundError: + logger.warning(f'Metadata file disappeared for key "{key_name}", skipping it.') + continue + + try: + metadata_dict = json.loads(metadata_content) + except json.JSONDecodeError: + logger.warning(f'Failed to decode metadata file for key "{key_name}", skipping it.') + continue + + try: + record_metadata = KeyValueStoreRecordMetadata(**metadata_dict) + except ValidationError: + logger.warning(f'Invalid metadata schema for key "{key_name}", skipping it.') + + yield record_metadata + + count += 1 + if limit and count >= limit: + break + + # Update accessed_at timestamp + async with self._lock: + await self._update_metadata(update_accessed_at=True) + + @override + async def get_public_url(self, *, key: str) -> str: + 
raise NotImplementedError('Public URLs are not supported for file system key-value stores.') + + async def _update_metadata( + self, + *, + update_accessed_at: bool = False, + update_modified_at: bool = False, + ) -> None: + """Update the KVS metadata file with current information. + + Args: + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. + """ + now = datetime.now(timezone.utc) + + if update_accessed_at: + self._metadata.accessed_at = now + if update_modified_at: + self._metadata.modified_at = now + + # Ensure the parent directory for the metadata file exists. + await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) + + # Dump the serialized metadata to the file. + data = await json_dumps(self._metadata.model_dump()) + await atomic_write_text(self.path_to_metadata, data) + + def _encode_key(self, key: str) -> str: + """Encode a key to make it safe for use in a file path.""" + return urllib.parse.quote(key, safe='') + + def _decode_key(self, encoded_key: str) -> str: + """Decode a key that was encoded to make it safe for use in a file path.""" + return urllib.parse.unquote(encoded_key) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py new file mode 100644 index 0000000000..0ade0f3846 --- /dev/null +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -0,0 +1,784 @@ +from __future__ import annotations + +import asyncio +import json +import shutil +from datetime import datetime, timezone +from logging import getLogger +from pathlib import Path +from typing import TYPE_CHECKING + +from pydantic import ValidationError +from typing_extensions import override + +from crawlee import Request +from crawlee._utils.crypto import crypto_random_object_id +from crawlee.storage_clients._base import RequestQueueClient +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata + +from ._utils import METADATA_FILENAME, atomic_write_text, json_dumps + +if TYPE_CHECKING: + from collections.abc import Sequence + + from crawlee.configuration import Configuration + +logger = getLogger(__name__) + + +class FileSystemRequestQueueClient(RequestQueueClient): + """A file system implementation of the request queue client. + + This client persists requests to the file system as individual JSON files, making it suitable for scenarios + where data needs to survive process restarts. Each request is stored as a separate file in a directory + structure following the pattern: + + ``` + {STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json + ``` + + The implementation uses file timestamps for FIFO ordering of regular requests and maintains in-memory sets + for tracking in-progress and forefront requests. File system storage provides durability at the cost of + slower I/O operations compared to memory-based storage. + + This implementation is ideal for long-running crawlers where persistence is important and for situations + where you need to resume crawling after process termination. 
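# Editor's note (illustrative sketch, not part of the patch): the request queue layout just
# described, spelled out with an assumed example - each request is one JSON file named after
# its request ID, alongside the queue's metadata file, while the in-progress and forefront
# bookkeeping is kept in memory.
#
#   {STORAGE_DIR}/request_queues/<queue directory>/
#       <METADATA_FILENAME>     <- RequestQueueMetadata (counts, timestamps)
#       dnE8LLqf3lgmtaJ.json    <- one persisted Request (file name = request ID; example value)
#
# A quick way to peek at persisted requests while debugging; the path below assumes the
# default local storage location and is not taken from the patch:
import json
from pathlib import Path

queue_dir = Path('./storage/request_queues/default')  # assumed default storage location
for request_file in sorted(queue_dir.glob('*.json')):
    data = json.loads(request_file.read_text(encoding='utf-8'))
    if 'url' not in data:
        continue  # skip the queue metadata file
    print(request_file.name, data['url'], 'handled' if data.get('handled_at') else 'pending')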
+ """ + + _STORAGE_SUBDIR = 'request_queues' + """The name of the subdirectory where request queues are stored.""" + + _STORAGE_SUBSUBDIR_DEFAULT = 'default' + """The name of the subdirectory for the default request queue.""" + + def __init__( + self, + *, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + had_multiple_clients: bool, + handled_request_count: int, + pending_request_count: int, + stats: dict, + total_request_count: int, + storage_dir: Path, + ) -> None: + """Initialize a new instance. + + Preferably use the `FileSystemRequestQueueClient.open` class method to create a new instance. + """ + self._metadata = RequestQueueMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + had_multiple_clients=had_multiple_clients, + handled_request_count=handled_request_count, + pending_request_count=pending_request_count, + stats=stats, + total_request_count=total_request_count, + ) + + self._storage_dir = storage_dir + + # Internal attributes + self._lock = asyncio.Lock() + """A lock to ensure that only one operation is performed at a time.""" + + self._in_progress = set[str]() + """A set of request IDs that are currently being processed.""" + + self._forefront_requests = list[str]() + """A list of request IDs that should be prioritized (added with forefront=True). + Most recent forefront requests are added at the beginning of the list.""" + + self._sequence_counter = 0 + """A counter to track the order of requests added to the queue.""" + + @override + @property + def metadata(self) -> RequestQueueMetadata: + return self._metadata + + @property + def path_to_rq(self) -> Path: + """The full path to the request queue directory.""" + if self.metadata.name is None: + return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT + + return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name + + @property + def path_to_metadata(self) -> Path: + """The full path to the request queue metadata file.""" + return self.path_to_rq / METADATA_FILENAME + + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> FileSystemRequestQueueClient: + storage_dir = Path(configuration.storage_dir) + rq_base_path = storage_dir / cls._STORAGE_SUBDIR + + if not rq_base_path.exists(): + await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True) + + # Get a new instance by ID. 
+ if id: + found = False + for rq_dir in rq_base_path.iterdir(): + if not rq_dir.is_dir(): + continue + + metadata_path = rq_dir / METADATA_FILENAME + if not metadata_path.exists(): + continue + + try: + file = await asyncio.to_thread(metadata_path.open) + try: + file_content = json.load(file) + metadata = RequestQueueMetadata(**file_content) + if metadata.id == id: + client = cls( + id=metadata.id, + name=metadata.name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + had_multiple_clients=metadata.had_multiple_clients, + handled_request_count=metadata.handled_request_count, + pending_request_count=metadata.pending_request_count, + stats=metadata.stats, + total_request_count=metadata.total_request_count, + storage_dir=storage_dir, + ) + await client._update_metadata(update_accessed_at=True) + found = True + break + finally: + await asyncio.to_thread(file.close) + except (json.JSONDecodeError, ValidationError): + continue + + if not found: + raise ValueError(f'Request queue with ID "{id}" not found') + + # Get a new instance by name. + else: + rq_path = rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else rq_base_path / name + metadata_path = rq_path / METADATA_FILENAME + + # If the RQ directory exists, reconstruct the client from the metadata file. + if rq_path.exists(): + # If metadata file is missing, raise an error. + if not metadata_path.exists(): + raise ValueError(f'Metadata file not found for request queue "{name}"') + + file = await asyncio.to_thread(open, metadata_path) + try: + file_content = json.load(file) + finally: + await asyncio.to_thread(file.close) + try: + metadata = RequestQueueMetadata(**file_content) + except ValidationError as exc: + raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc + + client = cls( + id=metadata.id, + name=name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + had_multiple_clients=metadata.had_multiple_clients, + handled_request_count=metadata.handled_request_count, + pending_request_count=metadata.pending_request_count, + stats=metadata.stats, + total_request_count=metadata.total_request_count, + storage_dir=storage_dir, + ) + + await client._update_metadata(update_accessed_at=True) + + # Otherwise, create a new dataset client. + else: + now = datetime.now(timezone.utc) + client = cls( + id=crypto_random_object_id(), + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + had_multiple_clients=False, + handled_request_count=0, + pending_request_count=0, + stats={}, + total_request_count=0, + storage_dir=storage_dir, + ) + await client._update_metadata() + + return client + + @override + async def drop(self) -> None: + # If the client directory exists, remove it recursively. 
+ if self.path_to_rq.exists(): + async with self._lock: + await asyncio.to_thread(shutil.rmtree, self.path_to_rq) + + @override + async def purge(self) -> None: + async with self._lock: + for file_path in self.path_to_rq.glob('*'): + if file_path.name == METADATA_FILENAME: + continue + await asyncio.to_thread(file_path.unlink) + + # Update metadata counts + await self._update_metadata( + update_modified_at=True, + update_accessed_at=True, + new_handled_request_count=0, + new_pending_request_count=0, + new_total_request_count=0, + ) + + @override + async def add_batch_of_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> AddRequestsResponse: + """Add a batch of requests to the queue. + + Args: + requests: The requests to add. + forefront: Whether to add the requests to the beginning of the queue. + + Returns: + Response containing information about the added requests. + """ + async with self._lock: + new_total_request_count = self._metadata.total_request_count + new_pending_request_count = self._metadata.pending_request_count + + processed_requests = [] + + # Create the requests directory if it doesn't exist + await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) + + for request in requests: + # Check if the request is already in the queue by unique_key + existing_request = None + + # List all request files and check for matching unique_key + request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) + for request_file in request_files: + # Skip metadata file + if request_file.name == METADATA_FILENAME: + continue + + file = await asyncio.to_thread(open, request_file) + try: + file_content = json.load(file) + if file_content.get('unique_key') == request.unique_key: + existing_request = Request(**file_content) + break + except (json.JSONDecodeError, ValidationError): + logger.warning(f'Failed to parse request file: {request_file}') + finally: + await asyncio.to_thread(file.close) + + was_already_present = existing_request is not None + was_already_handled = ( + was_already_present and existing_request and existing_request.handled_at is not None + ) + + # If the request is already in the queue and handled, don't add it again + if was_already_handled and existing_request: + processed_requests.append( + ProcessedRequest( + id=existing_request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=True, + ) + ) + continue + + # If forefront and existing request is not handled, mark it as forefront + if forefront and was_already_present and not was_already_handled and existing_request: + self._forefront_requests.insert(0, existing_request.id) + processed_requests.append( + ProcessedRequest( + id=existing_request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ) + ) + continue + + # If the request is already in the queue but not handled, update it + if was_already_present and existing_request: + # Update the existing request file + request_path = self.path_to_rq / f'{existing_request.id}.json' + request_data = await json_dumps(existing_request.model_dump()) + await atomic_write_text(request_path, request_data) + + processed_requests.append( + ProcessedRequest( + id=existing_request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ) + ) + continue + + # Add the new request to the queue + request_path = self.path_to_rq / f'{request.id}.json' + + # Create a data dictionary from the request and 
remove handled_at if it's None + request_dict = request.model_dump() + if request_dict.get('handled_at') is None: + request_dict.pop('handled_at', None) + + # Add sequence number to ensure FIFO ordering + sequence_number = self._sequence_counter + self._sequence_counter += 1 + request_dict['_sequence'] = sequence_number + + request_data = await json_dumps(request_dict) + await atomic_write_text(request_path, request_data) + + # Update metadata counts + new_total_request_count += 1 + new_pending_request_count += 1 + + # If forefront, add to the forefront list + if forefront: + self._forefront_requests.insert(0, request.id) + + processed_requests.append( + ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=False, + was_already_handled=False, + ) + ) + + await self._update_metadata( + update_modified_at=True, + update_accessed_at=True, + new_total_request_count=new_total_request_count, + new_pending_request_count=new_pending_request_count, + ) + + return AddRequestsResponse( + processed_requests=processed_requests, + unprocessed_requests=[], + ) + + @override + async def get_request(self, request_id: str) -> Request | None: + """Retrieve a request from the queue. + + Args: + request_id: ID of the request to retrieve. + + Returns: + The retrieved request, or None, if it did not exist. + """ + request_path = self.path_to_rq / f'{request_id}.json' + + try: + file = await asyncio.to_thread(open, request_path) + except FileNotFoundError: + logger.warning(f'Request file "{request_path}" not found.') + return None + + try: + file_content = json.load(file) + except json.JSONDecodeError as exc: + logger.warning(f'Failed to parse request file {request_path}: {exc!s}') + return None + finally: + await asyncio.to_thread(file.close) + + try: + return Request(**file_content) + except ValidationError as exc: + logger.warning(f'Failed to validate request file {request_path}: {exc!s}') + + return None + + @override + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. + + Once you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled` + to mark the request as handled in the queue. If there was some error in processing the request, call + `RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer + in another call to the `fetch_next_request` method. + + Returns: + The request or `None` if there are no more pending requests. 
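+
+        Example (an illustrative consumer-loop sketch; it assumes `client` is an already opened
+        `FileSystemRequestQueueClient` and `process` is a user-supplied coroutine):
+
+            while not await client.is_empty():
+                request = await client.fetch_next_request()
+                if request is None:
+                    break
+                try:
+                    await process(request)
+                except Exception:
+                    # Processing failed, give the request back to the queue so it can be retried.
+                    await client.reclaim_request(request)
+                else:
+                    await client.mark_request_as_handled(request)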
+ """ + async with self._lock: + # Create the requests directory if it doesn't exist + await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) + + # First check forefront requests in the exact order they were added + for request_id in list(self._forefront_requests): + # Skip if already in progress + if request_id in self._in_progress: + continue + + request_path = self.path_to_rq / f'{request_id}.json' + + # Skip if file doesn't exist + if not await asyncio.to_thread(request_path.exists): + self._forefront_requests.remove(request_id) + continue + + file = await asyncio.to_thread(open, request_path) + try: + file_content = json.load(file) + # Skip if already handled + if file_content.get('handled_at') is not None: + self._forefront_requests.remove(request_id) + continue + + # Create request object + request = Request(**file_content) + + # Mark as in-progress in memory + self._in_progress.add(request.id) + + # Remove from forefront list + self._forefront_requests.remove(request.id) + + # Update accessed timestamp + await self._update_metadata(update_accessed_at=True) + except (json.JSONDecodeError, ValidationError) as exc: + logger.warning(f'Failed to parse request file {request_path}: {exc!s}') + self._forefront_requests.remove(request_id) + else: + return request + finally: + await asyncio.to_thread(file.close) + + # List all request files for regular (non-forefront) requests + request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) + + # Dictionary to store request files by their sequence number + request_sequences = {} + requests_without_sequence = [] + + # Filter out metadata files and in-progress requests + for request_file in request_files: + # Skip metadata file + if request_file.name == METADATA_FILENAME: + continue + + # Extract request ID from filename + request_id = request_file.stem + + # Skip if already in progress or in forefront + if request_id in self._in_progress or request_id in self._forefront_requests: + continue + + # Read the file to get the sequence number + try: + file = await asyncio.to_thread(open, request_file) + try: + file_content = json.load(file) + # Skip if already handled + if file_content.get('handled_at') is not None: + continue + + # Use sequence number for ordering if available + sequence_number = file_content.get('_sequence') + if sequence_number is not None: + request_sequences[sequence_number] = request_file + else: + # For backward compatibility with existing files + requests_without_sequence.append(request_file) + finally: + await asyncio.to_thread(file.close) + except (json.JSONDecodeError, ValidationError) as exc: + logger.warning(f'Failed to parse request file {request_file}: {exc!s}') + + # Process requests with sequence numbers first, in FIFO order + for sequence in sorted(request_sequences.keys()): + request_file = request_sequences[sequence] + file = await asyncio.to_thread(open, request_file) + try: + file_content = json.load(file) + # Skip if already handled (double-check) + if file_content.get('handled_at') is not None: + continue + + # Create request object + request = Request(**file_content) + + # Mark as in-progress in memory + self._in_progress.add(request.id) + + # Update accessed timestamp + await self._update_metadata(update_accessed_at=True) + except (json.JSONDecodeError, ValidationError) as exc: + logger.warning(f'Failed to parse request file {request_file}: {exc!s}') + else: + return request + finally: + await asyncio.to_thread(file.close) + + # Process requests without sequence numbers 
using file timestamps (backward compatibility) + if requests_without_sequence: + # Get file creation times for sorting + request_file_times = {} + for request_file in requests_without_sequence: + try: + file_stat = await asyncio.to_thread(request_file.stat) + request_file_times[request_file] = file_stat.st_mtime + except Exception: # noqa: PERF203 + # If we can't get the time, use 0 (oldest) + request_file_times[request_file] = 0 + + # Sort by creation time + requests_without_sequence.sort(key=lambda f: request_file_times[f]) + + # Process requests without sequence in file timestamp order + for request_file in requests_without_sequence: + file = await asyncio.to_thread(open, request_file) + try: + file_content = json.load(file) + # Skip if already handled + if file_content.get('handled_at') is not None: + continue + + # Create request object + request = Request(**file_content) + + # Mark as in-progress in memory + self._in_progress.add(request.id) + + # Update accessed timestamp + await self._update_metadata(update_accessed_at=True) + except (json.JSONDecodeError, ValidationError) as exc: + logger.warning(f'Failed to parse request file {request_file}: {exc!s}') + else: + return request + finally: + await asyncio.to_thread(file.close) + + return None + + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as handled after successful processing. + + Handled requests will never again be returned by the `fetch_next_request` method. + + Args: + request: The request to mark as handled. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. + """ + async with self._lock: + # Check if the request is in progress + if request.id not in self._in_progress: + return None + + # Remove from in-progress set + self._in_progress.discard(request.id) + + # Update the request object - set handled_at timestamp + if request.handled_at is None: + request.handled_at = datetime.now(timezone.utc) + + # Write the updated request back to the requests directory + request_path = self.path_to_rq / f'{request.id}.json' + + if not await asyncio.to_thread(request_path.exists): + return None + + request_data = await json_dumps(request.model_dump()) + await atomic_write_text(request_path, request_data) + + # Update metadata timestamps + await self._update_metadata( + update_modified_at=True, + update_accessed_at=True, + new_handled_request_count=self._metadata.handled_request_count + 1, + new_pending_request_count=self._metadata.pending_request_count - 1, + ) + + return ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=True, + ) + + @override + async def reclaim_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. + + The request will be returned for processing later again by another call to `fetch_next_request`. + + Args: + request: The request to return to the queue. + forefront: Whether to add the request to the head or the end of the queue. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. 
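+
+        Example (an illustrative sketch; assumes `client` is an opened `FileSystemRequestQueueClient`
+        and `request` was previously returned by `fetch_next_request`):
+
+            # Put the failed request at the head of the queue so it is retried before other pending requests.
+            result = await client.reclaim_request(request, forefront=True)
+            if result is None:
+                # The request was not in progress, so nothing was reclaimed.
+                ...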
+ """ + async with self._lock: + # Check if the request is in progress + if request.id not in self._in_progress: + return None + + # Remove from in-progress set + self._in_progress.discard(request.id) + + # If forefront is true, mark this request as priority + if forefront: + self._forefront_requests.insert(0, request.id) + # Make sure it's not in the forefront list if it was previously added there + elif request.id in self._forefront_requests: + self._forefront_requests.remove(request.id) + + # To simulate changing the file timestamp for FIFO ordering, + # we'll update the file with current timestamp + request_path = self.path_to_rq / f'{request.id}.json' + + if not await asyncio.to_thread(request_path.exists): + return None + + request_data = await json_dumps(request.model_dump()) + await atomic_write_text(request_path, request_data) + + # Update metadata timestamps + await self._update_metadata(update_modified_at=True, update_accessed_at=True) + + return ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ) + + @override + async def is_empty(self) -> bool: + """Check if the queue is empty. + + Returns: + True if the queue is empty, False otherwise. + """ + async with self._lock: + # Update accessed timestamp when checking if queue is empty + await self._update_metadata(update_accessed_at=True) + + # Create the requests directory if it doesn't exist + await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) + + # List all request files + request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) + + # Check each file to see if there are any unhandled requests + for request_file in request_files: + # Skip metadata file + if request_file.name == METADATA_FILENAME: + continue + + try: + file = await asyncio.to_thread(open, request_file) + except FileNotFoundError: + logger.warning(f'Request file "{request_file}" not found.') + continue + + try: + file_content = json.load(file) + except json.JSONDecodeError: + logger.warning(f'Failed to parse request file: {request_file}') + finally: + await asyncio.to_thread(file.close) + + # If any request is not handled, the queue is not empty + if file_content.get('handled_at') is None: + return False + + # If we got here, all requests are handled or there are no requests + return True + + async def _update_metadata( + self, + *, + new_handled_request_count: int | None = None, + new_pending_request_count: int | None = None, + new_total_request_count: int | None = None, + update_had_multiple_clients: bool = False, + update_accessed_at: bool = False, + update_modified_at: bool = False, + ) -> None: + """Update the dataset metadata file with current information. + + Args: + new_handled_request_count: If provided, update the handled_request_count to this value. + new_pending_request_count: If provided, update the pending_request_count to this value. + new_total_request_count: If provided, update the total_request_count to this value. + update_had_multiple_clients: If True, set had_multiple_clients to True. + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. 
+ """ + # Always create a new timestamp to ensure it's truly updated + now = datetime.now(timezone.utc) + + # Update timestamps according to parameters + if update_accessed_at: + self._metadata.accessed_at = now + + if update_modified_at: + self._metadata.modified_at = now + + # Update request counts if provided + if new_handled_request_count is not None: + self._metadata.handled_request_count = new_handled_request_count + + if new_pending_request_count is not None: + self._metadata.pending_request_count = new_pending_request_count + + if new_total_request_count is not None: + self._metadata.total_request_count = new_total_request_count + + if update_had_multiple_clients: + self._metadata.had_multiple_clients = True + + # Ensure the parent directory for the metadata file exists. + await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) + + # Dump the serialized metadata to the file. + data = await json_dumps(self._metadata.model_dump()) + await atomic_write_text(self.path_to_metadata, data) diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py new file mode 100644 index 0000000000..346fb4cdc2 --- /dev/null +++ b/src/crawlee/storage_clients/_file_system/_storage_client.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from typing_extensions import override + +from crawlee.configuration import Configuration +from crawlee.storage_clients._base import StorageClient + +from ._dataset_client import FileSystemDatasetClient +from ._key_value_store_client import FileSystemKeyValueStoreClient +from ._request_queue_client import FileSystemRequestQueueClient + + +class FileSystemStorageClient(StorageClient): + """File system storage client.""" + + @override + async def open_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> FileSystemDatasetClient: + configuration = configuration or Configuration.get_global_configuration() + client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start and client.metadata.name is None: + await client.purge() + + return client + + @override + async def open_key_value_store_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> FileSystemKeyValueStoreClient: + configuration = configuration or Configuration.get_global_configuration() + client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start and client.metadata.name is None: + await client.purge() + + return client + + @override + async def open_request_queue_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> FileSystemRequestQueueClient: + configuration = configuration or Configuration.get_global_configuration() + client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start and client.metadata.name is None: + await client.purge() + + return client diff --git a/src/crawlee/storage_clients/_file_system/_utils.py b/src/crawlee/storage_clients/_file_system/_utils.py new file mode 100644 index 0000000000..f5068a5d8d --- /dev/null +++ b/src/crawlee/storage_clients/_file_system/_utils.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import asyncio +import json 
+import os +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + from typing import Any + +METADATA_FILENAME = '__metadata__.json' +"""The name of the metadata file for storage clients.""" + + +async def json_dumps(obj: Any) -> str: + """Serialize an object to a JSON-formatted string with specific settings. + + Args: + obj: The object to serialize. + + Returns: + A string containing the JSON representation of the input object. + """ + return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str) + + +async def atomic_write_text(path: Path, data: str) -> None: + tmp = path.with_suffix(path.suffix + '.tmp') + # write to .tmp + await asyncio.to_thread(tmp.write_text, data, encoding='utf-8') + + try: + await asyncio.to_thread(os.replace, tmp, path) + except FileNotFoundError: + # If the .tmp vanished, fall back to a straight write + await asyncio.to_thread(path.write_text, data, encoding='utf-8') + + +async def atomic_write_bytes(path: Path, data: bytes) -> None: + tmp = path.with_suffix(path.suffix + '.tmp') + # write to .tmp + await asyncio.to_thread(tmp.write_bytes, data) + + try: + await asyncio.to_thread(os.replace, tmp, path) + except FileNotFoundError: + # If the .tmp vanished, fall back to a straight write + await asyncio.to_thread(path.write_bytes, data) diff --git a/src/crawlee/storage_clients/_file_system/py.typed b/src/crawlee/storage_clients/_file_system/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/crawlee/storage_clients/_memory/__init__.py b/src/crawlee/storage_clients/_memory/__init__.py index 09912e124d..3746907b4f 100644 --- a/src/crawlee/storage_clients/_memory/__init__.py +++ b/src/crawlee/storage_clients/_memory/__init__.py @@ -1,17 +1,11 @@ -from ._dataset_client import DatasetClient -from ._dataset_collection_client import DatasetCollectionClient -from ._key_value_store_client import KeyValueStoreClient -from ._key_value_store_collection_client import KeyValueStoreCollectionClient -from ._memory_storage_client import MemoryStorageClient -from ._request_queue_client import RequestQueueClient -from ._request_queue_collection_client import RequestQueueCollectionClient +from ._dataset_client import MemoryDatasetClient +from ._key_value_store_client import MemoryKeyValueStoreClient +from ._request_queue_client import MemoryRequestQueueClient +from ._storage_client import MemoryStorageClient __all__ = [ - 'DatasetClient', - 'DatasetCollectionClient', - 'KeyValueStoreClient', - 'KeyValueStoreCollectionClient', + 'MemoryDatasetClient', + 'MemoryKeyValueStoreClient', + 'MemoryRequestQueueClient', 'MemoryStorageClient', - 'RequestQueueClient', - 'RequestQueueCollectionClient', ] diff --git a/src/crawlee/storage_clients/_memory/_creation_management.py b/src/crawlee/storage_clients/_memory/_creation_management.py deleted file mode 100644 index f6d4fc1c91..0000000000 --- a/src/crawlee/storage_clients/_memory/_creation_management.py +++ /dev/null @@ -1,429 +0,0 @@ -from __future__ import annotations - -import asyncio -import json -import mimetypes -import os -import pathlib -from datetime import datetime, timezone -from logging import getLogger -from typing import TYPE_CHECKING - -from crawlee._consts import METADATA_FILENAME -from crawlee._utils.data_processing import maybe_parse_body -from crawlee._utils.file import json_dumps -from crawlee.storage_clients.models import ( - DatasetMetadata, - InternalRequest, - KeyValueStoreMetadata, - KeyValueStoreRecord, - KeyValueStoreRecordMetadata, - 
RequestQueueMetadata, -) - -if TYPE_CHECKING: - from ._dataset_client import DatasetClient - from ._key_value_store_client import KeyValueStoreClient - from ._memory_storage_client import MemoryStorageClient, TResourceClient - from ._request_queue_client import RequestQueueClient - -logger = getLogger(__name__) - - -async def persist_metadata_if_enabled(*, data: dict, entity_directory: str, write_metadata: bool) -> None: - """Update or writes metadata to a specified directory. - - The function writes a given metadata dictionary to a JSON file within a specified directory. - The writing process is skipped if `write_metadata` is False. Before writing, it ensures that - the target directory exists, creating it if necessary. - - Args: - data: A dictionary containing metadata to be written. - entity_directory: The directory path where the metadata file should be stored. - write_metadata: A boolean flag indicating whether the metadata should be written to file. - """ - # Skip metadata write; ensure directory exists first - if not write_metadata: - return - - # Ensure the directory for the entity exists - await asyncio.to_thread(os.makedirs, entity_directory, exist_ok=True) - - # Write the metadata to the file - file_path = os.path.join(entity_directory, METADATA_FILENAME) - f = await asyncio.to_thread(open, file_path, mode='wb') - try: - s = await json_dumps(data) - await asyncio.to_thread(f.write, s.encode('utf-8')) - finally: - await asyncio.to_thread(f.close) - - -def find_or_create_client_by_id_or_name_inner( - resource_client_class: type[TResourceClient], - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, -) -> TResourceClient | None: - """Locate or create a new storage client based on the given ID or name. - - This method attempts to find a storage client in the memory cache first. If not found, - it tries to locate a storage directory by name. If still not found, it searches through - storage directories for a matching ID or name in their metadata. If none exists, and the - specified ID is 'default', it checks for a default storage directory. If a storage client - is found or created, it is added to the memory cache. If no storage client can be located or - created, the method returns None. - - Args: - resource_client_class: The class of the resource client. - memory_storage_client: The memory storage client used to store and retrieve storage clients. - id: The unique identifier for the storage client. - name: The name of the storage client. - - Raises: - ValueError: If both id and name are None. - - Returns: - The found or created storage client, or None if no client could be found or created. 
- """ - from ._dataset_client import DatasetClient - from ._key_value_store_client import KeyValueStoreClient - from ._request_queue_client import RequestQueueClient - - if id is None and name is None: - raise ValueError('Either id or name must be specified.') - - # First check memory cache - found = memory_storage_client.get_cached_resource_client(resource_client_class, id, name) - - if found is not None: - return found - - storage_path = _determine_storage_path(resource_client_class, memory_storage_client, id, name) - - if not storage_path: - return None - - # Create from directory if storage path is found - if issubclass(resource_client_class, DatasetClient): - resource_client = create_dataset_from_directory(storage_path, memory_storage_client, id, name) - elif issubclass(resource_client_class, KeyValueStoreClient): - resource_client = create_kvs_from_directory(storage_path, memory_storage_client, id, name) - elif issubclass(resource_client_class, RequestQueueClient): - resource_client = create_rq_from_directory(storage_path, memory_storage_client, id, name) - else: - raise TypeError('Invalid resource client class.') - - memory_storage_client.add_resource_client_to_cache(resource_client) - return resource_client - - -async def get_or_create_inner( - *, - memory_storage_client: MemoryStorageClient, - storage_client_cache: list[TResourceClient], - resource_client_class: type[TResourceClient], - name: str | None = None, - id: str | None = None, -) -> TResourceClient: - """Retrieve a named storage, or create a new one when it doesn't exist. - - Args: - memory_storage_client: The memory storage client. - storage_client_cache: The cache of storage clients. - resource_client_class: The class of the storage to retrieve or create. - name: The name of the storage to retrieve or create. - id: ID of the storage to retrieve or create. - - Returns: - The retrieved or newly-created storage. 
- """ - # If the name or id is provided, try to find the dataset in the cache - if name or id: - found = find_or_create_client_by_id_or_name_inner( - resource_client_class=resource_client_class, - memory_storage_client=memory_storage_client, - name=name, - id=id, - ) - if found: - return found - - # Otherwise, create a new one and add it to the cache - resource_client = resource_client_class( - id=id, - name=name, - memory_storage_client=memory_storage_client, - ) - - storage_client_cache.append(resource_client) - - # Write to the disk - await persist_metadata_if_enabled( - data=resource_client.resource_info.model_dump(), - entity_directory=resource_client.resource_directory, - write_metadata=memory_storage_client.write_metadata, - ) - - return resource_client - - -def create_dataset_from_directory( - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, -) -> DatasetClient: - from ._dataset_client import DatasetClient - - item_count = 0 - has_seen_metadata_file = False - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - - # Load metadata if it exists - metadata_filepath = os.path.join(storage_directory, METADATA_FILENAME) - - if os.path.exists(metadata_filepath): - has_seen_metadata_file = True - with open(metadata_filepath, encoding='utf-8') as f: - json_content = json.load(f) - resource_info = DatasetMetadata(**json_content) - - id = resource_info.id - name = resource_info.name - item_count = resource_info.item_count - created_at = resource_info.created_at - accessed_at = resource_info.accessed_at - modified_at = resource_info.modified_at - - # Load dataset entries - entries: dict[str, dict] = {} - - for entry in os.scandir(storage_directory): - if entry.is_file(): - if entry.name == METADATA_FILENAME: - has_seen_metadata_file = True - continue - - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - entry_content = json.load(f) - - entry_name = entry.name.split('.')[0] - entries[entry_name] = entry_content - - if not has_seen_metadata_file: - item_count += 1 - - # Create new dataset client - new_client = DatasetClient( - memory_storage_client=memory_storage_client, - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - item_count=item_count, - ) - - new_client.dataset_entries.update(entries) - return new_client - - -def create_kvs_from_directory( - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, -) -> KeyValueStoreClient: - from ._key_value_store_client import KeyValueStoreClient - - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - - # Load metadata if it exists - metadata_filepath = os.path.join(storage_directory, METADATA_FILENAME) - - if os.path.exists(metadata_filepath): - with open(metadata_filepath, encoding='utf-8') as f: - json_content = json.load(f) - resource_info = KeyValueStoreMetadata(**json_content) - - id = resource_info.id - name = resource_info.name - created_at = resource_info.created_at - accessed_at = resource_info.accessed_at - modified_at = resource_info.modified_at - - # Create new KVS client - new_client = KeyValueStoreClient( - memory_storage_client=memory_storage_client, - id=id, - name=name, - accessed_at=accessed_at, - created_at=created_at, - modified_at=modified_at, - ) - - # Scan the KVS 
folder, check each entry in there and parse it as a store record - for entry in os.scandir(storage_directory): - if not entry.is_file(): - continue - - # Ignore metadata files on their own - if entry.name.endswith(METADATA_FILENAME): - continue - - # Try checking if this file has a metadata file associated with it - record_metadata = None - record_metadata_filepath = os.path.join(storage_directory, f'{entry.name}.__metadata__.json') - - if os.path.exists(record_metadata_filepath): - with open(record_metadata_filepath, encoding='utf-8') as metadata_file: - try: - json_content = json.load(metadata_file) - record_metadata = KeyValueStoreRecordMetadata(**json_content) - - except Exception: - logger.warning( - f'Metadata of key-value store entry "{entry.name}" for store {name or id} could ' - 'not be parsed. The metadata file will be ignored.', - exc_info=True, - ) - - if not record_metadata: - content_type, _ = mimetypes.guess_type(entry.name) - if content_type is None: - content_type = 'application/octet-stream' - - record_metadata = KeyValueStoreRecordMetadata( - key=pathlib.Path(entry.name).stem, - content_type=content_type, - ) - - with open(os.path.join(storage_directory, entry.name), 'rb') as f: - file_content = f.read() - - try: - maybe_parse_body(file_content, record_metadata.content_type) - except Exception: - record_metadata.content_type = 'application/octet-stream' - logger.warning( - f'Key-value store entry "{record_metadata.key}" for store {name or id} could not be parsed.' - 'The entry will be assumed as binary.', - exc_info=True, - ) - - new_client.records[record_metadata.key] = KeyValueStoreRecord( - key=record_metadata.key, - content_type=record_metadata.content_type, - filename=entry.name, - value=file_content, - ) - - return new_client - - -def create_rq_from_directory( - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, -) -> RequestQueueClient: - from ._request_queue_client import RequestQueueClient - - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - handled_request_count = 0 - pending_request_count = 0 - - # Load metadata if it exists - metadata_filepath = os.path.join(storage_directory, METADATA_FILENAME) - - if os.path.exists(metadata_filepath): - with open(metadata_filepath, encoding='utf-8') as f: - json_content = json.load(f) - resource_info = RequestQueueMetadata(**json_content) - - id = resource_info.id - name = resource_info.name - created_at = resource_info.created_at - accessed_at = resource_info.accessed_at - modified_at = resource_info.modified_at - handled_request_count = resource_info.handled_request_count - pending_request_count = resource_info.pending_request_count - - # Load request entries - entries: dict[str, InternalRequest] = {} - - for entry in os.scandir(storage_directory): - if entry.is_file(): - if entry.name == METADATA_FILENAME: - continue - - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - content = json.load(f) - - request = InternalRequest(**content) - - entries[request.id] = request - - # Create new RQ client - new_client = RequestQueueClient( - memory_storage_client=memory_storage_client, - id=id, - name=name, - accessed_at=accessed_at, - created_at=created_at, - modified_at=modified_at, - handled_request_count=handled_request_count, - pending_request_count=pending_request_count, - ) - - new_client.requests.update(entries) - return new_client - - -def 
_determine_storage_path( - resource_client_class: type[TResourceClient], - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, -) -> str | None: - storages_dir = memory_storage_client._get_storage_dir(resource_client_class) # noqa: SLF001 - default_id = memory_storage_client._get_default_storage_id(resource_client_class) # noqa: SLF001 - - # Try to find by name directly from directories - if name: - possible_storage_path = os.path.join(storages_dir, name) - if os.access(possible_storage_path, os.F_OK): - return possible_storage_path - - # If not found, try finding by metadata - if os.access(storages_dir, os.F_OK): - for entry in os.scandir(storages_dir): - if entry.is_dir(): - metadata_path = os.path.join(entry.path, METADATA_FILENAME) - if os.access(metadata_path, os.F_OK): - with open(metadata_path, encoding='utf-8') as metadata_file: - try: - metadata = json.load(metadata_file) - if (id and metadata.get('id') == id) or (name and metadata.get('name') == name): - return entry.path - except Exception: - logger.warning( - f'Metadata of store entry "{entry.name}" for store {name or id} could not be parsed. ' - 'The metadata file will be ignored.', - exc_info=True, - ) - - # Check for default storage directory as a last resort - if id == default_id: - possible_storage_path = os.path.join(storages_dir, default_id) - if os.access(possible_storage_path, os.F_OK): - return possible_storage_path - - return None diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 50c8c7c8d4..9a78ac7f6a 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -1,162 +1,135 @@ from __future__ import annotations -import asyncio -import json -import os -import shutil from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any from typing_extensions import override -from crawlee._types import StorageTypes from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.data_processing import raise_on_duplicate_storage, raise_on_non_existing_storage -from crawlee._utils.file import force_rename, json_dumps -from crawlee.storage_clients._base import DatasetClient as BaseDatasetClient +from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata -from ._creation_management import find_or_create_client_by_id_or_name_inner - if TYPE_CHECKING: from collections.abc import AsyncIterator - from contextlib import AbstractAsyncContextManager - - from httpx import Response - from crawlee._types import JsonSerializable - from crawlee.storage_clients import MemoryStorageClient + from crawlee.configuration import Configuration logger = getLogger(__name__) -class DatasetClient(BaseDatasetClient): - """Subclient for manipulating a single dataset.""" +class MemoryDatasetClient(DatasetClient): + """Memory implementation of the dataset client. - _LIST_ITEMS_LIMIT = 999_999_999_999 - """This is what API returns in the x-apify-pagination-limit header when no limit query parameter is used.""" + This client stores dataset items in memory using Python lists and dictionaries. No data is persisted + between process runs, meaning all stored data is lost when the program terminates. 
This implementation + is primarily useful for testing, development, and short-lived crawler operations where persistent + storage is not required. - _LOCAL_ENTRY_NAME_DIGITS = 9 - """Number of characters of the dataset item file names, e.g.: 000000019.json - 9 digits.""" + The memory implementation provides fast access to data but is limited by available memory and + does not support data sharing across different processes. It supports all dataset operations including + sorting, filtering, and pagination, but performs them entirely in memory. + """ def __init__( self, *, - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, - created_at: datetime | None = None, - accessed_at: datetime | None = None, - modified_at: datetime | None = None, - item_count: int = 0, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + item_count: int, ) -> None: - self._memory_storage_client = memory_storage_client - self.id = id or crypto_random_object_id() - self.name = name - self._created_at = created_at or datetime.now(timezone.utc) - self._accessed_at = accessed_at or datetime.now(timezone.utc) - self._modified_at = modified_at or datetime.now(timezone.utc) - - self.dataset_entries: dict[str, dict] = {} - self.file_operation_lock = asyncio.Lock() - self.item_count = item_count + """Initialize a new instance. - @property - def resource_info(self) -> DatasetMetadata: - """Get the resource info for the dataset client.""" - return DatasetMetadata( - id=self.id, - name=self.name, - accessed_at=self._accessed_at, - created_at=self._created_at, - modified_at=self._modified_at, - item_count=self.item_count, + Preferably use the `MemoryDatasetClient.open` class method to create a new instance. 
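+
+        Example (an illustrative sketch; `Configuration` is `crawlee.configuration.Configuration`):
+
+            configuration = Configuration.get_global_configuration()
+            client = await MemoryDatasetClient.open(id=None, name=None, configuration=configuration)
+            await client.push_data({'title': 'Example'})
+            page = await client.get_data()
+            # page.items == [{'title': 'Example'}]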
+ """ + self._metadata = DatasetMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + item_count=item_count, ) + # List to hold dataset items + self._records = list[dict[str, Any]]() + + @override @property - def resource_directory(self) -> str: - """Get the resource directory for the client.""" - return os.path.join(self._memory_storage_client.datasets_directory, self.name or self.id) + def metadata(self) -> DatasetMetadata: + return self._metadata @override - async def get(self) -> DatasetMetadata | None: - found = find_or_create_client_by_id_or_name_inner( - resource_client_class=DatasetClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> MemoryDatasetClient: + # Otherwise create a new dataset + dataset_id = id or crypto_random_object_id() + now = datetime.now(timezone.utc) + + return cls( + id=dataset_id, + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + item_count=0, ) - if found: - async with found.file_operation_lock: - await found.update_timestamps(has_been_modified=False) - return found.resource_info - - return None - @override - async def update(self, *, name: str | None = None) -> DatasetMetadata: - # Check by id - existing_dataset_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=DatasetClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, + async def drop(self) -> None: + self._records.clear() + await self._update_metadata( + update_accessed_at=True, + update_modified_at=True, + new_item_count=0, ) - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self.id) - - # Skip if no changes - if name is None: - return existing_dataset_by_id.resource_info - - async with existing_dataset_by_id.file_operation_lock: - # Check that name is not in use already - existing_dataset_by_name = next( - ( - dataset - for dataset in self._memory_storage_client.datasets_handled - if dataset.name and dataset.name.lower() == name.lower() - ), - None, - ) - - if existing_dataset_by_name is not None: - raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name) - - previous_dir = existing_dataset_by_id.resource_directory - existing_dataset_by_id.name = name - - await force_rename(previous_dir, existing_dataset_by_id.resource_directory) - - # Update timestamps - await existing_dataset_by_id.update_timestamps(has_been_modified=True) + @override + async def purge(self) -> None: + """Delete all records from the dataset, but keep the dataset itself. - return existing_dataset_by_id.resource_info + This method clears all data items from the dataset while preserving the dataset structure. 
+ """ + self._records.clear() + await self._update_metadata( + update_accessed_at=True, + update_modified_at=True, + new_item_count=0, + ) @override - async def delete(self) -> None: - dataset = next( - (dataset for dataset in self._memory_storage_client.datasets_handled if dataset.id == self.id), None + async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: + new_item_count = self.metadata.item_count + + if isinstance(data, list): + for item in data: + new_item_count += 1 + await self._push_item(item) + else: + new_item_count += 1 + await self._push_item(data) + + await self._update_metadata( + update_accessed_at=True, + update_modified_at=True, + new_item_count=new_item_count, ) - if dataset is not None: - async with dataset.file_operation_lock: - self._memory_storage_client.datasets_handled.remove(dataset) - dataset.item_count = 0 - dataset.dataset_entries.clear() - - if os.path.exists(dataset.resource_directory): - await asyncio.to_thread(shutil.rmtree, dataset.resource_directory) - @override - async def list_items( + async def get_data( self, *, - offset: int | None = 0, - limit: int | None = _LIST_ITEMS_LIMIT, + offset: int = 0, + limit: int | None = 999_999_999_999, clean: bool = False, desc: bool = False, fields: list[str] | None = None, @@ -167,44 +140,48 @@ async def list_items( flatten: list[str] | None = None, view: str | None = None, ) -> DatasetItemsListPage: - # Check by id - existing_dataset_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=DatasetClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self.id) - - async with existing_dataset_by_id.file_operation_lock: - start, end = existing_dataset_by_id.get_start_and_end_indexes( - max(existing_dataset_by_id.item_count - (offset or 0) - (limit or self._LIST_ITEMS_LIMIT), 0) - if desc - else offset or 0, - limit, + # Check for unsupported arguments and log a warning if found + unsupported_args = { + 'clean': clean, + 'fields': fields, + 'omit': omit, + 'unwind': unwind, + 'skip_hidden': skip_hidden, + 'flatten': flatten, + 'view': view, + } + unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} + + if unsupported: + logger.warning( + f'The arguments {list(unsupported.keys())} of get_data are not supported ' + f'by the {self.__class__.__name__} client.' 
) - items = [] + total = len(self._records) + items = self._records.copy() - for idx in range(start, end): - entry_number = self._generate_local_entry_name(idx) - items.append(existing_dataset_by_id.dataset_entries[entry_number]) + # Apply skip_empty filter if requested + if skip_empty: + items = [item for item in items if item] - await existing_dataset_by_id.update_timestamps(has_been_modified=False) + # Apply sorting + if desc: + items = list(reversed(items)) - if desc: - items.reverse() + # Apply pagination + sliced_items = items[offset : (offset + limit) if limit is not None else total] - return DatasetItemsListPage( - count=len(items), - desc=desc or False, - items=items, - limit=limit or self._LIST_ITEMS_LIMIT, - offset=offset or 0, - total=existing_dataset_by_id.item_count, - ) + await self._update_metadata(update_accessed_at=True) + + return DatasetItemsListPage( + count=len(sliced_items), + offset=offset, + limit=limit or (total - offset), + total=total, + desc=desc, + items=sliced_items, + ) @override async def iterate_items( @@ -220,191 +197,66 @@ async def iterate_items( skip_empty: bool = False, skip_hidden: bool = False, ) -> AsyncIterator[dict]: - cache_size = 1000 - first_item = offset - - # If there is no limit, set last_item to None until we get the total from the first API response - last_item = None if limit is None else offset + limit - current_offset = first_item - - while last_item is None or current_offset < last_item: - current_limit = cache_size if last_item is None else min(cache_size, last_item - current_offset) - - current_items_page = await self.list_items( - offset=current_offset, - limit=current_limit, - desc=desc, + # Check for unsupported arguments and log a warning if found + unsupported_args = { + 'clean': clean, + 'fields': fields, + 'omit': omit, + 'unwind': unwind, + 'skip_hidden': skip_hidden, + } + unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} + + if unsupported: + logger.warning( + f'The arguments {list(unsupported.keys())} of iterate are not supported ' + f'by the {self.__class__.__name__} client.' 
) - current_offset += current_items_page.count - if last_item is None or current_items_page.total < last_item: - last_item = current_items_page.total - - for item in current_items_page.items: - yield item - - @override - async def get_items_as_bytes( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - flatten: list[str] | None = None, - ) -> bytes: - raise NotImplementedError('This method is not supported in memory storage.') - - @override - async def stream_items( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - ) -> AbstractAsyncContextManager[Response | None]: - raise NotImplementedError('This method is not supported in memory storage.') + items = self._records.copy() - @override - async def push_items( - self, - items: JsonSerializable, - ) -> None: - # Check by id - existing_dataset_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=DatasetClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self.id) + # Apply sorting + if desc: + items = list(reversed(items)) - normalized = self._normalize_items(items) + # Apply pagination + sliced_items = items[offset : (offset + limit) if limit is not None else len(items)] - added_ids: list[str] = [] - for entry in normalized: - existing_dataset_by_id.item_count += 1 - idx = self._generate_local_entry_name(existing_dataset_by_id.item_count) + # Yield items one by one + for item in sliced_items: + if skip_empty and not item: + continue + yield item - existing_dataset_by_id.dataset_entries[idx] = entry - added_ids.append(idx) - - data_entries = [(id, existing_dataset_by_id.dataset_entries[id]) for id in added_ids] - - async with existing_dataset_by_id.file_operation_lock: - await existing_dataset_by_id.update_timestamps(has_been_modified=True) - - await self._persist_dataset_items_to_disk( - data=data_entries, - entity_directory=existing_dataset_by_id.resource_directory, - persist_storage=self._memory_storage_client.persist_storage, - ) + await self._update_metadata(update_accessed_at=True) - async def _persist_dataset_items_to_disk( + async def _update_metadata( self, *, - data: list[tuple[str, dict]], - entity_directory: str, - persist_storage: bool, + new_item_count: int | None = None, + update_accessed_at: bool = False, + update_modified_at: bool = False, ) -> None: - """Write dataset items to the disk. - - The function iterates over a list of dataset items, each represented as a tuple of an identifier - and a dictionary, and writes them as individual JSON files in a specified directory. The function - will skip writing if `persist_storage` is False. 
Before writing, it ensures that the target - directory exists, creating it if necessary. + """Update the dataset metadata with current information. Args: - data: A list of tuples, each containing an identifier (string) and a data dictionary. - entity_directory: The directory path where the dataset items should be stored. - persist_storage: A boolean flag indicating whether the data should be persisted to the disk. + new_item_count: If provided, update the item count to this value. + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. """ - # Skip writing files to the disk if the client has the option set to false - if not persist_storage: - return - - # Ensure the directory for the entity exists - await asyncio.to_thread(os.makedirs, entity_directory, exist_ok=True) - - # Save all the new items to the disk - for idx, item in data: - file_path = os.path.join(entity_directory, f'{idx}.json') - f = await asyncio.to_thread(open, file_path, mode='w', encoding='utf-8') - try: - s = await json_dumps(item) - await asyncio.to_thread(f.write, s) - finally: - await asyncio.to_thread(f.close) - - async def update_timestamps(self, *, has_been_modified: bool) -> None: - """Update the timestamps of the dataset.""" - from ._creation_management import persist_metadata_if_enabled - - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - await persist_metadata_if_enabled( - data=self.resource_info.model_dump(), - entity_directory=self.resource_directory, - write_metadata=self._memory_storage_client.write_metadata, - ) - - def get_start_and_end_indexes(self, offset: int, limit: int | None = None) -> tuple[int, int]: - """Calculate the start and end indexes for listing items.""" - actual_limit = limit or self.item_count - start = offset + 1 - end = min(offset + actual_limit, self.item_count) + 1 - return (start, end) - - def _generate_local_entry_name(self, idx: int) -> str: - return str(idx).zfill(self._LOCAL_ENTRY_NAME_DIGITS) - - def _normalize_items(self, items: JsonSerializable) -> list[dict]: - def normalize_item(item: Any) -> dict | None: - if isinstance(item, str): - item = json.loads(item) + now = datetime.now(timezone.utc) - if isinstance(item, list): - received = ',\n'.join(item) - raise TypeError( - f'Each dataset item can only be a single JSON object, not an array. Received: [{received}]' - ) + if update_accessed_at: + self._metadata.accessed_at = now + if update_modified_at: + self._metadata.modified_at = now + if new_item_count is not None: + self._metadata.item_count = new_item_count - if (not isinstance(item, dict)) and item is not None: - raise TypeError(f'Each dataset item must be a JSON object. Received: {item}') + async def _push_item(self, item: dict[str, Any]) -> None: + """Push a single item to the dataset. - return item - - if isinstance(items, str): - items = json.loads(items) - - result = list(map(normalize_item, items)) if isinstance(items, list) else [normalize_item(items)] - # filter(None, ..) returns items that are True - return list(filter(None, result)) + Args: + item: The data item to add to the dataset. 
+ """ + self._records.append(item) diff --git a/src/crawlee/storage_clients/_memory/_dataset_collection_client.py b/src/crawlee/storage_clients/_memory/_dataset_collection_client.py deleted file mode 100644 index 9e32b4086b..0000000000 --- a/src/crawlee/storage_clients/_memory/_dataset_collection_client.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import DatasetCollectionClient as BaseDatasetCollectionClient -from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata - -from ._creation_management import get_or_create_inner -from ._dataset_client import DatasetClient - -if TYPE_CHECKING: - from ._memory_storage_client import MemoryStorageClient - - -class DatasetCollectionClient(BaseDatasetCollectionClient): - """Subclient for manipulating datasets.""" - - def __init__(self, *, memory_storage_client: MemoryStorageClient) -> None: - self._memory_storage_client = memory_storage_client - - @property - def _storage_client_cache(self) -> list[DatasetClient]: - return self._memory_storage_client.datasets_handled - - @override - async def get_or_create( - self, - *, - name: str | None = None, - schema: dict | None = None, - id: str | None = None, - ) -> DatasetMetadata: - resource_client = await get_or_create_inner( - memory_storage_client=self._memory_storage_client, - storage_client_cache=self._storage_client_cache, - resource_client_class=DatasetClient, - name=name, - id=id, - ) - return resource_client.resource_info - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> DatasetListPage: - items = [storage.resource_info for storage in self._storage_client_cache] - - return DatasetListPage( - total=len(items), - count=len(items), - offset=0, - limit=len(items), - desc=False, - items=sorted(items, key=lambda item: item.created_at), - ) diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index ab9def0f06..34843a380e 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -1,425 +1,179 @@ from __future__ import annotations -import asyncio -import io -import os -import shutil +import sys from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any from typing_extensions import override -from crawlee._types import StorageTypes from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.data_processing import maybe_parse_body, raise_on_duplicate_storage, raise_on_non_existing_storage -from crawlee._utils.file import determine_file_extension, force_remove, force_rename, is_file_or_bytes, json_dumps -from crawlee.storage_clients._base import KeyValueStoreClient as BaseKeyValueStoreClient -from crawlee.storage_clients.models import ( - KeyValueStoreKeyInfo, - KeyValueStoreListKeysPage, - KeyValueStoreMetadata, - KeyValueStoreRecord, - KeyValueStoreRecordMetadata, -) - -from ._creation_management import find_or_create_client_by_id_or_name_inner, persist_metadata_if_enabled +from crawlee._utils.file import infer_mime_type +from crawlee.storage_clients._base import KeyValueStoreClient +from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata if 
TYPE_CHECKING: - from contextlib import AbstractAsyncContextManager + from collections.abc import AsyncIterator - from httpx import Response - - from crawlee.storage_clients import MemoryStorageClient + from crawlee.configuration import Configuration logger = getLogger(__name__) -class KeyValueStoreClient(BaseKeyValueStoreClient): - """Subclient for manipulating a single key-value store.""" +class MemoryKeyValueStoreClient(KeyValueStoreClient): + """Memory implementation of the key-value store client. + + This client stores data in memory as Python dictionaries. No data is persisted between + process runs, meaning all stored data is lost when the program terminates. This implementation + is primarily useful for testing, development, and short-lived crawler operations where + persistence is not required. + + The memory implementation provides fast access to data but is limited by available memory and + does not support data sharing across different processes. + """ def __init__( self, *, - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, - created_at: datetime | None = None, - accessed_at: datetime | None = None, - modified_at: datetime | None = None, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, ) -> None: - self.id = id or crypto_random_object_id() - self.name = name - - self._memory_storage_client = memory_storage_client - self._created_at = created_at or datetime.now(timezone.utc) - self._accessed_at = accessed_at or datetime.now(timezone.utc) - self._modified_at = modified_at or datetime.now(timezone.utc) - - self.records: dict[str, KeyValueStoreRecord] = {} - self.file_operation_lock = asyncio.Lock() - - @property - def resource_info(self) -> KeyValueStoreMetadata: - """Get the resource info for the key-value store client.""" - return KeyValueStoreMetadata( - id=self.id, - name=self.name, - accessed_at=self._accessed_at, - created_at=self._created_at, - modified_at=self._modified_at, - user_id='1', + """Initialize a new instance. + + Preferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance. 
+ """ + self._metadata = KeyValueStoreMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, ) - @property - def resource_directory(self) -> str: - """Get the resource directory for the client.""" - return os.path.join(self._memory_storage_client.key_value_stores_directory, self.name or self.id) + # Dictionary to hold key-value records with metadata + self._records = dict[str, KeyValueStoreRecord]() @override - async def get(self) -> KeyValueStoreMetadata | None: - found = find_or_create_client_by_id_or_name_inner( - resource_client_class=KeyValueStoreClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if found: - async with found.file_operation_lock: - await found.update_timestamps(has_been_modified=False) - return found.resource_info - - return None + @property + def metadata(self) -> KeyValueStoreMetadata: + return self._metadata @override - async def update(self, *, name: str | None = None) -> KeyValueStoreMetadata: - # Check by id - existing_store_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=KeyValueStoreClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> MemoryKeyValueStoreClient: + # Otherwise create a new key-value store + store_id = id or crypto_random_object_id() + now = datetime.now(timezone.utc) + + return cls( + id=store_id, + name=name, + created_at=now, + accessed_at=now, + modified_at=now, ) - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) - - # Skip if no changes - if name is None: - return existing_store_by_id.resource_info - - async with existing_store_by_id.file_operation_lock: - # Check that name is not in use already - existing_store_by_name = next( - ( - store - for store in self._memory_storage_client.key_value_stores_handled - if store.name and store.name.lower() == name.lower() - ), - None, - ) - - if existing_store_by_name is not None: - raise_on_duplicate_storage(StorageTypes.KEY_VALUE_STORE, 'name', name) - - previous_dir = existing_store_by_id.resource_directory - existing_store_by_id.name = name - - await force_rename(previous_dir, existing_store_by_id.resource_directory) - - # Update timestamps - await existing_store_by_id.update_timestamps(has_been_modified=True) - - return existing_store_by_id.resource_info - @override - async def delete(self) -> None: - store = next( - (store for store in self._memory_storage_client.key_value_stores_handled if store.id == self.id), None - ) - - if store is not None: - async with store.file_operation_lock: - self._memory_storage_client.key_value_stores_handled.remove(store) - store.records.clear() - - if os.path.exists(store.resource_directory): - await asyncio.to_thread(shutil.rmtree, store.resource_directory) + async def drop(self) -> None: + self._records.clear() + await self._update_metadata(update_accessed_at=True, update_modified_at=True) @override - async def list_keys( - self, - *, - limit: int = 1000, - exclusive_start_key: str | None = None, - ) -> KeyValueStoreListKeysPage: - # Check by id - existing_store_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=KeyValueStoreClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_store_by_id is None: - 
raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) - - items: list[KeyValueStoreKeyInfo] = [] - - for record in existing_store_by_id.records.values(): - size = len(record.value) - items.append(KeyValueStoreKeyInfo(key=record.key, size=size)) - - if len(items) == 0: - return KeyValueStoreListKeysPage( - count=len(items), - limit=limit, - exclusive_start_key=exclusive_start_key, - is_truncated=False, - next_exclusive_start_key=None, - items=items, - ) - - # Lexically sort to emulate the API - items = sorted(items, key=lambda item: item.key) + async def purge(self) -> None: + """Delete all stored values from the key-value store, but keep the store itself. - truncated_items = items - if exclusive_start_key is not None: - key_pos = next((idx for idx, item in enumerate(items) if item.key == exclusive_start_key), None) - if key_pos is not None: - truncated_items = items[(key_pos + 1) :] - - limited_items = truncated_items[:limit] - - last_item_in_store = items[-1] - last_selected_item = limited_items[-1] - is_last_selected_item_absolutely_last = last_item_in_store == last_selected_item - next_exclusive_start_key = None if is_last_selected_item_absolutely_last else last_selected_item.key - - async with existing_store_by_id.file_operation_lock: - await existing_store_by_id.update_timestamps(has_been_modified=False) - - return KeyValueStoreListKeysPage( - count=len(items), - limit=limit, - exclusive_start_key=exclusive_start_key, - is_truncated=not is_last_selected_item_absolutely_last, - next_exclusive_start_key=next_exclusive_start_key, - items=limited_items, - ) + This method clears all key-value pairs while preserving the store structure. + """ + self._records.clear() + await self._update_metadata(update_accessed_at=True, update_modified_at=True) @override - async def get_record(self, key: str) -> KeyValueStoreRecord | None: - return await self._get_record_internal(key) + async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: + await self._update_metadata(update_accessed_at=True) - @override - async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord[bytes] | None: - return await self._get_record_internal(key, as_bytes=True) + # Return None if key doesn't exist + return self._records.get(key, None) @override - async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]: - raise NotImplementedError('This method is not supported in memory storage.') + async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: + content_type = content_type or infer_mime_type(value) + size = sys.getsizeof(value) - @override - async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None: - # Check by id - existing_store_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=KeyValueStoreClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, + # Create and store the record + record = KeyValueStoreRecord( + key=key, + value=value, + content_type=content_type, + size=size, ) - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) - - if isinstance(value, io.IOBase): - raise NotImplementedError('File-like values are not supported in local memory storage') - - if content_type is None: - if is_file_or_bytes(value): - content_type = 'application/octet-stream' - elif isinstance(value, str): - content_type = 'text/plain; charset=utf-8' - else: - content_type = 
'application/json; charset=utf-8' - - if 'application/json' in content_type and not is_file_or_bytes(value) and not isinstance(value, str): - s = await json_dumps(value) - value = s.encode('utf-8') - - async with existing_store_by_id.file_operation_lock: - await existing_store_by_id.update_timestamps(has_been_modified=True) - record = KeyValueStoreRecord(key=key, value=value, content_type=content_type, filename=None) - - old_record = existing_store_by_id.records.get(key) - existing_store_by_id.records[key] = record - - if self._memory_storage_client.persist_storage: - record_filename = self._filename_from_record(record) - record.filename = record_filename + self._records[key] = record - if old_record is not None and self._filename_from_record(old_record) != record_filename: - await existing_store_by_id.delete_persisted_record(old_record) - - await existing_store_by_id.persist_record(record) + await self._update_metadata(update_accessed_at=True, update_modified_at=True) @override - async def delete_record(self, key: str) -> None: - # Check by id - existing_store_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=KeyValueStoreClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) - - record = existing_store_by_id.records.get(key) - - if record is not None: - async with existing_store_by_id.file_operation_lock: - del existing_store_by_id.records[key] - await existing_store_by_id.update_timestamps(has_been_modified=True) - if self._memory_storage_client.persist_storage: - await existing_store_by_id.delete_persisted_record(record) + async def delete_value(self, *, key: str) -> None: + if key in self._records: + del self._records[key] + await self._update_metadata(update_accessed_at=True, update_modified_at=True) @override - async def get_public_url(self, key: str) -> str: - existing_store_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=KeyValueStoreClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) - - record = await self._get_record_internal(key) - - if not record: - raise ValueError(f'Record with key "{key}" was not found.') - - resource_dir = existing_store_by_id.resource_directory - record_filename = self._filename_from_record(record) - record_path = os.path.join(resource_dir, record_filename) - return f'file://{record_path}' - - async def persist_record(self, record: KeyValueStoreRecord) -> None: - """Persist the specified record to the key-value store.""" - store_directory = self.resource_directory - record_filename = self._filename_from_record(record) - record.filename = record_filename - record.content_type = record.content_type or 'application/octet-stream' - - # Ensure the directory for the entity exists - await asyncio.to_thread(os.makedirs, store_directory, exist_ok=True) - - # Create files for the record - record_path = os.path.join(store_directory, record_filename) - record_metadata_path = os.path.join(store_directory, f'{record_filename}.__metadata__.json') - - # Convert to bytes if string - if isinstance(record.value, str): - record.value = record.value.encode('utf-8') - - f = await asyncio.to_thread(open, record_path, mode='wb') - try: - await asyncio.to_thread(f.write, record.value) - finally: - await asyncio.to_thread(f.close) 
- - if self._memory_storage_client.write_metadata: - metadata_f = await asyncio.to_thread(open, record_metadata_path, mode='wb') - - try: - record_metadata = KeyValueStoreRecordMetadata(key=record.key, content_type=record.content_type) - await asyncio.to_thread(metadata_f.write, record_metadata.model_dump_json(indent=2).encode('utf-8')) - finally: - await asyncio.to_thread(metadata_f.close) - - async def delete_persisted_record(self, record: KeyValueStoreRecord) -> None: - """Delete the specified record from the key-value store.""" - store_directory = self.resource_directory - record_filename = self._filename_from_record(record) - - # Ensure the directory for the entity exists - await asyncio.to_thread(os.makedirs, store_directory, exist_ok=True) - - # Create files for the record - record_path = os.path.join(store_directory, record_filename) - record_metadata_path = os.path.join(store_directory, record_filename + '.__metadata__.json') - - await force_remove(record_path) - await force_remove(record_metadata_path) - - async def update_timestamps(self, *, has_been_modified: bool) -> None: - """Update the timestamps of the key-value store.""" - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - await persist_metadata_if_enabled( - data=self.resource_info.model_dump(), - entity_directory=self.resource_directory, - write_metadata=self._memory_storage_client.write_metadata, - ) - - async def _get_record_internal( + async def iterate_keys( self, - key: str, *, - as_bytes: bool = False, - ) -> KeyValueStoreRecord | None: - # Check by id - existing_store_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=KeyValueStoreClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) - - stored_record = existing_store_by_id.records.get(key) - - if stored_record is None: - return None - - record = KeyValueStoreRecord( - key=stored_record.key, - value=stored_record.value, - content_type=stored_record.content_type, - filename=stored_record.filename, - ) - - if not as_bytes: - try: - record.value = maybe_parse_body(record.value, str(record.content_type)) - except ValueError: - logger.exception('Error parsing key-value store record') - - async with existing_store_by_id.file_operation_lock: - await existing_store_by_id.update_timestamps(has_been_modified=False) - - return record - - def _filename_from_record(self, record: KeyValueStoreRecord) -> str: - if record.filename is not None: - return record.filename + exclusive_start_key: str | None = None, + limit: int | None = None, + ) -> AsyncIterator[KeyValueStoreRecordMetadata]: + await self._update_metadata(update_accessed_at=True) - if not record.content_type or record.content_type == 'application/octet-stream': - return record.key + # Get all keys, sorted alphabetically + keys = sorted(self._records.keys()) - extension = determine_file_extension(record.content_type) + # Apply exclusive_start_key filter if provided + if exclusive_start_key is not None: + keys = [k for k in keys if k > exclusive_start_key] + + # Apply limit if provided + if limit is not None: + keys = keys[:limit] + + # Yield metadata for each key + for key in keys: + record = self._records[key] + yield KeyValueStoreRecordMetadata( + key=key, + content_type=record.content_type, + size=record.size, + ) - if record.key.endswith(f'.{extension}'): - return 
record.key + @override + async def get_public_url(self, *, key: str) -> str: + raise NotImplementedError('Public URLs are not supported for memory key-value stores.') - return f'{record.key}.{extension}' + async def _update_metadata( + self, + *, + update_accessed_at: bool = False, + update_modified_at: bool = False, + ) -> None: + """Update the key-value store metadata with current information. + + Args: + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. + """ + now = datetime.now(timezone.utc) + + if update_accessed_at: + self._metadata.accessed_at = now + if update_modified_at: + self._metadata.modified_at = now diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_collection_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_collection_client.py deleted file mode 100644 index 939780449f..0000000000 --- a/src/crawlee/storage_clients/_memory/_key_value_store_collection_client.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import KeyValueStoreCollectionClient as BaseKeyValueStoreCollectionClient -from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata - -from ._creation_management import get_or_create_inner -from ._key_value_store_client import KeyValueStoreClient - -if TYPE_CHECKING: - from ._memory_storage_client import MemoryStorageClient - - -class KeyValueStoreCollectionClient(BaseKeyValueStoreCollectionClient): - """Subclient for manipulating key-value stores.""" - - def __init__(self, *, memory_storage_client: MemoryStorageClient) -> None: - self._memory_storage_client = memory_storage_client - - @property - def _storage_client_cache(self) -> list[KeyValueStoreClient]: - return self._memory_storage_client.key_value_stores_handled - - @override - async def get_or_create( - self, - *, - name: str | None = None, - schema: dict | None = None, - id: str | None = None, - ) -> KeyValueStoreMetadata: - resource_client = await get_or_create_inner( - memory_storage_client=self._memory_storage_client, - storage_client_cache=self._storage_client_cache, - resource_client_class=KeyValueStoreClient, - name=name, - id=id, - ) - return resource_client.resource_info - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> KeyValueStoreListPage: - items = [storage.resource_info for storage in self._storage_client_cache] - - return KeyValueStoreListPage( - total=len(items), - count=len(items), - offset=0, - limit=len(items), - desc=False, - items=sorted(items, key=lambda item: item.created_at), - ) diff --git a/src/crawlee/storage_clients/_memory/_memory_storage_client.py b/src/crawlee/storage_clients/_memory/_memory_storage_client.py deleted file mode 100644 index 8000f41274..0000000000 --- a/src/crawlee/storage_clients/_memory/_memory_storage_client.py +++ /dev/null @@ -1,358 +0,0 @@ -from __future__ import annotations - -import asyncio -import contextlib -import os -import shutil -from logging import getLogger -from pathlib import Path -from typing import TYPE_CHECKING, TypeVar - -from typing_extensions import override - -from crawlee._utils.docs import docs_group -from crawlee.configuration import Configuration -from crawlee.storage_clients import StorageClient - -from ._dataset_client 
import DatasetClient -from ._dataset_collection_client import DatasetCollectionClient -from ._key_value_store_client import KeyValueStoreClient -from ._key_value_store_collection_client import KeyValueStoreCollectionClient -from ._request_queue_client import RequestQueueClient -from ._request_queue_collection_client import RequestQueueCollectionClient - -if TYPE_CHECKING: - from crawlee.storage_clients._base import ResourceClient - - -TResourceClient = TypeVar('TResourceClient', DatasetClient, KeyValueStoreClient, RequestQueueClient) - -logger = getLogger(__name__) - - -@docs_group('Classes') -class MemoryStorageClient(StorageClient): - """Represents an in-memory storage client for managing datasets, key-value stores, and request queues. - - It emulates in-memory storage similar to the Apify platform, supporting both in-memory and local file system-based - persistence. - - The behavior of the storage, such as data persistence and metadata writing, can be customized via initialization - parameters or environment variables. - """ - - _MIGRATING_KEY_VALUE_STORE_DIR_NAME = '__CRAWLEE_MIGRATING_KEY_VALUE_STORE' - """Name of the directory used to temporarily store files during the migration of the default key-value store.""" - - _TEMPORARY_DIR_NAME = '__CRAWLEE_TEMPORARY' - """Name of the directory used to temporarily store files during purges.""" - - _DATASETS_DIR_NAME = 'datasets' - """Name of the directory containing datasets.""" - - _KEY_VALUE_STORES_DIR_NAME = 'key_value_stores' - """Name of the directory containing key-value stores.""" - - _REQUEST_QUEUES_DIR_NAME = 'request_queues' - """Name of the directory containing request queues.""" - - def __init__( - self, - *, - write_metadata: bool, - persist_storage: bool, - storage_dir: str, - default_request_queue_id: str, - default_key_value_store_id: str, - default_dataset_id: str, - ) -> None: - """Initialize a new instance. - - In most cases, you should use the `from_config` constructor to create a new instance based on - the provided configuration. - - Args: - write_metadata: Whether to write metadata to the storage. - persist_storage: Whether to persist the storage. - storage_dir: Path to the storage directory. - default_request_queue_id: The default request queue ID. - default_key_value_store_id: The default key-value store ID. - default_dataset_id: The default dataset ID. - """ - # Set the internal attributes. - self._write_metadata = write_metadata - self._persist_storage = persist_storage - self._storage_dir = storage_dir - self._default_request_queue_id = default_request_queue_id - self._default_key_value_store_id = default_key_value_store_id - self._default_dataset_id = default_dataset_id - - self.datasets_handled: list[DatasetClient] = [] - self.key_value_stores_handled: list[KeyValueStoreClient] = [] - self.request_queues_handled: list[RequestQueueClient] = [] - - self._purged_on_start = False # Indicates whether a purge was already performed on this instance. - self._purge_lock = asyncio.Lock() - - @classmethod - def from_config(cls, config: Configuration | None = None) -> MemoryStorageClient: - """Initialize a new instance based on the provided `Configuration`. - - Args: - config: The `Configuration` instance. Uses the global (default) one if not provided. 
- """ - config = config or Configuration.get_global_configuration() - - return cls( - write_metadata=config.write_metadata, - persist_storage=config.persist_storage, - storage_dir=config.storage_dir, - default_request_queue_id=config.default_request_queue_id, - default_key_value_store_id=config.default_key_value_store_id, - default_dataset_id=config.default_dataset_id, - ) - - @property - def write_metadata(self) -> bool: - """Whether to write metadata to the storage.""" - return self._write_metadata - - @property - def persist_storage(self) -> bool: - """Whether to persist the storage.""" - return self._persist_storage - - @property - def storage_dir(self) -> str: - """Path to the storage directory.""" - return self._storage_dir - - @property - def datasets_directory(self) -> str: - """Path to the directory containing datasets.""" - return os.path.join(self.storage_dir, self._DATASETS_DIR_NAME) - - @property - def key_value_stores_directory(self) -> str: - """Path to the directory containing key-value stores.""" - return os.path.join(self.storage_dir, self._KEY_VALUE_STORES_DIR_NAME) - - @property - def request_queues_directory(self) -> str: - """Path to the directory containing request queues.""" - return os.path.join(self.storage_dir, self._REQUEST_QUEUES_DIR_NAME) - - @override - def dataset(self, id: str) -> DatasetClient: - return DatasetClient(memory_storage_client=self, id=id) - - @override - def datasets(self) -> DatasetCollectionClient: - return DatasetCollectionClient(memory_storage_client=self) - - @override - def key_value_store(self, id: str) -> KeyValueStoreClient: - return KeyValueStoreClient(memory_storage_client=self, id=id) - - @override - def key_value_stores(self) -> KeyValueStoreCollectionClient: - return KeyValueStoreCollectionClient(memory_storage_client=self) - - @override - def request_queue(self, id: str) -> RequestQueueClient: - return RequestQueueClient(memory_storage_client=self, id=id) - - @override - def request_queues(self) -> RequestQueueCollectionClient: - return RequestQueueCollectionClient(memory_storage_client=self) - - @override - async def purge_on_start(self) -> None: - # Optimistic, non-blocking check - if self._purged_on_start is True: - logger.debug('Storage was already purged on start.') - return - - async with self._purge_lock: - # Another check under the lock just to be sure - if self._purged_on_start is True: - # Mypy doesn't understand that the _purged_on_start can change while we're getting the async lock - return # type: ignore[unreachable] - - await self._purge_default_storages() - self._purged_on_start = True - - def get_cached_resource_client( - self, - resource_client_class: type[TResourceClient], - id: str | None, - name: str | None, - ) -> TResourceClient | None: - """Try to return a resource client from the internal cache.""" - if issubclass(resource_client_class, DatasetClient): - cache = self.datasets_handled - elif issubclass(resource_client_class, KeyValueStoreClient): - cache = self.key_value_stores_handled - elif issubclass(resource_client_class, RequestQueueClient): - cache = self.request_queues_handled - else: - return None - - for storage_client in cache: - if storage_client.id == id or ( - storage_client.name and name and storage_client.name.lower() == name.lower() - ): - return storage_client - - return None - - def add_resource_client_to_cache(self, resource_client: ResourceClient) -> None: - """Add a new resource client to the internal cache.""" - if isinstance(resource_client, DatasetClient): - 
self.datasets_handled.append(resource_client) - if isinstance(resource_client, KeyValueStoreClient): - self.key_value_stores_handled.append(resource_client) - if isinstance(resource_client, RequestQueueClient): - self.request_queues_handled.append(resource_client) - - async def _purge_default_storages(self) -> None: - """Clean up the storage directories, preparing the environment for a new run. - - It aims to remove residues from previous executions to avoid data contamination between runs. - - It specifically targets: - - The local directory containing the default dataset. - - All records from the default key-value store in the local directory, except for the 'INPUT' key. - - The local directory containing the default request queue. - """ - # Key-value stores - if await asyncio.to_thread(os.path.exists, self.key_value_stores_directory): - key_value_store_folders = await asyncio.to_thread(os.scandir, self.key_value_stores_directory) - for key_value_store_folder in key_value_store_folders: - if key_value_store_folder.name.startswith( - self._TEMPORARY_DIR_NAME - ) or key_value_store_folder.name.startswith('__OLD'): - await self._batch_remove_files(key_value_store_folder.path) - elif key_value_store_folder.name == self._default_key_value_store_id: - await self._handle_default_key_value_store(key_value_store_folder.path) - - # Datasets - if await asyncio.to_thread(os.path.exists, self.datasets_directory): - dataset_folders = await asyncio.to_thread(os.scandir, self.datasets_directory) - for dataset_folder in dataset_folders: - if dataset_folder.name == self._default_dataset_id or dataset_folder.name.startswith( - self._TEMPORARY_DIR_NAME - ): - await self._batch_remove_files(dataset_folder.path) - - # Request queues - if await asyncio.to_thread(os.path.exists, self.request_queues_directory): - request_queue_folders = await asyncio.to_thread(os.scandir, self.request_queues_directory) - for request_queue_folder in request_queue_folders: - if request_queue_folder.name == self._default_request_queue_id or request_queue_folder.name.startswith( - self._TEMPORARY_DIR_NAME - ): - await self._batch_remove_files(request_queue_folder.path) - - async def _handle_default_key_value_store(self, folder: str) -> None: - """Manage the cleanup of the default key-value store. - - It removes all files to ensure a clean state except for a set of predefined input keys (`possible_input_keys`). - - Args: - folder: Path to the default key-value store directory to clean. 
- """ - folder_exists = await asyncio.to_thread(os.path.exists, folder) - temporary_path = os.path.normpath(os.path.join(folder, '..', self._MIGRATING_KEY_VALUE_STORE_DIR_NAME)) - - # For optimization, we want to only attempt to copy a few files from the default key-value store - possible_input_keys = [ - 'INPUT', - 'INPUT.json', - 'INPUT.bin', - 'INPUT.txt', - ] - - if folder_exists: - # Create a temporary folder to save important files in - Path(temporary_path).mkdir(parents=True, exist_ok=True) - - # Go through each file and save the ones that are important - for entity in possible_input_keys: - original_file_path = os.path.join(folder, entity) - temp_file_path = os.path.join(temporary_path, entity) - with contextlib.suppress(Exception): - await asyncio.to_thread(os.rename, original_file_path, temp_file_path) - - # Remove the original folder and all its content - counter = 0 - temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) - done = False - try: - while not done: - await asyncio.to_thread(os.rename, folder, temp_path_for_old_folder) - done = True - except Exception: - counter += 1 - temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) - - # Replace the temporary folder with the original folder - await asyncio.to_thread(os.rename, temporary_path, folder) - - # Remove the old folder - await self._batch_remove_files(temp_path_for_old_folder) - - async def _batch_remove_files(self, folder: str, counter: int = 0) -> None: - """Remove a folder and its contents in batches to minimize blocking time. - - This method first renames the target folder to a temporary name, then deletes the temporary folder, - allowing the file system operations to proceed without hindering other asynchronous tasks. - - Args: - folder: The directory path to remove. - counter: A counter used for generating temporary directory names in case of conflicts. 
- """ - folder_exists = await asyncio.to_thread(os.path.exists, folder) - - if folder_exists: - temporary_folder = ( - folder - if os.path.basename(folder).startswith(f'{self._TEMPORARY_DIR_NAME}_') - else os.path.normpath(os.path.join(folder, '..', f'{self._TEMPORARY_DIR_NAME}_{counter}')) - ) - - try: - # Rename the old folder to the new one to allow background deletions - await asyncio.to_thread(os.rename, folder, temporary_folder) - except Exception: - # Folder exists already, try again with an incremented counter - return await self._batch_remove_files(folder, counter + 1) - - await asyncio.to_thread(shutil.rmtree, temporary_folder, ignore_errors=True) - return None - - def _get_default_storage_id(self, storage_client_class: type[TResourceClient]) -> str: - """Get the default storage ID based on the storage class.""" - if issubclass(storage_client_class, DatasetClient): - return self._default_dataset_id - - if issubclass(storage_client_class, KeyValueStoreClient): - return self._default_key_value_store_id - - if issubclass(storage_client_class, RequestQueueClient): - return self._default_request_queue_id - - raise ValueError(f'Invalid storage class: {storage_client_class.__name__}') - - def _get_storage_dir(self, storage_client_class: type[TResourceClient]) -> str: - """Get the storage directory based on the storage class.""" - if issubclass(storage_client_class, DatasetClient): - return self.datasets_directory - - if issubclass(storage_client_class, KeyValueStoreClient): - return self.key_value_stores_directory - - if issubclass(storage_client_class, RequestQueueClient): - return self.request_queues_directory - - raise ValueError(f'Invalid storage class: {storage_client_class.__name__}') diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 477d53df07..95035e6155 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -1,558 +1,380 @@ from __future__ import annotations -import asyncio -import os -import shutil from datetime import datetime, timezone -from decimal import Decimal from logging import getLogger from typing import TYPE_CHECKING -from sortedcollections import ValueSortedDict from typing_extensions import override -from crawlee._types import StorageTypes +from crawlee import Request from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.data_processing import raise_on_duplicate_storage, raise_on_non_existing_storage -from crawlee._utils.file import force_remove, force_rename, json_dumps -from crawlee._utils.requests import unique_key_to_request_id -from crawlee.storage_clients._base import RequestQueueClient as BaseRequestQueueClient +from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import ( - BatchRequestsOperationResponse, - InternalRequest, + AddRequestsResponse, ProcessedRequest, - ProlongRequestLockResponse, - RequestQueueHead, - RequestQueueHeadWithLocks, RequestQueueMetadata, - UnprocessedRequest, ) -from ._creation_management import find_or_create_client_by_id_or_name_inner, persist_metadata_if_enabled - if TYPE_CHECKING: from collections.abc import Sequence - from sortedcontainers import SortedDict + from crawlee.configuration import Configuration - from crawlee import Request +logger = getLogger(__name__) - from ._memory_storage_client import MemoryStorageClient -logger = getLogger(__name__) +class 
MemoryRequestQueueClient(RequestQueueClient): + """Memory implementation of the request queue client. + This client stores requests in memory using a Python list and dictionary. No data is persisted between + process runs, which means all requests are lost when the program terminates. This implementation + is primarily useful for testing, development, and short-lived crawler runs where persistence + is not required. -class RequestQueueClient(BaseRequestQueueClient): - """Subclient for manipulating a single request queue.""" + This client provides fast access to request data but is limited by available memory and + does not support data sharing across different processes. + """ def __init__( self, *, - memory_storage_client: MemoryStorageClient, - id: str | None = None, - name: str | None = None, - created_at: datetime | None = None, - accessed_at: datetime | None = None, - modified_at: datetime | None = None, - handled_request_count: int = 0, - pending_request_count: int = 0, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + had_multiple_clients: bool, + handled_request_count: int, + pending_request_count: int, + stats: dict, + total_request_count: int, ) -> None: - self._memory_storage_client = memory_storage_client - self.id = id or crypto_random_object_id() - self.name = name - self._created_at = created_at or datetime.now(timezone.utc) - self._accessed_at = accessed_at or datetime.now(timezone.utc) - self._modified_at = modified_at or datetime.now(timezone.utc) - self.handled_request_count = handled_request_count - self.pending_request_count = pending_request_count - - self.requests: SortedDict[str, InternalRequest] = ValueSortedDict( - lambda request: request.order_no or -float('inf') - ) - self.file_operation_lock = asyncio.Lock() - self._last_used_timestamp = Decimal(0) - - self._in_progress = set[str]() + """Initialize a new instance. - @property - def resource_info(self) -> RequestQueueMetadata: - """Get the resource info for the request queue client.""" - return RequestQueueMetadata( - id=self.id, - name=self.name, - accessed_at=self._accessed_at, - created_at=self._created_at, - modified_at=self._modified_at, - had_multiple_clients=False, - handled_request_count=self.handled_request_count, - pending_request_count=self.pending_request_count, - stats={}, - total_request_count=len(self.requests), - user_id='1', - resource_directory=self.resource_directory, - ) - - @property - def resource_directory(self) -> str: - """Get the resource directory for the client.""" - return os.path.join(self._memory_storage_client.request_queues_directory, self.name or self.id) - - @override - async def get(self) -> RequestQueueMetadata | None: - found = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, + Preferably use the `MemoryRequestQueueClient.open` class method to create a new instance. 
+ """ + self._metadata = RequestQueueMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + had_multiple_clients=had_multiple_clients, + handled_request_count=handled_request_count, + pending_request_count=pending_request_count, + stats=stats, + total_request_count=total_request_count, ) - if found: - async with found.file_operation_lock: - await found.update_timestamps(has_been_modified=False) - return found.resource_info + # List to hold RQ items + self._records = list[Request]() - return None + # Dictionary to track in-progress requests (fetched but not yet handled or reclaimed) + self._in_progress = dict[str, Request]() @override - async def update(self, *, name: str | None = None) -> RequestQueueMetadata: - # Check by id - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - - # Skip if no changes - if name is None: - return existing_queue_by_id.resource_info - - async with existing_queue_by_id.file_operation_lock: - # Check that name is not in use already - existing_queue_by_name = next( - ( - queue - for queue in self._memory_storage_client.request_queues_handled - if queue.name and queue.name.lower() == name.lower() - ), - None, - ) - - if existing_queue_by_name is not None: - raise_on_duplicate_storage(StorageTypes.REQUEST_QUEUE, 'name', name) - - previous_dir = existing_queue_by_id.resource_directory - existing_queue_by_id.name = name - - await force_rename(previous_dir, existing_queue_by_id.resource_directory) - - # Update timestamps - await existing_queue_by_id.update_timestamps(has_been_modified=True) - - return existing_queue_by_id.resource_info + @property + def metadata(self) -> RequestQueueMetadata: + return self._metadata @override - async def delete(self) -> None: - queue = next( - (queue for queue in self._memory_storage_client.request_queues_handled if queue.id == self.id), - None, + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> MemoryRequestQueueClient: + # Otherwise create a new queue + queue_id = id or crypto_random_object_id() + now = datetime.now(timezone.utc) + + return cls( + id=queue_id, + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + had_multiple_clients=False, + handled_request_count=0, + pending_request_count=0, + stats={}, + total_request_count=0, ) - if queue is not None: - async with queue.file_operation_lock: - self._memory_storage_client.request_queues_handled.remove(queue) - queue.pending_request_count = 0 - queue.handled_request_count = 0 - queue.requests.clear() - - if os.path.exists(queue.resource_directory): - await asyncio.to_thread(shutil.rmtree, queue.resource_directory) - @override - async def list_head(self, *, limit: int | None = None, skip_in_progress: bool = False) -> RequestQueueHead: - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, + async def drop(self) -> None: + # Clear all data + self._records.clear() + self._in_progress.clear() + + await self._update_metadata( + update_modified_at=True, + update_accessed_at=True, + new_handled_request_count=0, + new_pending_request_count=0, + 
new_total_request_count=0, ) - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - - async with existing_queue_by_id.file_operation_lock: - await existing_queue_by_id.update_timestamps(has_been_modified=False) - - requests: list[Request] = [] - - # Iterate all requests in the queue which have sorted key larger than infinity, which means - # `order_no` is not `None`. This will iterate them in order of `order_no`. - for request_key in existing_queue_by_id.requests.irange_key( # type: ignore[attr-defined] # irange_key is a valid SortedDict method but not recognized by mypy - min_key=-float('inf'), inclusive=(False, True) - ): - if len(requests) == limit: - break - - if skip_in_progress and request_key in existing_queue_by_id._in_progress: # noqa: SLF001 - continue - internal_request = existing_queue_by_id.requests.get(request_key) - - # Check that the request still exists and was not handled, - # in case something deleted it or marked it as handled concurrenctly - if internal_request and not internal_request.handled_at: - requests.append(internal_request.to_request()) - - return RequestQueueHead( - limit=limit, - had_multiple_clients=False, - queue_modified_at=existing_queue_by_id._modified_at, # noqa: SLF001 - items=requests, - ) - @override - async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks: - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) + async def purge(self) -> None: + """Delete all requests from the queue, but keep the queue itself. - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - - result = await self.list_head(limit=limit, skip_in_progress=True) - - for item in result.items: - existing_queue_by_id._in_progress.add(item.id) # noqa: SLF001 - - return RequestQueueHeadWithLocks( - queue_has_locked_requests=len(existing_queue_by_id._in_progress) > 0, # noqa: SLF001 - lock_secs=lock_secs, - limit=result.limit, - had_multiple_clients=result.had_multiple_clients, - queue_modified_at=result.queue_modified_at, - items=result.items, + This method clears all requests including both pending and handled ones, + but preserves the queue structure. + """ + self._records.clear() + self._in_progress.clear() + + await self._update_metadata( + update_modified_at=True, + update_accessed_at=True, + new_handled_request_count=0, + new_pending_request_count=0, + new_total_request_count=0, ) @override - async def add_request( + async def add_batch_of_requests( self, - request: Request, + requests: Sequence[Request], *, forefront: bool = False, - ) -> ProcessedRequest: - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) + ) -> AddRequestsResponse: + """Add a batch of requests to the queue. - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + Args: + requests: The requests to add. + forefront: Whether to add the requests to the beginning of the queue. - internal_request = await self._create_internal_request(request, forefront) + Returns: + Response containing information about the added requests. 
+ """ + processed_requests = [] + for request in requests: + # Ensure the request has an ID + if not request.id: + request.id = crypto_random_object_id() - async with existing_queue_by_id.file_operation_lock: - existing_internal_request_with_id = existing_queue_by_id.requests.get(internal_request.id) + # Check if the request is already in the queue by unique_key + existing_request = next((r for r in self._records if r.unique_key == request.unique_key), None) - # We already have the request present, so we return information about it - if existing_internal_request_with_id is not None: - await existing_queue_by_id.update_timestamps(has_been_modified=False) + was_already_present = existing_request is not None + was_already_handled = was_already_present and existing_request and existing_request.handled_at is not None - return ProcessedRequest( - id=internal_request.id, - unique_key=internal_request.unique_key, - was_already_present=True, - was_already_handled=existing_internal_request_with_id.handled_at is not None, + # If the request is already in the queue and handled, don't add it again + if was_already_handled: + processed_requests.append( + ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=True, + ) ) - - existing_queue_by_id.requests[internal_request.id] = internal_request - if internal_request.handled_at: - existing_queue_by_id.handled_request_count += 1 + continue + + # If the request is already in the queue but not handled, update it + if was_already_present: + # Update the existing request with any new data + for idx, rec in enumerate(self._records): + if rec.unique_key == request.unique_key: + self._records[idx] = request + break else: - existing_queue_by_id.pending_request_count += 1 - await existing_queue_by_id.update_timestamps(has_been_modified=True) - await self._persist_single_request_to_storage( - request=internal_request, - entity_directory=existing_queue_by_id.resource_directory, - persist_storage=self._memory_storage_client.persist_storage, + # Add the new request to the queue + if forefront: + self._records.insert(0, request) + else: + self._records.append(request) + + # Update metadata counts + self._metadata.total_request_count += 1 + self._metadata.pending_request_count += 1 + + processed_requests.append( + ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=was_already_present, + was_already_handled=False, + ) ) - # We return was_already_handled=False even though the request may have been added as handled, - # because that's how API behaves. - return ProcessedRequest( - id=internal_request.id, - unique_key=internal_request.unique_key, - was_already_present=False, - was_already_handled=False, - ) + await self._update_metadata(update_accessed_at=True, update_modified_at=True) - @override - async def get_request(self, request_id: str) -> Request | None: - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, + return AddRequestsResponse( + processed_requests=processed_requests, + unprocessed_requests=[], ) - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + @override + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. 
- async with existing_queue_by_id.file_operation_lock: - await existing_queue_by_id.update_timestamps(has_been_modified=False) + Returns: + The request or `None` if there are no more pending requests. + """ + # Find the first request that's not handled or in progress + for request in self._records: + if request.handled_at is None and request.id not in self._in_progress: + # Mark as in progress + self._in_progress[request.id] = request + return request - internal_request = existing_queue_by_id.requests.get(request_id) - return internal_request.to_request() if internal_request else None + return None @override - async def update_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + async def get_request(self, request_id: str) -> Request | None: + """Retrieve a request from the queue. - internal_request = await self._create_internal_request(request, forefront) + Args: + request_id: ID of the request to retrieve. - # First we need to check the existing request to be able to return information about its handled state. - existing_internal_request = existing_queue_by_id.requests.get(internal_request.id) + Returns: + The retrieved request, or None, if it did not exist. + """ + # Check in-progress requests first + if request_id in self._in_progress: + return self._in_progress[request_id] - # Undefined means that the request is not present in the queue. - # We need to insert it, to behave the same as API. - if existing_internal_request is None: - return await self.add_request(request, forefront=forefront) + # Otherwise search in the records + for request in self._records: + if request.id == request_id: + return request - async with existing_queue_by_id.file_operation_lock: - # When updating the request, we need to make sure that - # the handled counts are updated correctly in all cases. - existing_queue_by_id.requests[internal_request.id] = internal_request + return None - pending_count_adjustment = 0 - is_request_handled_state_changing = existing_internal_request.handled_at != internal_request.handled_at + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as handled after successful processing. - request_was_handled_before_update = existing_internal_request.handled_at is not None + Handled requests will never again be returned by the `fetch_next_request` method. - # We add 1 pending request if previous state was handled - if is_request_handled_state_changing: - pending_count_adjustment = 1 if request_was_handled_before_update else -1 + Args: + request: The request to mark as handled. - existing_queue_by_id.pending_request_count += pending_count_adjustment - existing_queue_by_id.handled_request_count -= pending_count_adjustment - await existing_queue_by_id.update_timestamps(has_been_modified=True) - await self._persist_single_request_to_storage( - request=internal_request, - entity_directory=existing_queue_by_id.resource_directory, - persist_storage=self._memory_storage_client.persist_storage, - ) + Returns: + Information about the queue operation. `None` if the given request was not in progress. 
+ """ + # Check if the request is in progress + if request.id not in self._in_progress: + return None - if request.handled_at is not None: - existing_queue_by_id._in_progress.discard(request.id) # noqa: SLF001 + # Set handled_at timestamp if not already set + if request.handled_at is None: + request.handled_at = datetime.now(timezone.utc) - return ProcessedRequest( - id=internal_request.id, - unique_key=internal_request.unique_key, - was_already_present=True, - was_already_handled=request_was_handled_before_update, - ) + # Update the request in records + for idx, rec in enumerate(self._records): + if rec.id == request.id: + self._records[idx] = request + break - @override - async def delete_request(self, request_id: str) -> None: - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) + # Remove from in-progress + del self._in_progress[request.id] - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + # Update metadata counts + self._metadata.handled_request_count += 1 + self._metadata.pending_request_count -= 1 - async with existing_queue_by_id.file_operation_lock: - internal_request = existing_queue_by_id.requests.get(request_id) + # Update metadata timestamps + await self._update_metadata(update_modified_at=True) - if internal_request: - del existing_queue_by_id.requests[request_id] - if internal_request.handled_at: - existing_queue_by_id.handled_request_count -= 1 - else: - existing_queue_by_id.pending_request_count -= 1 - await existing_queue_by_id.update_timestamps(has_been_modified=True) - await self._delete_request_file_from_storage( - entity_directory=existing_queue_by_id.resource_directory, - request_id=request_id, - ) + return ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=True, + ) @override - async def prolong_request_lock( + async def reclaim_request( self, - request_id: str, + request: Request, *, forefront: bool = False, - lock_secs: int, - ) -> ProlongRequestLockResponse: - return ProlongRequestLockResponse(lock_expires_at=datetime.now(timezone.utc)) + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. - @override - async def delete_request_lock( - self, - request_id: str, - *, - forefront: bool = False, - ) -> None: - existing_queue_by_id = find_or_create_client_by_id_or_name_inner( - resource_client_class=RequestQueueClient, - memory_storage_client=self._memory_storage_client, - id=self.id, - name=self.name, - ) + The request will be returned for processing later again by another call to `fetch_next_request`. - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + Args: + request: The request to return to the queue. + forefront: Whether to add the request to the head or the end of the queue. - existing_queue_by_id._in_progress.discard(request_id) # noqa: SLF001 + Returns: + Information about the queue operation. `None` if the given request was not in progress. 
+ """ + # Check if the request is in progress + if request.id not in self._in_progress: + return None - @override - async def batch_add_requests( - self, - requests: Sequence[Request], - *, - forefront: bool = False, - ) -> BatchRequestsOperationResponse: - processed_requests = list[ProcessedRequest]() - unprocessed_requests = list[UnprocessedRequest]() + # Remove from in-progress + del self._in_progress[request.id] - for request in requests: - try: - processed_request = await self.add_request(request, forefront=forefront) - processed_requests.append( - ProcessedRequest( - id=processed_request.id, - unique_key=processed_request.unique_key, - was_already_present=processed_request.was_already_present, - was_already_handled=processed_request.was_already_handled, - ) - ) - except Exception as exc: # noqa: PERF203 - logger.warning(f'Error adding request to the queue: {exc}') - unprocessed_requests.append( - UnprocessedRequest( - unique_key=request.unique_key, - url=request.url, - method=request.method, - ) - ) + # If forefront is true, move the request to the beginning of the queue + if forefront: + # First remove the request from its current position + for idx, rec in enumerate(self._records): + if rec.id == request.id: + self._records.pop(idx) + break - return BatchRequestsOperationResponse( - processed_requests=processed_requests, - unprocessed_requests=unprocessed_requests, + # Then insert it at the beginning + self._records.insert(0, request) + + # Update metadata timestamps + await self._update_metadata(update_modified_at=True) + + return ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, ) @override - async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsOperationResponse: - raise NotImplementedError('This method is not supported in memory storage.') + async def is_empty(self) -> bool: + """Check if the queue is empty. - async def update_timestamps(self, *, has_been_modified: bool) -> None: - """Update the timestamps of the request queue.""" - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) + Returns: + True if the queue is empty, False otherwise. + """ + await self._update_metadata(update_accessed_at=True) - await persist_metadata_if_enabled( - data=self.resource_info.model_dump(), - entity_directory=self.resource_directory, - write_metadata=self._memory_storage_client.write_metadata, - ) + # Queue is empty if there are no pending requests + pending_requests = [r for r in self._records if r.handled_at is None] + return len(pending_requests) == 0 - async def _persist_single_request_to_storage( + async def _update_metadata( self, *, - request: InternalRequest, - entity_directory: str, - persist_storage: bool, + update_accessed_at: bool = False, + update_modified_at: bool = False, + new_handled_request_count: int | None = None, + new_pending_request_count: int | None = None, + new_total_request_count: int | None = None, ) -> None: - """Update or writes a single request item to the disk. - - This function writes a given request dictionary to a JSON file, named after the request's ID, - within a specified directory. The writing process is skipped if `persist_storage` is False. - Before writing, it ensures that the target directory exists, creating it if necessary. + """Update the request queue metadata with current information. Args: - request: The dictionary containing the request data. 
- entity_directory: The directory path where the request file should be stored. - persist_storage: A boolean flag indicating whether the request should be persisted to the disk. + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. + new_handled_request_count: If provided, set the handled request count to this value. + new_pending_request_count: If provided, set the pending request count to this value. + new_total_request_count: If provided, set the total request count to this value. """ - # Skip writing files to the disk if the client has the option set to false - if not persist_storage: - return - - # Ensure the directory for the entity exists - await asyncio.to_thread(os.makedirs, entity_directory, exist_ok=True) - - # Write the request to the file - file_path = os.path.join(entity_directory, f'{request.id}.json') - f = await asyncio.to_thread(open, file_path, mode='w', encoding='utf-8') - try: - s = await json_dumps(request.model_dump()) - await asyncio.to_thread(f.write, s) - finally: - f.close() - - async def _delete_request_file_from_storage(self, *, request_id: str, entity_directory: str) -> None: - """Delete a specific request item from the disk. - - This function removes a file representing a request, identified by the request's ID, from a - specified directory. Before attempting to remove the file, it ensures that the target directory - exists, creating it if necessary. - - Args: - request_id: The identifier of the request to be deleted. - entity_directory: The directory path where the request file is stored. - """ - # Ensure the directory for the entity exists - await asyncio.to_thread(os.makedirs, entity_directory, exist_ok=True) - - file_path = os.path.join(entity_directory, f'{request_id}.json') - await force_remove(file_path) - - async def _create_internal_request(self, request: Request, forefront: bool | None) -> InternalRequest: - order_no = self._calculate_order_no(request, forefront) - id = unique_key_to_request_id(request.unique_key) - - if request.id is not None and request.id != id: - logger.warning( - f'The request ID does not match the ID from the unique_key (request.id={request.id}, id={id}).' 
- ) - - return InternalRequest.from_request(request=request, id=id, order_no=order_no) - - def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decimal | None: - if request.handled_at is not None: - return None - - # Get the current timestamp in milliseconds - timestamp = Decimal(str(datetime.now(tz=timezone.utc).timestamp())) * Decimal(1000) - timestamp = round(timestamp, 6) - - # Make sure that this timestamp was not used yet, so that we have unique order_nos - if timestamp <= self._last_used_timestamp: - timestamp = self._last_used_timestamp + Decimal('0.000001') - - self._last_used_timestamp = timestamp - - return -timestamp if forefront else timestamp + now = datetime.now(timezone.utc) + + if update_accessed_at: + self._metadata.accessed_at = now + if update_modified_at: + self._metadata.modified_at = now + if new_handled_request_count is not None: + self._metadata.handled_request_count = new_handled_request_count + if new_pending_request_count is not None: + self._metadata.pending_request_count = new_pending_request_count + if new_total_request_count is not None: + self._metadata.total_request_count = new_total_request_count diff --git a/src/crawlee/storage_clients/_memory/_request_queue_collection_client.py b/src/crawlee/storage_clients/_memory/_request_queue_collection_client.py deleted file mode 100644 index 2f2df2be89..0000000000 --- a/src/crawlee/storage_clients/_memory/_request_queue_collection_client.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import RequestQueueCollectionClient as BaseRequestQueueCollectionClient -from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata - -from ._creation_management import get_or_create_inner -from ._request_queue_client import RequestQueueClient - -if TYPE_CHECKING: - from ._memory_storage_client import MemoryStorageClient - - -class RequestQueueCollectionClient(BaseRequestQueueCollectionClient): - """Subclient for manipulating request queues.""" - - def __init__(self, *, memory_storage_client: MemoryStorageClient) -> None: - self._memory_storage_client = memory_storage_client - - @property - def _storage_client_cache(self) -> list[RequestQueueClient]: - return self._memory_storage_client.request_queues_handled - - @override - async def get_or_create( - self, - *, - name: str | None = None, - schema: dict | None = None, - id: str | None = None, - ) -> RequestQueueMetadata: - resource_client = await get_or_create_inner( - memory_storage_client=self._memory_storage_client, - storage_client_cache=self._storage_client_cache, - resource_client_class=RequestQueueClient, - name=name, - id=id, - ) - return resource_client.resource_info - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> RequestQueueListPage: - items = [storage.resource_info for storage in self._storage_client_cache] - - return RequestQueueListPage( - total=len(items), - count=len(items), - offset=0, - limit=len(items), - desc=False, - items=sorted(items, key=lambda item: item.created_at), - ) diff --git a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py new file mode 100644 index 0000000000..ad34d99638 --- /dev/null +++ b/src/crawlee/storage_clients/_memory/_storage_client.py @@ -0,0 +1,62 @@ +from __future__ import annotations + 
+from typing_extensions import override + +from crawlee.configuration import Configuration +from crawlee.storage_clients._base import StorageClient + +from ._dataset_client import MemoryDatasetClient +from ._key_value_store_client import MemoryKeyValueStoreClient +from ._request_queue_client import MemoryRequestQueueClient + + +class MemoryStorageClient(StorageClient): + """Memory storage client.""" + + @override + async def open_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> MemoryDatasetClient: + configuration = configuration or Configuration.get_global_configuration() + client = await MemoryDatasetClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start and client.metadata.name is None: + await client.purge() + + return client + + @override + async def open_key_value_store_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> MemoryKeyValueStoreClient: + configuration = configuration or Configuration.get_global_configuration() + client = await MemoryKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start and client.metadata.name is None: + await client.purge() + + return client + + @override + async def open_request_queue_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> MemoryRequestQueueClient: + configuration = configuration or Configuration.get_global_configuration() + client = await MemoryRequestQueueClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start and client.metadata.name is None: + await client.purge() + + return client diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index f016e24730..8b5f0c6d0a 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -1,7 +1,7 @@ from __future__ import annotations import json -from datetime import datetime +from datetime import datetime, timedelta from decimal import Decimal from typing import Annotated, Any, Generic @@ -26,10 +26,19 @@ class StorageMetadata(BaseModel): model_config = ConfigDict(populate_by_name=True, extra='allow') id: Annotated[str, Field(alias='id')] - name: Annotated[str | None, Field(alias='name', default='')] + """The unique identifier of the storage.""" + + name: Annotated[str | None, Field(alias='name', default=None)] + """The name of the storage.""" + accessed_at: Annotated[datetime, Field(alias='accessedAt')] + """The timestamp when the storage was last accessed.""" + created_at: Annotated[datetime, Field(alias='createdAt')] + """The timestamp when the storage was created.""" + modified_at: Annotated[datetime, Field(alias='modifiedAt')] + """The timestamp when the storage was last modified.""" @docs_group('Data structures') @@ -39,6 +48,7 @@ class DatasetMetadata(StorageMetadata): model_config = ConfigDict(populate_by_name=True) item_count: Annotated[int, Field(alias='itemCount')] + """The number of items in the dataset.""" @docs_group('Data structures') @@ -47,8 +57,6 @@ class KeyValueStoreMetadata(StorageMetadata): model_config = ConfigDict(populate_by_name=True) - user_id: Annotated[str, Field(alias='userId')] - @docs_group('Data structures') class RequestQueueMetadata(StorageMetadata): @@ -57,44 +65,51 @@ class RequestQueueMetadata(StorageMetadata): model_config = 
ConfigDict(populate_by_name=True) had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] + """Indicates whether the queue has been accessed by multiple clients (consumers).""" + handled_request_count: Annotated[int, Field(alias='handledRequestCount')] + """The number of requests that have been handled from the queue.""" + pending_request_count: Annotated[int, Field(alias='pendingRequestCount')] + """The number of requests that are still pending in the queue.""" + stats: Annotated[dict, Field(alias='stats')] + """Statistics about the request queue, TODO?""" + total_request_count: Annotated[int, Field(alias='totalRequestCount')] - user_id: Annotated[str, Field(alias='userId')] - resource_directory: Annotated[str, Field(alias='resourceDirectory')] + """The total number of requests that have been added to the queue.""" @docs_group('Data structures') -class KeyValueStoreRecord(BaseModel, Generic[KvsValueType]): - """Model for a key-value store record.""" +class KeyValueStoreRecordMetadata(BaseModel): + """Model for a key-value store record metadata.""" model_config = ConfigDict(populate_by_name=True) key: Annotated[str, Field(alias='key')] - value: Annotated[KvsValueType, Field(alias='value')] - content_type: Annotated[str | None, Field(alias='contentType', default=None)] - filename: Annotated[str | None, Field(alias='filename', default=None)] + """The key of the record. + A unique identifier for the record in the key-value store. + """ -@docs_group('Data structures') -class KeyValueStoreRecordMetadata(BaseModel): - """Model for a key-value store record metadata.""" + content_type: Annotated[str, Field(alias='contentType')] + """The MIME type of the record. - model_config = ConfigDict(populate_by_name=True) + Describe the format and type of data stored in the record, following the MIME specification. 
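With records now split into `KeyValueStoreRecordMetadata` (key, content type, size) and `KeyValueStoreRecord` (the metadata plus the value), the models can be built and inspected as sketched below; the key and size values are illustrative only:

```python
from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata

# Metadata describes a stored value without carrying its payload.
metadata = KeyValueStoreRecordMetadata(key='state.json', content_type='application/json', size=21)

# The full record extends the metadata with the value itself.
record = KeyValueStoreRecord(key='state.json', content_type='application/json', size=21, value={'visited': 3})

print(metadata.model_dump(by_alias=True))
print(record.value)
```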
+ """ - key: Annotated[str, Field(alias='key')] - content_type: Annotated[str, Field(alias='contentType')] + size: Annotated[int, Field(alias='size')] + """The size of the record in bytes.""" @docs_group('Data structures') -class KeyValueStoreKeyInfo(BaseModel): - """Model for a key-value store key info.""" +class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]): + """Model for a key-value store record.""" model_config = ConfigDict(populate_by_name=True) - key: Annotated[str, Field(alias='key')] - size: Annotated[int, Field(alias='size')] + value: Annotated[KvsValueType, Field(alias='value')] + """The value of the record.""" @docs_group('Data structures') @@ -104,11 +119,22 @@ class KeyValueStoreListKeysPage(BaseModel): model_config = ConfigDict(populate_by_name=True) count: Annotated[int, Field(alias='count')] + """The number of keys returned on this page.""" + limit: Annotated[int, Field(alias='limit')] + """The maximum number of keys to return.""" + is_truncated: Annotated[bool, Field(alias='isTruncated')] - items: Annotated[list[KeyValueStoreKeyInfo], Field(alias='items', default_factory=list)] + """Indicates whether there are more keys to retrieve.""" + exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey', default=None)] + """The key from which to start this page of results.""" + next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)] + """The key from which to start the next page of results.""" + + items: Annotated[list[KeyValueStoreRecordMetadata], Field(alias='items', default_factory=list)] + """The list of KVS items metadata returned on this page.""" @docs_group('Data structures') @@ -126,22 +152,31 @@ class RequestQueueHeadState(BaseModel): @docs_group('Data structures') class RequestQueueHead(BaseModel): - """Model for the request queue head.""" + """Model for request queue head. + + Represents a collection of requests retrieved from the beginning of a queue, + including metadata about the queue's state and lock information for the requests. 
+ """ model_config = ConfigDict(populate_by_name=True) limit: Annotated[int | None, Field(alias='limit', default=None)] - had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] + """The maximum number of requests that were requested from the queue.""" + + had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients', default=False)] + """Indicates whether the queue has been accessed by multiple clients (consumers).""" + queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] - items: Annotated[list[Request], Field(alias='items', default_factory=list)] + """The timestamp when the queue was last modified.""" + lock_time: Annotated[timedelta | None, Field(alias='lockSecs', default=None)] + """The duration for which the returned requests are locked and cannot be processed by other clients.""" -@docs_group('Data structures') -class RequestQueueHeadWithLocks(RequestQueueHead): - """Model for request queue head with locks.""" + queue_has_locked_requests: Annotated[bool | None, Field(alias='queueHasLockedRequests', default=False)] + """Indicates whether the queue contains any locked requests.""" - lock_secs: Annotated[int, Field(alias='lockSecs')] - queue_has_locked_requests: Annotated[bool | None, Field(alias='queueHasLockedRequests')] = None + items: Annotated[list[Request], Field(alias='items', default_factory=list[Request])] + """The list of request objects retrieved from the beginning of the queue.""" class _ListPage(BaseModel): @@ -230,13 +265,22 @@ class UnprocessedRequest(BaseModel): @docs_group('Data structures') -class BatchRequestsOperationResponse(BaseModel): - """Response to batch request deletion calls.""" +class AddRequestsResponse(BaseModel): + """Model for a response to add requests to a queue. + + Contains detailed information about the processing results when adding multiple requests + to a queue. This includes which requests were successfully processed and which ones + encountered issues during processing. 
+ """ model_config = ConfigDict(populate_by_name=True) processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')] + """Successfully processed requests, including information about whether they were + already present in the queue and whether they had been handled previously.""" + unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')] + """Requests that could not be processed, typically due to validation errors or other issues.""" class InternalRequest(BaseModel): @@ -275,3 +319,22 @@ def from_request(cls, request: Request, id: str, order_no: Decimal | None) -> In def to_request(self) -> Request: """Convert the internal request back to a `Request` object.""" return self.request + + +class CachedRequest(BaseModel): + """Pydantic model for cached request information.""" + + id: str + """The ID of the request.""" + + was_already_handled: bool + """Whether the request was already handled.""" + + hydrated: Request | None = None + """The hydrated request object (the original one).""" + + lock_expires_at: datetime | None = None + """The expiration time of the lock on the request.""" + + forefront: bool = False + """Whether the request was added to the forefront of the queue.""" diff --git a/src/crawlee/storages/_base.py b/src/crawlee/storages/_base.py index 08d2cbd7be..fc0a04979c 100644 --- a/src/crawlee/storages/_base.py +++ b/src/crawlee/storages/_base.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient - from crawlee.storage_clients.models import StorageMetadata + from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata class Storage(ABC): @@ -24,13 +24,8 @@ def name(self) -> str | None: @property @abstractmethod - def storage_object(self) -> StorageMetadata: - """Get the full storage object.""" - - @storage_object.setter - @abstractmethod - def storage_object(self, storage_object: StorageMetadata) -> None: - """Set the full storage object.""" + def metadata(self) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata: + """Get the storage metadata.""" @classmethod @abstractmethod @@ -55,3 +50,11 @@ async def open( @abstractmethod async def drop(self) -> None: """Drop the storage, removing it from the underlying storage client and clearing the cache.""" + + @abstractmethod + async def purge(self) -> None: + """Purge the storage, removing all items from the underlying storage client. + + This method does not remove the storage itself, e.g. don't remove the metadata, + but clears all items within it. 
+ """ diff --git a/src/crawlee/storages/_creation_management.py b/src/crawlee/storages/_creation_management.py deleted file mode 100644 index 14d9b1719e..0000000000 --- a/src/crawlee/storages/_creation_management.py +++ /dev/null @@ -1,231 +0,0 @@ -from __future__ import annotations - -import asyncio -from typing import TYPE_CHECKING, TypeVar -from weakref import WeakKeyDictionary - -from crawlee.storage_clients import MemoryStorageClient - -from ._dataset import Dataset -from ._key_value_store import KeyValueStore -from ._request_queue import RequestQueue - -if TYPE_CHECKING: - from crawlee.configuration import Configuration - from crawlee.storage_clients._base import ResourceClient, ResourceCollectionClient, StorageClient - -TResource = TypeVar('TResource', Dataset, KeyValueStore, RequestQueue) - - -_creation_locks = WeakKeyDictionary[asyncio.AbstractEventLoop, asyncio.Lock]() -"""Locks for storage creation (we need a separate lock for every event loop so that tests work as expected).""" - -_cache_dataset_by_id: dict[str, Dataset] = {} -_cache_dataset_by_name: dict[str, Dataset] = {} -_cache_kvs_by_id: dict[str, KeyValueStore] = {} -_cache_kvs_by_name: dict[str, KeyValueStore] = {} -_cache_rq_by_id: dict[str, RequestQueue] = {} -_cache_rq_by_name: dict[str, RequestQueue] = {} - - -def _get_from_cache_by_name( - storage_class: type[TResource], - name: str, -) -> TResource | None: - """Try to restore storage from cache by name.""" - if issubclass(storage_class, Dataset): - return _cache_dataset_by_name.get(name) - if issubclass(storage_class, KeyValueStore): - return _cache_kvs_by_name.get(name) - if issubclass(storage_class, RequestQueue): - return _cache_rq_by_name.get(name) - raise ValueError(f'Unknown storage class: {storage_class.__name__}') - - -def _get_from_cache_by_id( - storage_class: type[TResource], - id: str, -) -> TResource | None: - """Try to restore storage from cache by ID.""" - if issubclass(storage_class, Dataset): - return _cache_dataset_by_id.get(id) - if issubclass(storage_class, KeyValueStore): - return _cache_kvs_by_id.get(id) - if issubclass(storage_class, RequestQueue): - return _cache_rq_by_id.get(id) - raise ValueError(f'Unknown storage: {storage_class.__name__}') - - -def _add_to_cache_by_name(name: str, storage: TResource) -> None: - """Add storage to cache by name.""" - if isinstance(storage, Dataset): - _cache_dataset_by_name[name] = storage - elif isinstance(storage, KeyValueStore): - _cache_kvs_by_name[name] = storage - elif isinstance(storage, RequestQueue): - _cache_rq_by_name[name] = storage - else: - raise TypeError(f'Unknown storage: {storage}') - - -def _add_to_cache_by_id(id: str, storage: TResource) -> None: - """Add storage to cache by ID.""" - if isinstance(storage, Dataset): - _cache_dataset_by_id[id] = storage - elif isinstance(storage, KeyValueStore): - _cache_kvs_by_id[id] = storage - elif isinstance(storage, RequestQueue): - _cache_rq_by_id[id] = storage - else: - raise TypeError(f'Unknown storage: {storage}') - - -def _rm_from_cache_by_id(storage_class: type, id: str) -> None: - """Remove a storage from cache by ID.""" - try: - if issubclass(storage_class, Dataset): - del _cache_dataset_by_id[id] - elif issubclass(storage_class, KeyValueStore): - del _cache_kvs_by_id[id] - elif issubclass(storage_class, RequestQueue): - del _cache_rq_by_id[id] - else: - raise TypeError(f'Unknown storage class: {storage_class.__name__}') - except KeyError as exc: - raise RuntimeError(f'Storage with provided ID was not found ({id}).') from exc - - -def 
_rm_from_cache_by_name(storage_class: type, name: str) -> None: - """Remove a storage from cache by name.""" - try: - if issubclass(storage_class, Dataset): - del _cache_dataset_by_name[name] - elif issubclass(storage_class, KeyValueStore): - del _cache_kvs_by_name[name] - elif issubclass(storage_class, RequestQueue): - del _cache_rq_by_name[name] - else: - raise TypeError(f'Unknown storage class: {storage_class.__name__}') - except KeyError as exc: - raise RuntimeError(f'Storage with provided name was not found ({name}).') from exc - - -def _get_default_storage_id(configuration: Configuration, storage_class: type[TResource]) -> str: - if issubclass(storage_class, Dataset): - return configuration.default_dataset_id - if issubclass(storage_class, KeyValueStore): - return configuration.default_key_value_store_id - if issubclass(storage_class, RequestQueue): - return configuration.default_request_queue_id - - raise TypeError(f'Unknown storage class: {storage_class.__name__}') - - -async def open_storage( - *, - storage_class: type[TResource], - id: str | None, - name: str | None, - configuration: Configuration, - storage_client: StorageClient, -) -> TResource: - """Open either a new storage or restore an existing one and return it.""" - # Try to restore the storage from cache by name - if name: - cached_storage = _get_from_cache_by_name(storage_class=storage_class, name=name) - if cached_storage: - return cached_storage - - default_id = _get_default_storage_id(configuration, storage_class) - - if not id and not name: - id = default_id - - # Find out if the storage is a default on memory storage - is_default_on_memory = id == default_id and isinstance(storage_client, MemoryStorageClient) - - # Try to restore storage from cache by ID - if id: - cached_storage = _get_from_cache_by_id(storage_class=storage_class, id=id) - if cached_storage: - return cached_storage - - # Purge on start if configured - if configuration.purge_on_start: - await storage_client.purge_on_start() - - # Lock and create new storage - loop = asyncio.get_running_loop() - if loop not in _creation_locks: - _creation_locks[loop] = asyncio.Lock() - - async with _creation_locks[loop]: - if id and not is_default_on_memory: - resource_client = _get_resource_client(storage_class, storage_client, id) - storage_object = await resource_client.get() - if not storage_object: - raise RuntimeError(f'{storage_class.__name__} with id "{id}" does not exist!') - - elif is_default_on_memory: - resource_collection_client = _get_resource_collection_client(storage_class, storage_client) - storage_object = await resource_collection_client.get_or_create(name=name, id=id) - - else: - resource_collection_client = _get_resource_collection_client(storage_class, storage_client) - storage_object = await resource_collection_client.get_or_create(name=name) - - storage = storage_class.from_storage_object(storage_client=storage_client, storage_object=storage_object) - - # Cache the storage by ID and name - _add_to_cache_by_id(storage.id, storage) - if storage.name is not None: - _add_to_cache_by_name(storage.name, storage) - - return storage - - -def remove_storage_from_cache( - *, - storage_class: type, - id: str | None = None, - name: str | None = None, -) -> None: - """Remove a storage from cache by ID or name.""" - if id: - _rm_from_cache_by_id(storage_class=storage_class, id=id) - - if name: - _rm_from_cache_by_name(storage_class=storage_class, name=name) - - -def _get_resource_client( - storage_class: type[TResource], - storage_client: StorageClient, - 
id: str, -) -> ResourceClient: - if issubclass(storage_class, Dataset): - return storage_client.dataset(id) - - if issubclass(storage_class, KeyValueStore): - return storage_client.key_value_store(id) - - if issubclass(storage_class, RequestQueue): - return storage_client.request_queue(id) - - raise ValueError(f'Unknown storage class label: {storage_class.__name__}') - - -def _get_resource_collection_client( - storage_class: type, - storage_client: StorageClient, -) -> ResourceCollectionClient: - if issubclass(storage_class, Dataset): - return storage_client.datasets() - - if issubclass(storage_class, KeyValueStore): - return storage_client.key_value_stores() - - if issubclass(storage_class, RequestQueue): - return storage_client.request_queues() - - raise ValueError(f'Unknown storage class: {storage_class.__name__}') diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index 7cb58ae817..a46673fe65 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -1,243 +1,104 @@ from __future__ import annotations -import csv -import io -import json import logging -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Literal, TextIO, TypedDict, cast +from io import StringIO +from typing import TYPE_CHECKING, overload -from typing_extensions import NotRequired, Required, Unpack, override +from typing_extensions import override from crawlee import service_locator -from crawlee._utils.byte_size import ByteSize from crawlee._utils.docs import docs_group -from crawlee._utils.file import json_dumps -from crawlee.storage_clients.models import DatasetMetadata, StorageMetadata +from crawlee._utils.file import export_csv_to_stream, export_json_to_stream from ._base import Storage from ._key_value_store import KeyValueStore if TYPE_CHECKING: - from collections.abc import AsyncIterator, Callable + from collections.abc import AsyncIterator + from typing import Any, ClassVar, Literal + + from typing_extensions import Unpack - from crawlee._types import JsonSerializable, PushDataKwargs from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient - from crawlee.storage_clients.models import DatasetItemsListPage - -logger = logging.getLogger(__name__) - - -class GetDataKwargs(TypedDict): - """Keyword arguments for dataset's `get_data` method.""" - - offset: NotRequired[int] - """Skip the specified number of items at the start.""" - - limit: NotRequired[int] - """The maximum number of items to retrieve. Unlimited if None.""" - - clean: NotRequired[bool] - """Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.""" - - desc: NotRequired[bool] - """Set to True to sort results in descending order.""" - - fields: NotRequired[list[str]] - """Fields to include in each item. 
Sorts fields as specified if provided.""" - - omit: NotRequired[list[str]] - """Fields to exclude from each item.""" - - unwind: NotRequired[str] - """Unwind items by a specified array field, turning each element into a separate item.""" - - skip_empty: NotRequired[bool] - """Exclude empty items from the results if True.""" - - skip_hidden: NotRequired[bool] - """Exclude fields starting with '#' if True.""" - - flatten: NotRequired[list[str]] - """Field to be flattened in returned items.""" - - view: NotRequired[str] - """Specify the dataset view to be used.""" - - -class ExportToKwargs(TypedDict): - """Keyword arguments for dataset's `export_to` method.""" - - key: Required[str] - """The key under which to save the data.""" - - content_type: NotRequired[Literal['json', 'csv']] - """The format in which to export the data. Either 'json' or 'csv'.""" - - to_key_value_store_id: NotRequired[str] - """ID of the key-value store to save the exported file.""" - - to_key_value_store_name: NotRequired[str] - """Name of the key-value store to save the exported file.""" - - -class ExportDataJsonKwargs(TypedDict): - """Keyword arguments for dataset's `export_data_json` method.""" - - skipkeys: NotRequired[bool] - """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped - instead of raising a `TypeError`.""" - - ensure_ascii: NotRequired[bool] - """Determines if non-ASCII characters should be escaped in the output JSON string.""" - - check_circular: NotRequired[bool] - """If False (default: True), skips the circular reference check for container types. A circular reference will - result in a `RecursionError` or worse if unchecked.""" - - allow_nan: NotRequired[bool] - """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply - with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity).""" - - cls: NotRequired[type[json.JSONEncoder]] - """Allows specifying a custom JSON encoder.""" - - indent: NotRequired[int] - """Specifies the number of spaces for indentation in the pretty-printed JSON output.""" - - separators: NotRequired[tuple[str, str]] - """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ') - otherwise.""" - - default: NotRequired[Callable] - """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version - of the object or raise a `TypeError`.""" - - sort_keys: NotRequired[bool] - """Specifies whether the output JSON object should have keys sorted alphabetically.""" - - -class ExportDataCsvKwargs(TypedDict): - """Keyword arguments for dataset's `export_data_csv` method.""" - - dialect: NotRequired[str] - """Specifies a dialect to be used in CSV parsing and writing.""" - - delimiter: NotRequired[str] - """A one-character string used to separate fields. Defaults to ','.""" - - doublequote: NotRequired[bool] - """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled; - when False, the `escapechar` is used as a prefix. Defaults to True.""" - - escapechar: NotRequired[str] - """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar` - if `doublequote` is False. Defaults to None, disabling escaping.""" - - lineterminator: NotRequired[str] - """The string used to terminate lines produced by the writer. 
Defaults to '\\r\\n'.""" - - quotechar: NotRequired[str] - """A one-character string used to quote fields containing special characters, like the delimiter or quotechar, - or fields containing new-line characters. Defaults to '\"'.""" + from crawlee.storage_clients._base import DatasetClient + from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata - quoting: NotRequired[int] - """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of - the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`.""" + from ._types import ExportDataCsvKwargs, ExportDataJsonKwargs - skipinitialspace: NotRequired[bool] - """When True, spaces immediately following the delimiter are ignored. Defaults to False.""" - - strict: NotRequired[bool] - """When True, raises an exception on bad CSV input. Defaults to False.""" +logger = logging.getLogger(__name__) @docs_group('Classes') class Dataset(Storage): - """Represents an append-only structured storage, ideal for tabular data similar to database tables. - - The `Dataset` class is designed to store structured data, where each entry (row) maintains consistent attributes - (columns) across the dataset. It operates in an append-only mode, allowing new records to be added, but not - modified or deleted. This makes it particularly useful for storing results from web crawling operations. + """Dataset is a storage for managing structured tabular data. - Data can be stored either locally or in the cloud. It depends on the setup of underlying storage client. - By default a `MemoryStorageClient` is used, but it can be changed to a different one. + The dataset class provides a high-level interface for storing and retrieving structured data + with consistent schema, similar to database tables or spreadsheets. It abstracts the underlying + storage implementation details, offering a consistent API regardless of where the data is + physically stored. - By default, data is stored using the following path structure: - ``` - {CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json - ``` - - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable. - - `{DATASET_ID}`: Specifies the dataset, either "default" or a custom dataset ID. - - `{INDEX}`: Represents the zero-based index of the record within the dataset. + Dataset operates in an append-only mode, allowing new records to be added but not modified + or deleted after creation. This makes it particularly suitable for storing crawling results + and other data that should be immutable once collected. - To open a dataset, use the `open` class method by specifying an `id`, `name`, or `configuration`. If none are - provided, the default dataset for the current crawler run is used. Attempting to open a dataset by `id` that does - not exist will raise an error; however, if accessed by `name`, the dataset will be created if it doesn't already - exist. + The class provides methods for adding data, retrieving data with various filtering options, + and exporting data to different formats. You can create a dataset using the `open` class method, + specifying either a name or ID. The underlying storage implementation is determined by + the configured storage client. 
### Usage ```python from crawlee.storages import Dataset + # Open a dataset dataset = await Dataset.open(name='my_dataset') + + # Add data + await dataset.push_data({'title': 'Example Product', 'price': 99.99}) + + # Retrieve filtered data + results = await dataset.get_data(limit=10, desc=True) + + # Export data + await dataset.export_to('results.json', content_type='json') ``` """ - _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9) - """Maximum size for a single payload.""" + _cache_by_id: ClassVar[dict[str, Dataset]] = {} + """A dictionary to cache datasets by ID.""" - _SAFETY_BUFFER_PERCENT = 0.01 / 100 # 0.01% - """Percentage buffer to reduce payload limit slightly for safety.""" + _cache_by_name: ClassVar[dict[str, Dataset]] = {} + """A dictionary to cache datasets by name.""" - _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_PERCENT) - """Calculated payload limit considering safety buffer.""" - - def __init__(self, id: str, name: str | None, storage_client: StorageClient) -> None: - self._id = id - self._name = name - datetime_now = datetime.now(timezone.utc) - self._storage_object = StorageMetadata( - id=id, name=name, accessed_at=datetime_now, created_at=datetime_now, modified_at=datetime_now - ) + _default_instance: ClassVar[Dataset | None] = None + """Cache for the default dataset instance.""" - # Get resource clients from the storage client. - self._resource_client = storage_client.dataset(self._id) - self._resource_collection_client = storage_client.datasets() + def __init__(self, client: DatasetClient) -> None: + """Initialize a new instance. - @classmethod - def from_storage_object(cls, storage_client: StorageClient, storage_object: StorageMetadata) -> Dataset: - """Initialize a new instance of Dataset from a storage metadata object.""" - dataset = Dataset( - id=storage_object.id, - name=storage_object.name, - storage_client=storage_client, - ) + Preferably use the `Dataset.open` constructor to create a new instance. - dataset.storage_object = storage_object - return dataset + Args: + client: An instance of a dataset client. 
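`Dataset.open()` performs exactly this wiring internally; constructing the wrapper by hand is mainly useful when you already hold a concrete client, for example one produced by `MemoryStorageClient.open_dataset_client()`. A sketch, assuming `MemoryStorageClient` is exported from `crawlee.storage_clients`; the storage name is a placeholder:

```python
import asyncio

from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    # The storage client opens the low-level client (and purges unnamed default
    # storages when purge_on_start is enabled).
    client = await MemoryStorageClient().open_dataset_client(name='adhoc')

    # The Dataset wrapper adds the high-level API on top of the client.
    dataset = Dataset(client)
    await dataset.push_data({'ok': True})
    print(dataset.metadata.name)


asyncio.run(main())
```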
+ """ + self._client = client - @property @override + @property def id(self) -> str: - return self._id + return self._client.metadata.id - @property @override - def name(self) -> str | None: - return self._name - @property - @override - def storage_object(self) -> StorageMetadata: - return self._storage_object + def name(self) -> str | None: + return self._client.metadata.name - @storage_object.setter @override - def storage_object(self, storage_object: StorageMetadata) -> None: - self._storage_object = storage_object + @property + def metadata(self) -> DatasetMetadata: + return self._client.metadata @override @classmethod @@ -249,27 +110,55 @@ async def open( configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> Dataset: - from crawlee.storages._creation_management import open_storage + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Check for default instance if no id or name provided + if id is None and name is None and cls._default_instance is not None: + return cls._default_instance + + # Check if the dataset is already cached + if id is not None and id in cls._cache_by_id: + return cls._cache_by_id[id] + if name is not None and name in cls._cache_by_name: + return cls._cache_by_name[name] - configuration = configuration or service_locator.get_configuration() - storage_client = storage_client or service_locator.get_storage_client() + configuration = service_locator.get_configuration() if configuration is None else configuration + storage_client = service_locator.get_storage_client() if storage_client is None else storage_client - return await open_storage( - storage_class=cls, + client = await storage_client.open_dataset_client( id=id, name=name, configuration=configuration, - storage_client=storage_client, ) + dataset = cls(client) + + # Cache the dataset instance by ID and name + cls._cache_by_id[dataset.id] = dataset + if dataset.name is not None: + cls._cache_by_name[dataset.name] = dataset + + # Store as default instance if neither id nor name was provided + if id is None and name is None: + cls._default_instance = dataset + + return dataset @override async def drop(self) -> None: - from crawlee.storages._creation_management import remove_storage_from_cache + if self.id in self._cache_by_id: + del self._cache_by_id[self.id] + + if self.name in self._cache_by_name: + del self._cache_by_name[self.name] - await self._resource_client.delete() - remove_storage_from_cache(storage_class=self.__class__, id=self._id, name=self._name) + await self._client.drop() - async def push_data(self, data: JsonSerializable, **kwargs: Unpack[PushDataKwargs]) -> None: + @override + async def purge(self) -> None: + await self._client.purge() + + async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: """Store an object or an array of objects to the dataset. The size of the data is limited by the receiving API and therefore `push_data()` will only @@ -279,127 +168,65 @@ async def push_data(self, data: JsonSerializable, **kwargs: Unpack[PushDataKwarg Args: data: A JSON serializable data structure to be stored in the dataset. The JSON representation of each item must be smaller than 9MB. - kwargs: Keyword arguments for the storage client method. 
""" - # Handle singular items - if not isinstance(data, list): - items = await self.check_and_serialize(data) - return await self._resource_client.push_items(items, **kwargs) - - # Handle lists - payloads_generator = (await self.check_and_serialize(item, index) for index, item in enumerate(data)) - - # Invoke client in series to preserve the order of data - async for items in self._chunk_by_size(payloads_generator): - await self._resource_client.push_items(items, **kwargs) - - return None + await self._client.push_data(data=data) - async def get_data(self, **kwargs: Unpack[GetDataKwargs]) -> DatasetItemsListPage: - """Retrieve dataset items based on filtering, sorting, and pagination parameters. + async def get_data( + self, + *, + offset: int = 0, + limit: int | None = 999_999_999_999, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + flatten: list[str] | None = None, + view: str | None = None, + ) -> DatasetItemsListPage: + """Retrieve a paginated list of items from a dataset based on various filtering parameters. - This method allows customization of the data retrieval process from a dataset, supporting operations such as - field selection, ordering, and skipping specific records based on provided parameters. + This method provides the flexibility to filter, sort, and modify the appearance of dataset items + when listed. Each parameter modifies the result set according to its purpose. The method also + supports pagination through 'offset' and 'limit' parameters. Args: - kwargs: Keyword arguments for the storage client method. + offset: Skips the specified number of items at the start. + limit: The maximum number of items to retrieve. Unlimited if None. + clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty. + desc: Set to True to sort results in descending order. + fields: Fields to include in each item. Sorts fields as specified if provided. + omit: Fields to exclude from each item. + unwind: Unwinds items by a specified array field, turning each element into a separate item. + skip_empty: Excludes empty items from the results if True. + skip_hidden: Excludes fields starting with '#' if True. + flatten: Fields to be flattened in returned items. + view: Specifies the dataset view to be used. Returns: - List page containing filtered and paginated dataset items. + An object with filtered, sorted, and paginated dataset items plus pagination details. """ - return await self._resource_client.list_items(**kwargs) - - async def write_to_csv(self, destination: TextIO, **kwargs: Unpack[ExportDataCsvKwargs]) -> None: - """Export the entire dataset into an arbitrary stream. - - Args: - destination: The stream into which the dataset contents should be written. - kwargs: Additional keyword arguments for `csv.writer`. 
- """ - items: list[dict] = [] - limit = 1000 - offset = 0 - - while True: - list_items = await self._resource_client.list_items(limit=limit, offset=offset) - items.extend(list_items.items) - if list_items.total <= offset + list_items.count: - break - offset += list_items.count - - if items: - writer = csv.writer(destination, **kwargs) - writer.writerows([items[0].keys(), *[item.values() for item in items]]) - else: - logger.warning('Attempting to export an empty dataset - no file will be created') - - async def write_to_json(self, destination: TextIO, **kwargs: Unpack[ExportDataJsonKwargs]) -> None: - """Export the entire dataset into an arbitrary stream. - - Args: - destination: The stream into which the dataset contents should be written. - kwargs: Additional keyword arguments for `json.dump`. - """ - items: list[dict] = [] - limit = 1000 - offset = 0 - - while True: - list_items = await self._resource_client.list_items(limit=limit, offset=offset) - items.extend(list_items.items) - if list_items.total <= offset + list_items.count: - break - offset += list_items.count - - if items: - json.dump(items, destination, **kwargs) - else: - logger.warning('Attempting to export an empty dataset - no file will be created') - - async def export_to(self, **kwargs: Unpack[ExportToKwargs]) -> None: - """Export the entire dataset into a specified file stored under a key in a key-value store. - - This method consolidates all entries from a specified dataset into one file, which is then saved under a - given key in a key-value store. The format of the exported file is determined by the `content_type` parameter. - Either the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or - name should be used. - - Args: - kwargs: Keyword arguments for the storage client method. - """ - key = cast('str', kwargs.get('key')) - content_type = kwargs.get('content_type', 'json') - to_key_value_store_id = kwargs.get('to_key_value_store_id') - to_key_value_store_name = kwargs.get('to_key_value_store_name') - - key_value_store = await KeyValueStore.open(id=to_key_value_store_id, name=to_key_value_store_name) - - output = io.StringIO() - if content_type == 'csv': - await self.write_to_csv(output) - elif content_type == 'json': - await self.write_to_json(output) - else: - raise ValueError('Unsupported content type, expecting CSV or JSON') - - if content_type == 'csv': - await key_value_store.set_value(key, output.getvalue(), 'text/csv') - - if content_type == 'json': - await key_value_store.set_value(key, output.getvalue(), 'application/json') - - async def get_info(self) -> DatasetMetadata | None: - """Get an object containing general information about the dataset.""" - metadata = await self._resource_client.get() - if isinstance(metadata, DatasetMetadata): - return metadata - return None + return await self._client.get_data( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + flatten=flatten, + view=view, + ) async def iterate_items( self, *, offset: int = 0, - limit: int | None = None, + limit: int | None = 999_999_999_999, clean: bool = False, desc: bool = False, fields: list[str] | None = None, @@ -408,27 +235,29 @@ async def iterate_items( skip_empty: bool = False, skip_hidden: bool = False, ) -> AsyncIterator[dict]: - """Iterate over dataset items, applying filtering, sorting, and pagination. 
+ """Iterate over items in the dataset according to specified filters and sorting. - Retrieve dataset items incrementally, allowing fine-grained control over the data fetched. The function - supports various parameters to filter, sort, and limit the data returned, facilitating tailored dataset - queries. + This method allows for asynchronously iterating through dataset items while applying various filters such as + skipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit` + parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and + `skip_hidden` parameters. Args: - offset: Initial number of items to skip. - limit: Max number of items to return. No limit if None. - clean: Filter out empty items and hidden fields if True. - desc: Return items in reverse order if True. - fields: Specific fields to include in each item. - omit: Fields to omit from each item. - unwind: Field name to unwind items by. - skip_empty: Omits empty items if True. + offset: Skips the specified number of items at the start. + limit: The maximum number of items to retrieve. Unlimited if None. + clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty. + desc: Set to True to sort results in descending order. + fields: Fields to include in each item. Sorts fields as specified if provided. + omit: Fields to exclude from each item. + unwind: Unwinds items by a specified array field, turning each element into a separate item. + skip_empty: Excludes empty items from the results if True. skip_hidden: Excludes fields starting with '#' if True. Yields: - Each item from the dataset as a dictionary. + An asynchronous iterator of dictionary objects, each representing a dataset item after applying + the specified filters and transformations. """ - async for item in self._resource_client.iterate_items( + async for item in self._client.iterate_items( offset=offset, limit=limit, clean=clean, @@ -441,59 +270,121 @@ async def iterate_items( ): yield item - @classmethod - async def check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str: - """Serialize a given item to JSON, checks its serializability and size against a limit. + async def list_items( + self, + *, + offset: int = 0, + limit: int | None = 999_999_999_999, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + ) -> list[dict]: + """Retrieve a list of all items from the dataset according to specified filters and sorting. + + This method collects all dataset items into a list while applying various filters such as + skipping empty items, hiding specific fields, and sorting. It supports pagination via `offset` and `limit` + parameters, and can modify the appearance of dataset items using `fields`, `omit`, `unwind`, `skip_empty`, and + `skip_hidden` parameters. Args: - item: The item to serialize. - index: Index of the item, used for error context. + offset: Skips the specified number of items at the start. + limit: The maximum number of items to retrieve. Unlimited if None. + clean: Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty. + desc: Set to True to sort results in descending order. + fields: Fields to include in each item. Sorts fields as specified if provided. + omit: Fields to exclude from each item. 
+ unwind: Unwinds items by a specified array field, turning each element into a separate item. + skip_empty: Excludes empty items from the results if True. + skip_hidden: Excludes fields starting with '#' if True. Returns: - Serialized JSON string. - - Raises: - ValueError: If item is not JSON serializable or exceeds size limit. + A list of dictionary objects, each representing a dataset item after applying + the specified filters and transformations. """ - s = ' ' if index is None else f' at index {index} ' - - try: - payload = await json_dumps(item) - except Exception as exc: - raise ValueError(f'Data item{s}is not serializable to JSON.') from exc - - payload_size = ByteSize(len(payload.encode('utf-8'))) - if payload_size > cls._EFFECTIVE_LIMIT_SIZE: - raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})') - - return payload - - async def _chunk_by_size(self, items: AsyncIterator[str]) -> AsyncIterator[str]: - """Yield chunks of JSON arrays composed of input strings, respecting a size limit. + return [ + item + async for item in self.iterate_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + ) + ] + + @overload + async def export_to( + self, + key: str, + content_type: Literal['json'], + to_kvs_id: str | None = None, + to_kvs_name: str | None = None, + to_kvs_storage_client: StorageClient | None = None, + to_kvs_configuration: Configuration | None = None, + **kwargs: Unpack[ExportDataJsonKwargs], + ) -> None: ... + + @overload + async def export_to( + self, + key: str, + content_type: Literal['csv'], + to_kvs_id: str | None = None, + to_kvs_name: str | None = None, + to_kvs_storage_client: StorageClient | None = None, + to_kvs_configuration: Configuration | None = None, + **kwargs: Unpack[ExportDataCsvKwargs], + ) -> None: ... + + async def export_to( + self, + key: str, + content_type: Literal['json', 'csv'] = 'json', + to_kvs_id: str | None = None, + to_kvs_name: str | None = None, + to_kvs_storage_client: StorageClient | None = None, + to_kvs_configuration: Configuration | None = None, + **kwargs: Any, + ) -> None: + """Export the entire dataset into a specified file stored under a key in a key-value store. - Groups an iterable of JSON string payloads into larger JSON arrays, ensuring the total size - of each array does not exceed `EFFECTIVE_LIMIT_SIZE`. Each output is a JSON array string that - contains as many payloads as possible without breaching the size threshold, maintaining the - order of the original payloads. Assumes individual items are below the size limit. + This method consolidates all entries from a specified dataset into one file, which is then saved under a + given key in a key-value store. The format of the exported file is determined by the `content_type` parameter. + Either the dataset's ID or name should be specified, and similarly, either the target key-value store's ID or + name should be used. Args: - items: Iterable of JSON string payloads. - - Yields: - Strings representing JSON arrays of payloads, each staying within the size limit. + key: The key under which to save the data in the key-value store. + content_type: The format in which to export the data. + to_kvs_id: ID of the key-value store to save the exported file. + Specify only one of ID or name. + to_kvs_name: Name of the key-value store to save the exported file. + Specify only one of ID or name. 
+ to_kvs_storage_client: Storage client to use for the key-value store. + to_kvs_configuration: Configuration for the key-value store. + kwargs: Additional parameters for the export operation, specific to the chosen content type. """ - last_chunk_size = ByteSize(2) # Add 2 bytes for [] wrapper. - current_chunk = [] - - async for payload in items: - payload_size = ByteSize(len(payload.encode('utf-8'))) - - if last_chunk_size + payload_size <= self._EFFECTIVE_LIMIT_SIZE: - current_chunk.append(payload) - last_chunk_size += payload_size + ByteSize(1) # Add 1 byte for ',' separator. - else: - yield f'[{",".join(current_chunk)}]' - current_chunk = [payload] - last_chunk_size = payload_size + ByteSize(2) # Add 2 bytes for [] wrapper. + kvs = await KeyValueStore.open( + id=to_kvs_id, + name=to_kvs_name, + configuration=to_kvs_configuration, + storage_client=to_kvs_storage_client, + ) + dst = StringIO() - yield f'[{",".join(current_chunk)}]' + if content_type == 'csv': + await export_csv_to_stream(self.iterate_items(), dst, **kwargs) + await kvs.set_value(key, dst.getvalue(), 'text/csv') + elif content_type == 'json': + await export_json_to_stream(self.iterate_items(), dst, **kwargs) + await kvs.set_value(key, dst.getvalue(), 'application/json') + else: + raise ValueError('Unsupported content type, expecting CSV or JSON') diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index fc077726d1..c24e9a5418 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -2,7 +2,6 @@ import asyncio from collections.abc import AsyncIterator -from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, overload @@ -12,7 +11,7 @@ from crawlee import service_locator from crawlee._types import JsonSerializable # noqa: TC001 from crawlee._utils.docs import docs_group -from crawlee.storage_clients.models import KeyValueStoreKeyInfo, KeyValueStoreMetadata, StorageMetadata +from crawlee.storage_clients.models import KeyValueStoreMetadata from ._base import Storage @@ -22,6 +21,8 @@ from crawlee._utils.recoverable_state import RecoverableState from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient + from crawlee.storage_clients._base import KeyValueStoreClient + from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata T = TypeVar('T') @@ -34,94 +35,78 @@ class AutosavedValue(RootModel): @docs_group('Classes') class KeyValueStore(Storage): - """Represents a key-value based storage for reading and writing data records or files. + """Key-value store is a storage for reading and writing data records with unique key identifiers. - Each data record is identified by a unique key and associated with a specific MIME content type. This class is - commonly used in crawler runs to store inputs and outputs, typically in JSON format, but it also supports other - content types. + The key-value store class acts as a high-level interface for storing, retrieving, and managing data records + identified by unique string keys. It abstracts away the underlying storage implementation details, + allowing you to work with the same API regardless of whether data is stored in memory, on disk, + or in the cloud. - Data can be stored either locally or in the cloud. It depends on the setup of underlying storage client. - By default a `MemoryStorageClient` is used, but it can be changed to a different one. 
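Putting the new `iterate_items` and `export_to` methods together: items can be streamed without materializing the whole dataset, and the consolidated export lands in a key-value store (the default one when no `to_kvs_*` arguments are given). A short sketch with placeholder names and data:

```python
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    dataset = await Dataset.open(name='products')
    await dataset.push_data([{'title': 'Widget', 'price': 1.5}, {'title': 'Gadget', 'price': 2.5}])

    # Stream items one by one, skipping empty items and '#'-prefixed fields.
    async for item in dataset.iterate_items(skip_empty=True, skip_hidden=True):
        print(item)

    # Consolidate the whole dataset into a CSV file stored in the default
    # key-value store under the key 'products.csv'.
    await dataset.export_to('products.csv', content_type='csv')


asyncio.run(main())
```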
+ Each data record is associated with a specific MIME content type, allowing storage of various + data formats such as JSON, text, images, HTML snapshots or any binary data. This class is + commonly used to store inputs, outputs, and other artifacts of crawler operations. - By default, data is stored using the following path structure: - ``` - {CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT} - ``` - - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable. - - `{STORE_ID}`: The identifier for the key-value store, either "default" or as specified by - `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`. - - `{KEY}`: The unique key for the record. - - `{EXT}`: The file extension corresponding to the MIME type of the content. - - To open a key-value store, use the `open` class method, providing an `id`, `name`, or optional `configuration`. - If none are specified, the default store for the current crawler run is used. Attempting to open a store by `id` - that does not exist will raise an error; however, if accessed by `name`, the store will be created if it does not - already exist. + You can instantiate a key-value store using the `open` class method, which will create a store + with the specified name or id. The underlying storage implementation is determined by the configured + storage client. ### Usage ```python from crawlee.storages import KeyValueStore - kvs = await KeyValueStore.open(name='my_kvs') + # Open a named key-value store + kvs = await KeyValueStore.open(name='my-store') + + # Store and retrieve data + await kvs.set_value('product-1234.json', [{'name': 'Smartphone', 'price': 799.99}]) + product = await kvs.get_value('product-1234') ``` """ - # Cache for recoverable (auto-saved) values + _cache_by_id: ClassVar[dict[str, KeyValueStore]] = {} + """A dictionary to cache key-value stores by ID.""" + + _cache_by_name: ClassVar[dict[str, KeyValueStore]] = {} + """A dictionary to cache key-value stores by name.""" + + _default_instance: ClassVar[KeyValueStore | None] = None + """Cache for the default key-value store instance.""" + _autosaved_values: ClassVar[ dict[ str, dict[str, RecoverableState[AutosavedValue]], ] ] = {} + """Cache for recoverable (auto-saved) values.""" - def __init__(self, id: str, name: str | None, storage_client: StorageClient) -> None: - self._id = id - self._name = name - datetime_now = datetime.now(timezone.utc) - self._storage_object = StorageMetadata( - id=id, name=name, accessed_at=datetime_now, created_at=datetime_now, modified_at=datetime_now - ) - - # Get resource clients from storage client - self._resource_client = storage_client.key_value_store(self._id) - self._autosave_lock = asyncio.Lock() + def __init__(self, client: KeyValueStoreClient) -> None: + """Initialize a new instance. - @classmethod - def from_storage_object(cls, storage_client: StorageClient, storage_object: StorageMetadata) -> KeyValueStore: - """Initialize a new instance of KeyValueStore from a storage metadata object.""" - key_value_store = KeyValueStore( - id=storage_object.id, - name=storage_object.name, - storage_client=storage_client, - ) + Preferably use the `KeyValueStore.open` constructor to create a new instance. - key_value_store.storage_object = storage_object - return key_value_store + Args: + client: An instance of a key-value store client. 
+ """ + self._client = client + self._autosave_lock = asyncio.Lock() + self._persist_state_event_started = False - @property @override + @property def id(self) -> str: - return self._id + return self._client.metadata.id - @property @override - def name(self) -> str | None: - return self._name - @property - @override - def storage_object(self) -> StorageMetadata: - return self._storage_object + def name(self) -> str | None: + return self._client.metadata.name - @storage_object.setter @override - def storage_object(self, storage_object: StorageMetadata) -> None: - self._storage_object = storage_object - - async def get_info(self) -> KeyValueStoreMetadata | None: - """Get an object containing general information about the key value store.""" - return await self._resource_client.get() + @property + def metadata(self) -> KeyValueStoreMetadata: + return self._client.metadata @override @classmethod @@ -133,26 +118,54 @@ async def open( configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> KeyValueStore: - from crawlee.storages._creation_management import open_storage + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Check for default instance if no id or name provided + if id is None and name is None and cls._default_instance is not None: + return cls._default_instance - configuration = configuration or service_locator.get_configuration() - storage_client = storage_client or service_locator.get_storage_client() + # Check if the key-value store is already cached + if id is not None and id in cls._cache_by_id: + return cls._cache_by_id[id] + if name is not None and name in cls._cache_by_name: + return cls._cache_by_name[name] - return await open_storage( - storage_class=cls, + configuration = service_locator.get_configuration() if configuration is None else configuration + storage_client = service_locator.get_storage_client() if storage_client is None else storage_client + + client = await storage_client.open_key_value_store_client( id=id, name=name, configuration=configuration, - storage_client=storage_client, ) + kvs = cls(client) + + # Cache the key-value store instance by ID and name + cls._cache_by_id[kvs.id] = kvs + if kvs.name is not None: + cls._cache_by_name[kvs.name] = kvs + + # Store as default instance if neither id nor name was provided + if id is None and name is None: + cls._default_instance = kvs + + return kvs + @override async def drop(self) -> None: - from crawlee.storages._creation_management import remove_storage_from_cache + if self.id in self._cache_by_id: + del self._cache_by_id[self.id] + if self.name is not None and self.name in self._cache_by_name: + del self._cache_by_name[self.name] - remove_storage_from_cache(storage_class=self.__class__, id=self._id, name=self._name) - await self._clear_cache() - await self._resource_client.delete() + await self._clear_cache() # Clear cache with persistent values. + await self._client.drop() + + @override + async def purge(self) -> None: + await self._client.purge() @overload async def get_value(self, key: str) -> Any: ... @@ -173,44 +186,75 @@ async def get_value(self, key: str, default_value: T | None = None) -> T | None: Returns: The value associated with the given key. `default_value` is used in case the record does not exist. 
""" - record = await self._resource_client.get_record(key) + record = await self._client.get_value(key=key) return record.value if record else default_value - async def iterate_keys(self, exclusive_start_key: str | None = None) -> AsyncIterator[KeyValueStoreKeyInfo]: + async def set_value( + self, + key: str, + value: Any, + content_type: str | None = None, + ) -> None: + """Set a value in the KVS. + + Args: + key: Key of the record to set. + value: Value to set. + content_type: The MIME content type string. + """ + await self._client.set_value(key=key, value=value, content_type=content_type) + + async def delete_value(self, key: str) -> None: + """Delete a value from the KVS. + + Args: + key: Key of the record to delete. + """ + await self._client.delete_value(key=key) + + async def iterate_keys( + self, + exclusive_start_key: str | None = None, + limit: int | None = None, + ) -> AsyncIterator[KeyValueStoreRecordMetadata]: """Iterate over the existing keys in the KVS. Args: exclusive_start_key: Key to start the iteration from. + limit: Maximum number of keys to return. None means no limit. Yields: Information about the key. """ - while True: - list_keys = await self._resource_client.list_keys(exclusive_start_key=exclusive_start_key) - for item in list_keys.items: - yield KeyValueStoreKeyInfo(key=item.key, size=item.size) + async for item in self._client.iterate_keys( + exclusive_start_key=exclusive_start_key, + limit=limit, + ): + yield item - if not list_keys.is_truncated: - break - exclusive_start_key = list_keys.next_exclusive_start_key - - async def set_value( + async def list_keys( self, - key: str, - value: Any, - content_type: str | None = None, - ) -> None: - """Set a value in the KVS. + exclusive_start_key: str | None = None, + limit: int = 1000, + ) -> list[KeyValueStoreRecordMetadata]: + """List all the existing keys in the KVS. + + It uses client's `iterate_keys` method to get the keys. Args: - key: Key of the record to set. - value: Value to set. If `None`, the record is deleted. - content_type: Content type of the record. - """ - if value is None: - return await self._resource_client.delete_record(key) + exclusive_start_key: Key to start the iteration from. + limit: Maximum number of keys to return. - return await self._resource_client.set_record(key, value, content_type) + Returns: + A list of keys in the KVS. + """ + return [ + key + async for key in self._client.iterate_keys( + exclusive_start_key=exclusive_start_key, + limit=limit, + ) + ] async def get_public_url(self, key: str) -> str: """Get the public URL for the given key. @@ -221,7 +265,7 @@ async def get_public_url(self, key: str) -> str: Returns: The public URL for the given key. 
""" - return await self._resource_client.get_public_url(key) + return await self._client.get_public_url(key=key) async def get_auto_saved_value( self, @@ -242,7 +286,7 @@ async def get_auto_saved_value( default_value = {} if default_value is None else default_value async with self._autosave_lock: - cache = self._autosaved_values.setdefault(self._id, {}) + cache = self._autosaved_values.setdefault(self.id, {}) if key in cache: return cache[key].current_value.root @@ -250,7 +294,7 @@ async def get_auto_saved_value( cache[key] = recoverable_state = RecoverableState( default_state=AutosavedValue(default_value), persistence_enabled=True, - persist_state_kvs_id=self._id, + persist_state_kvs_id=self.id, persist_state_key=key, logger=logger, ) @@ -259,17 +303,17 @@ async def get_auto_saved_value( return recoverable_state.current_value.root - async def _clear_cache(self) -> None: - """Clear cache with autosaved values.""" + async def persist_autosaved_values(self) -> None: + """Force autosaved values to be saved without waiting for an event in Event Manager.""" if self.id in self._autosaved_values: cache = self._autosaved_values[self.id] for value in cache.values(): - await value.teardown() - cache.clear() + await value.persist_state() - async def persist_autosaved_values(self) -> None: - """Force autosaved values to be saved without waiting for an event in Event Manager.""" + async def _clear_cache(self) -> None: + """Clear cache with autosaved values.""" if self.id in self._autosaved_values: cache = self._autosaved_values[self.id] for value in cache.values(): - await value.persist_state() + await value.teardown() + cache.clear() diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index b3274ccc81..c9fa3a1bff 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -1,23 +1,16 @@ from __future__ import annotations import asyncio -from collections import deque -from contextlib import suppress -from datetime import datetime, timedelta, timezone +from datetime import timedelta from logging import getLogger -from typing import TYPE_CHECKING, Any, TypedDict, TypeVar +from typing import TYPE_CHECKING, ClassVar, TypeVar -from cachetools import LRUCache from typing_extensions import override -from crawlee import service_locator -from crawlee._utils.crypto import crypto_random_object_id +from crawlee import Request, service_locator from crawlee._utils.docs import docs_group -from crawlee._utils.requests import unique_key_to_request_id from crawlee._utils.wait import wait_for_all_tasks_for_finish -from crawlee.events import Event from crawlee.request_loaders import RequestManager -from crawlee.storage_clients.models import ProcessedRequest, RequestQueueMetadata, StorageMetadata from ._base import Storage @@ -27,131 +20,102 @@ from crawlee import Request from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient + from crawlee.storage_clients._base import RequestQueueClient + from crawlee.storage_clients.models import ProcessedRequest, RequestQueueMetadata logger = getLogger(__name__) T = TypeVar('T') -class CachedRequest(TypedDict): - id: str - was_already_handled: bool - hydrated: Request | None - lock_expires_at: datetime | None - forefront: bool - - @docs_group('Classes') class RequestQueue(Storage, RequestManager): - """Represents a queue storage for managing HTTP requests in web crawling operations. + """Request queue is a storage for managing HTTP requests. 
- The `RequestQueue` class handles a queue of HTTP requests, each identified by a unique URL, to facilitate structured - web crawling. It supports both breadth-first and depth-first crawling strategies, allowing for recursive crawling - starting from an initial set of URLs. Each URL in the queue is uniquely identified by a `unique_key`, which can be - customized to allow the same URL to be added multiple times under different keys. + The request queue class serves as a high-level interface for organizing and managing HTTP requests + during web crawling. It provides methods for adding, retrieving, and manipulating requests throughout + the crawling lifecycle, abstracting away the underlying storage implementation details. - Data can be stored either locally or in the cloud. It depends on the setup of underlying storage client. - By default a `MemoryStorageClient` is used, but it can be changed to a different one. + Request queue maintains the state of each URL to be crawled, tracking whether it has been processed, + is currently being handled, or is waiting in the queue. Each URL in the queue is uniquely identified + by a `unique_key` property, which prevents duplicate processing unless explicitly configured otherwise. - By default, data is stored using the following path structure: - ``` - {CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json - ``` - - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable. - - `{QUEUE_ID}`: The identifier for the request queue, either "default" or as specified. - - `{REQUEST_ID}`: The unique identifier for each request in the queue. + The class supports both breadth-first and depth-first crawling strategies through its `forefront` parameter + when adding requests. It also provides mechanisms for error handling and request reclamation when + processing fails. - The `RequestQueue` supports both creating new queues and opening existing ones by `id` or `name`. Named queues - persist indefinitely, while unnamed queues expire after 7 days unless specified otherwise. The queue supports - mutable operations, allowing URLs to be added and removed as needed. + You can open a request queue using the `open` class method, specifying either a name or ID to identify + the queue. The underlying storage implementation is determined by the configured storage client. ### Usage ```python from crawlee.storages import RequestQueue - rq = await RequestQueue.open(name='my_rq') + # Open a request queue + rq = await RequestQueue.open(name='my_queue') + + # Add a request + await rq.add_request('https://example.com') + + # Process requests + request = await rq.fetch_next_request() + if request: + try: + # Process the request + # ... 
+ await rq.mark_request_as_handled(request) + except Exception: + await rq.reclaim_request(request) ``` """ - _MAX_CACHED_REQUESTS = 1_000_000 - """Maximum number of requests that can be cached.""" + _cache_by_id: ClassVar[dict[str, RequestQueue]] = {} + """A dictionary to cache request queues by ID.""" - def __init__( - self, - id: str, - name: str | None, - storage_client: StorageClient, - ) -> None: - config = service_locator.get_configuration() - event_manager = service_locator.get_event_manager() + _cache_by_name: ClassVar[dict[str, RequestQueue]] = {} + """A dictionary to cache request queues by name.""" - self._id = id - self._name = name + _default_instance: ClassVar[RequestQueue | None] = None + """Cache for the default request queue instance.""" - datetime_now = datetime.now(timezone.utc) - self._storage_object = StorageMetadata( - id=id, name=name, accessed_at=datetime_now, created_at=datetime_now, modified_at=datetime_now - ) + def __init__(self, client: RequestQueueClient) -> None: + """Initialize a new instance. - # Get resource clients from storage client - self._resource_client = storage_client.request_queue(self._id) - self._resource_collection_client = storage_client.request_queues() - - self._request_lock_time = timedelta(minutes=3) - self._queue_paused_for_migration = False - self._queue_has_locked_requests: bool | None = None - self._should_check_for_forefront_requests = False - - self._is_finished_log_throttle_counter = 0 - self._dequeued_request_count = 0 - - event_manager.on(event=Event.MIGRATING, listener=lambda _: setattr(self, '_queue_paused_for_migration', True)) - event_manager.on(event=Event.MIGRATING, listener=self._clear_possible_locks) - event_manager.on(event=Event.ABORTING, listener=self._clear_possible_locks) - - # Other internal attributes - self._tasks = list[asyncio.Task]() - self._client_key = crypto_random_object_id() - self._internal_timeout = config.internal_timeout or timedelta(minutes=5) - self._assumed_total_count = 0 - self._assumed_handled_count = 0 - self._queue_head = deque[str]() - self._list_head_and_lock_task: asyncio.Task | None = None - self._last_activity = datetime.now(timezone.utc) - self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + Preferably use the `RequestQueue.open` constructor to create a new instance. - @classmethod - def from_storage_object(cls, storage_client: StorageClient, storage_object: StorageMetadata) -> RequestQueue: - """Initialize a new instance of RequestQueue from a storage metadata object.""" - request_queue = RequestQueue( - id=storage_object.id, - name=storage_object.name, - storage_client=storage_client, - ) + Args: + client: An instance of a request queue client. 
+ """ + self._client = client - request_queue.storage_object = storage_object - return request_queue + self._add_requests_tasks = list[asyncio.Task]() + """A list of tasks for adding requests to the queue.""" - @property @override + @property def id(self) -> str: - return self._id + return self._client.metadata.id - @property @override + @property def name(self) -> str | None: - return self._name + return self._client.metadata.name + @override @property + def metadata(self) -> RequestQueueMetadata: + return self._client.metadata + @override - def storage_object(self) -> StorageMetadata: - return self._storage_object + @property + async def handled_count(self) -> int: + return self._client.metadata.handled_request_count - @storage_object.setter @override - def storage_object(self, storage_object: StorageMetadata) -> None: - self._storage_object = storage_object + @property + async def total_count(self) -> int: + return self._client.metadata.total_request_count @override @classmethod @@ -163,29 +127,55 @@ async def open( configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> RequestQueue: - from crawlee.storages._creation_management import open_storage + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') - configuration = configuration or service_locator.get_configuration() - storage_client = storage_client or service_locator.get_storage_client() + # Check for default instance if no id or name provided + if id is None and name is None and cls._default_instance is not None: + return cls._default_instance - return await open_storage( - storage_class=cls, + # Check if the request queue is already cached + if id is not None and id in cls._cache_by_id: + return cls._cache_by_id[id] + if name is not None and name in cls._cache_by_name: + return cls._cache_by_name[name] + + configuration = service_locator.get_configuration() if configuration is None else configuration + storage_client = service_locator.get_storage_client() if storage_client is None else storage_client + + client = await storage_client.open_request_queue_client( id=id, name=name, configuration=configuration, - storage_client=storage_client, ) + request_queue = cls(client) + + # Cache the request queue instance by ID and name + cls._cache_by_id[request_queue.id] = request_queue + if request_queue.name is not None: + cls._cache_by_name[request_queue.name] = request_queue + + # Store as default instance if neither id nor name was provided + if id is None and name is None: + cls._default_instance = request_queue + + return request_queue + @override - async def drop(self, *, timeout: timedelta | None = None) -> None: - from crawlee.storages._creation_management import remove_storage_from_cache + async def drop(self) -> None: + # Remove from cache before dropping + if self.id in self._cache_by_id: + del self._cache_by_id[self.id] + + if self.name is not None and self.name in self._cache_by_name: + del self._cache_by_name[self.name] - # Wait for all tasks to finish - await wait_for_all_tasks_for_finish(self._tasks, logger=logger, timeout=timeout) + await self._client.drop() - # Delete the storage from the underlying client and remove it from the cache - await self._resource_client.delete() - remove_storage_from_cache(storage_class=self.__class__, id=self._id, name=self._name) + @override + async def purge(self) -> None: + await self._client.purge() @override async def add_request( @@ -195,40 +185,15 @@ async def add_request( forefront: bool = False, ) -> 
ProcessedRequest: request = self._transform_request(request) - self._last_activity = datetime.now(timezone.utc) - - cache_key = unique_key_to_request_id(request.unique_key) - cached_info = self._requests_cache.get(cache_key) - - if cached_info: - request.id = cached_info['id'] - # We may assume that if request is in local cache then also the information if the request was already - # handled is there because just one client should be using one queue. - return ProcessedRequest( - id=request.id, - unique_key=request.unique_key, - was_already_present=True, - was_already_handled=cached_info['was_already_handled'], - ) - - processed_request = await self._resource_client.add_request(request, forefront=forefront) - processed_request.unique_key = request.unique_key - - self._cache_request(cache_key, processed_request, forefront=forefront) - - if not processed_request.was_already_present and forefront: - self._should_check_for_forefront_requests = True - - if request.handled_at is None and not processed_request.was_already_present: - self._assumed_total_count += 1 - - return processed_request + response = await self._client.add_batch_of_requests([request], forefront=forefront) + return response.processed_requests[0] @override - async def add_requests_batched( + async def add_requests( self, requests: Sequence[str | Request], *, + forefront: bool = False, batch_size: int = 1000, wait_time_between_batches: timedelta = timedelta(seconds=1), wait_for_all_requests_to_be_added: bool = False, @@ -240,21 +205,31 @@ async def add_requests_batched( # Wait for the first batch to be added first_batch = transformed_requests[:batch_size] if first_batch: - await self._process_batch(first_batch, base_retry_wait=wait_time_between_batches) + await self._process_batch( + first_batch, + base_retry_wait=wait_time_between_batches, + forefront=forefront, + ) async def _process_remaining_batches() -> None: for i in range(batch_size, len(transformed_requests), batch_size): batch = transformed_requests[i : i + batch_size] - await self._process_batch(batch, base_retry_wait=wait_time_between_batches) + await self._process_batch( + batch, + base_retry_wait=wait_time_between_batches, + forefront=forefront, + ) if i + batch_size < len(transformed_requests): await asyncio.sleep(wait_time_secs) # Create and start the task to process remaining batches in the background remaining_batches_task = asyncio.create_task( - _process_remaining_batches(), name='request_queue_process_remaining_batches_task' + _process_remaining_batches(), + name='request_queue_process_remaining_batches_task', ) - self._tasks.append(remaining_batches_task) - remaining_batches_task.add_done_callback(lambda _: self._tasks.remove(remaining_batches_task)) + + self._add_requests_tasks.append(remaining_batches_task) + remaining_batches_task.add_done_callback(lambda _: self._add_requests_tasks.remove(remaining_batches_task)) # Wait for all tasks to finish if requested if wait_for_all_requests_to_be_added: @@ -264,42 +239,6 @@ async def _process_remaining_batches() -> None: timeout=wait_for_all_requests_to_be_added_timeout, ) - async def _process_batch(self, batch: Sequence[Request], base_retry_wait: timedelta, attempt: int = 1) -> None: - max_attempts = 5 - response = await self._resource_client.batch_add_requests(batch) - - if response.unprocessed_requests: - logger.debug(f'Following requests were not processed: {response.unprocessed_requests}.') - if attempt > max_attempts: - logger.warning( - f'Following requests were not processed even after {max_attempts} 
attempts:\n' - f'{response.unprocessed_requests}' - ) - else: - logger.debug('Retry to add requests.') - unprocessed_requests_unique_keys = {request.unique_key for request in response.unprocessed_requests} - retry_batch = [request for request in batch if request.unique_key in unprocessed_requests_unique_keys] - await asyncio.sleep((base_retry_wait * attempt).total_seconds()) - await self._process_batch(retry_batch, base_retry_wait=base_retry_wait, attempt=attempt + 1) - - request_count = len(batch) - len(response.unprocessed_requests) - self._assumed_total_count += request_count - if request_count: - logger.debug( - f'Added {request_count} requests to the queue. Processed requests: {response.processed_requests}' - ) - - async def get_request(self, request_id: str) -> Request | None: - """Retrieve a request from the queue. - - Args: - request_id: ID of the request to retrieve. - - Returns: - The retrieved request, or `None`, if it does not exist. - """ - return await self._resource_client.get_request(request_id) - async def fetch_next_request(self) -> Request | None: """Return the next request in the queue to be processed. @@ -313,75 +252,35 @@ async def fetch_next_request(self) -> Request | None: instead. Returns: - The request or `None` if there are no more pending requests. + The next request to process, or `None` if there are no more pending requests. """ - self._last_activity = datetime.now(timezone.utc) - - await self._ensure_head_is_non_empty() - - # We are likely done at this point. - if len(self._queue_head) == 0: - return None + return await self._client.fetch_next_request() - next_request_id = self._queue_head.popleft() - request = await self._get_or_hydrate_request(next_request_id) - - # NOTE: It can happen that the queue head index is inconsistent with the main queue table. - # This can occur in two situations: + async def get_request(self, request_id: str) -> Request | None: + """Retrieve a specific request from the queue by its ID. - # 1) - # Queue head index is ahead of the main table and the request is not present in the main table yet - # (i.e. get_request() returned null). In this case, keep the request marked as in progress for a short while, - # so that is_finished() doesn't return true and _ensure_head_is_non_empty() doesn't not load the request into - # the queueHeadDict straight again. After the interval expires, fetch_next_request() will try to fetch this - # request again, until it eventually appears in the main table. - if request is None: - logger.debug( - 'Cannot find a request from the beginning of queue, will be retried later', - extra={'nextRequestId': next_request_id}, - ) - return None - - # 2) - # Queue head index is behind the main table and the underlying request was already handled (by some other - # client, since we keep the track of handled requests in recently_handled dictionary). We just add the request - # to the recently_handled dictionary so that next call to _ensure_head_is_non_empty() will not put the request - # again to queue_head_dict. - if request.handled_at is not None: - logger.debug( - 'Request fetched from the beginning of queue was already handled', - extra={'nextRequestId': next_request_id}, - ) - return None + Args: + request_id: The ID of the request to retrieve. - self._dequeued_request_count += 1 - return request + Returns: + The request with the specified ID, or `None` if no such request exists. 
+ """ + return await self._client.get_request(request_id) async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: """Mark a request as handled after successful processing. - Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method. + This method should be called after a request has been successfully processed. + Once marked as handled, the request will be removed from the queue and will + not be returned in subsequent calls to `fetch_next_request` method. Args: request: The request to mark as handled. Returns: - Information about the queue operation. `None` if the given request was not in progress. + Information about the queue operation. """ - self._last_activity = datetime.now(timezone.utc) - - if request.handled_at is None: - request.handled_at = datetime.now(timezone.utc) - - processed_request = await self._resource_client.update_request(request) - processed_request.unique_key = request.unique_key - self._dequeued_request_count -= 1 - - if not processed_request.was_already_handled: - self._assumed_handled_count += 1 - - self._cache_request(unique_key_to_request_id(request.unique_key), processed_request, forefront=False) - return processed_request + return await self._client.mark_request_as_handled(request) async def reclaim_request( self, @@ -389,325 +288,83 @@ async def reclaim_request( *, forefront: bool = False, ) -> ProcessedRequest | None: - """Reclaim a failed request back to the queue. + """Reclaim a failed request back to the queue for later processing. - The request will be returned for processing later again by another call to `RequestQueue.fetch_next_request`. + If a request fails during processing, this method can be used to return it to the queue. + The request will be returned for processing again in a subsequent call + to `RequestQueue.fetch_next_request`. Args: request: The request to return to the queue. - forefront: Whether to add the request to the head or the end of the queue. + forefront: If true, the request will be added to the beginning of the queue. + Otherwise, it will be added to the end. Returns: - Information about the queue operation. `None` if the given request was not in progress. + Information about the queue operation. """ - self._last_activity = datetime.now(timezone.utc) - - processed_request = await self._resource_client.update_request(request, forefront=forefront) - processed_request.unique_key = request.unique_key - self._cache_request(unique_key_to_request_id(request.unique_key), processed_request, forefront=forefront) - - if forefront: - self._should_check_for_forefront_requests = True - - if processed_request: - # Try to delete the request lock if possible - try: - await self._resource_client.delete_request_lock(request.id, forefront=forefront) - except Exception as err: - logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err) - - return processed_request + return await self._client.reclaim_request(request, forefront=forefront) async def is_empty(self) -> bool: - """Check whether the queue is empty. + """Check if the request queue is empty. + + An empty queue means that there are no requests currently in the queue, either pending or being processed. + However, this does not necessarily mean that the crawling operation is finished, as there still might be + tasks that could add additional requests to the queue. Returns: - bool: `True` if the next call to `RequestQueue.fetch_next_request` would return `None`, otherwise `False`. 
+ True if the request queue is empty, False otherwise. """ - await self._ensure_head_is_non_empty() - return len(self._queue_head) == 0 + return await self._client.is_empty() async def is_finished(self) -> bool: - """Check whether the queue is finished. + """Check if the request queue is finished. - Due to the nature of distributed storage used by the queue, the function might occasionally return a false - negative, but it will never return a false positive. + A finished queue means that all requests in the queue have been processed (the queue is empty) and there + are no more tasks that could add additional requests to the queue. This is the definitive way to check + if a crawling operation is complete. Returns: - bool: `True` if all requests were already handled and there are no more left. `False` otherwise. + True if the request queue is finished (empty and no pending add operations), False otherwise. """ - if self._tasks: - logger.debug('Background tasks are still in progress') - return False - - if self._queue_head: - logger.debug( - 'There are still ids in the queue head that are pending processing', - extra={ - 'queue_head_ids_pending': len(self._queue_head), - }, - ) - - return False - - await self._ensure_head_is_non_empty() - - if self._queue_head: - logger.debug('Queue head still returned requests that need to be processed') - + if self._add_requests_tasks: + logger.debug('Background add requests tasks are still in progress.') return False - # Could not lock any new requests - decide based on whether the queue contains requests locked by another client - if self._queue_has_locked_requests is not None: - if self._queue_has_locked_requests and self._dequeued_request_count == 0: - # The `% 25` was absolutely arbitrarily picked. It's just to not spam the logs too much. - if self._is_finished_log_throttle_counter % 25 == 0: - logger.info('The queue still contains requests locked by another client') - - self._is_finished_log_throttle_counter += 1 - - logger.debug( - f'Deciding if we are finished based on `queue_has_locked_requests` = {self._queue_has_locked_requests}' - ) - return not self._queue_has_locked_requests - - metadata = await self._resource_client.get() - if metadata is not None and not metadata.had_multiple_clients and not self._queue_head: - logger.debug('Queue head is empty and there are no other clients - we are finished') - + if await self.is_empty(): + logger.debug('The request queue is empty.') return True - # The following is a legacy algorithm for checking if the queue is finished. - # It is used only for request queue clients that do not provide the `queue_has_locked_requests` flag. 
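A sketch of the processing loop implied by the `is_empty`/`is_finished` semantics described above (the handler body is left as a placeholder):

```python
import asyncio

from crawlee.storages import RequestQueue


async def drain(rq: RequestQueue) -> None:
    # is_finished() is the definitive completion check; is_empty() alone may
    # return True while background add-request tasks are still running.
    while not await rq.is_finished():
        request = await rq.fetch_next_request()
        if request is None:
            await asyncio.sleep(1)  # The queue may be momentarily empty.
            continue
        try:
            ...  # Process the request here.
            await rq.mark_request_as_handled(request)
        except Exception:
            await rq.reclaim_request(request)
```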
- current_head = await self._resource_client.list_head(limit=2) - - if current_head.items: - logger.debug('The queue still contains unfinished requests or requests locked by another client') + return False - return len(current_head.items) == 0 - - async def get_info(self) -> RequestQueueMetadata | None: - """Get an object containing general information about the request queue.""" - return await self._resource_client.get() - - @override - async def get_handled_count(self) -> int: - return self._assumed_handled_count - - @override - async def get_total_count(self) -> int: - return self._assumed_total_count - - async def _ensure_head_is_non_empty(self) -> None: - # Stop fetching if we are paused for migration - if self._queue_paused_for_migration: - return - - # We want to fetch ahead of time to minimize dead time - if len(self._queue_head) > 1 and not self._should_check_for_forefront_requests: - return - - if self._list_head_and_lock_task is None: - task = asyncio.create_task(self._list_head_and_lock(), name='request_queue_list_head_and_lock_task') - - def callback(_: Any) -> None: - self._list_head_and_lock_task = None - - task.add_done_callback(callback) - self._list_head_and_lock_task = task - - await self._list_head_and_lock_task - - async def _list_head_and_lock(self) -> None: - # Make a copy so that we can clear the flag only if the whole method executes after the flag was set - # (i.e, it was not set in the middle of the execution of the method) - should_check_for_forefront_requests = self._should_check_for_forefront_requests - - limit = 25 - - response = await self._resource_client.list_and_lock_head( - limit=limit, lock_secs=int(self._request_lock_time.total_seconds()) - ) - - self._queue_has_locked_requests = response.queue_has_locked_requests - - head_id_buffer = list[str]() - forefront_head_id_buffer = list[str]() + async def _process_batch( + self, + batch: Sequence[Request], + *, + base_retry_wait: timedelta, + attempt: int = 1, + forefront: bool = False, + ) -> None: + """Process a batch of requests with automatic retry mechanism.""" + max_attempts = 5 + response = await self._client.add_batch_of_requests(batch, forefront=forefront) - for request in response.items: - # Queue head index might be behind the main table, so ensure we don't recycle requests - if not request.id or not request.unique_key: - logger.debug( - 'Skipping request from queue head, already in progress or recently handled', - extra={ - 'id': request.id, - 'unique_key': request.unique_key, - }, + if response.unprocessed_requests: + logger.debug(f'Following requests were not processed: {response.unprocessed_requests}.') + if attempt > max_attempts: + logger.warning( + f'Following requests were not processed even after {max_attempts} attempts:\n' + f'{response.unprocessed_requests}' ) - - # Remove the lock from the request for now, so that it can be picked up later - # This may/may not succeed, but that's fine - with suppress(Exception): - await self._resource_client.delete_request_lock(request.id) - - continue - - # If we remember that we added the request ourselves and we added it to the forefront, - # we will put it to the beginning of the local queue head to preserve the expected order. - # If we do not remember that, we will enqueue it normally. 
- cached_request = self._requests_cache.get(unique_key_to_request_id(request.unique_key)) - forefront = cached_request['forefront'] if cached_request else False - - if forefront: - forefront_head_id_buffer.insert(0, request.id) else: - head_id_buffer.append(request.id) - - self._cache_request( - unique_key_to_request_id(request.unique_key), - ProcessedRequest( - id=request.id, - unique_key=request.unique_key, - was_already_present=True, - was_already_handled=False, - ), - forefront=forefront, - ) - - for request_id in head_id_buffer: - self._queue_head.append(request_id) - - for request_id in forefront_head_id_buffer: - self._queue_head.appendleft(request_id) - - # If the queue head became too big, unlock the excess requests - to_unlock = list[str]() - while len(self._queue_head) > limit: - to_unlock.append(self._queue_head.pop()) - - if to_unlock: - await asyncio.gather( - *[self._resource_client.delete_request_lock(request_id) for request_id in to_unlock], - return_exceptions=True, # Just ignore the exceptions - ) - - # Unset the should_check_for_forefront_requests flag - the check is finished - if should_check_for_forefront_requests: - self._should_check_for_forefront_requests = False - - def _reset(self) -> None: - self._queue_head.clear() - self._list_head_and_lock_task = None - self._assumed_total_count = 0 - self._assumed_handled_count = 0 - self._requests_cache.clear() - self._last_activity = datetime.now(timezone.utc) - - def _cache_request(self, cache_key: str, processed_request: ProcessedRequest, *, forefront: bool) -> None: - self._requests_cache[cache_key] = { - 'id': processed_request.id, - 'was_already_handled': processed_request.was_already_handled, - 'hydrated': None, - 'lock_expires_at': None, - 'forefront': forefront, - } - - async def _get_or_hydrate_request(self, request_id: str) -> Request | None: - cached_entry = self._requests_cache.get(request_id) - - if not cached_entry: - # 2.1. Attempt to prolong the request lock to see if we still own the request - prolong_result = await self._prolong_request_lock(request_id) - - if not prolong_result: - return None - - # 2.1.1. If successful, hydrate the request and return it - hydrated_request = await self.get_request(request_id) - - # Queue head index is ahead of the main table and the request is not present in the main table yet - # (i.e. get_request() returned null). - if not hydrated_request: - # Remove the lock from the request for now, so that it can be picked up later - # This may/may not succeed, but that's fine - with suppress(Exception): - await self._resource_client.delete_request_lock(request_id) - - return None - - self._requests_cache[request_id] = { - 'id': request_id, - 'hydrated': hydrated_request, - 'was_already_handled': hydrated_request.handled_at is not None, - 'lock_expires_at': prolong_result, - 'forefront': False, - } - - return hydrated_request - - # 1.1. If hydrated, prolong the lock more and return it - if cached_entry['hydrated']: - # 1.1.1. If the lock expired on the hydrated requests, try to prolong. If we fail, we lost the request - # (or it was handled already) - if cached_entry['lock_expires_at'] and cached_entry['lock_expires_at'] < datetime.now(timezone.utc): - prolonged = await self._prolong_request_lock(cached_entry['id']) - - if not prolonged: - return None - - cached_entry['lock_expires_at'] = prolonged - - return cached_entry['hydrated'] - - # 1.2. 
If not hydrated, try to prolong the lock first (to ensure we keep it in our queue), hydrate and return it - prolonged = await self._prolong_request_lock(cached_entry['id']) - - if not prolonged: - return None - - # This might still return null if the queue head is inconsistent with the main queue table. - hydrated_request = await self.get_request(cached_entry['id']) - - cached_entry['hydrated'] = hydrated_request - - # Queue head index is ahead of the main table and the request is not present in the main table yet - # (i.e. get_request() returned null). - if not hydrated_request: - # Remove the lock from the request for now, so that it can be picked up later - # This may/may not succeed, but that's fine - with suppress(Exception): - await self._resource_client.delete_request_lock(cached_entry['id']) - - return None + logger.debug('Retry to add requests.') + unprocessed_requests_unique_keys = {request.unique_key for request in response.unprocessed_requests} + retry_batch = [request for request in batch if request.unique_key in unprocessed_requests_unique_keys] + await asyncio.sleep((base_retry_wait * attempt).total_seconds()) + await self._process_batch(retry_batch, base_retry_wait=base_retry_wait, attempt=attempt + 1) - return hydrated_request + request_count = len(batch) - len(response.unprocessed_requests) - async def _prolong_request_lock(self, request_id: str) -> datetime | None: - try: - res = await self._resource_client.prolong_request_lock( - request_id, lock_secs=int(self._request_lock_time.total_seconds()) - ) - except Exception as err: - # Most likely we do not own the lock anymore - logger.warning( - f'Failed to prolong lock for cached request {request_id}, either lost the lock ' - 'or the request was already handled\n', - exc_info=err, + if request_count: + logger.debug( + f'Added {request_count} requests to the queue. Processed requests: {response.processed_requests}' ) - return None - else: - return res.lock_expires_at - - async def _clear_possible_locks(self) -> None: - self._queue_paused_for_migration = True - request_id: str | None = None - - while True: - try: - request_id = self._queue_head.pop() - except LookupError: - break - - with suppress(Exception): - await self._resource_client.delete_request_lock(request_id) - # If this fails, we don't have the lock, or the request was never locked. 
Either way it's fine diff --git a/src/crawlee/storages/_types.py b/src/crawlee/storages/_types.py new file mode 100644 index 0000000000..e8c1b135e0 --- /dev/null +++ b/src/crawlee/storages/_types.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal, TypedDict + +if TYPE_CHECKING: + import json + from collections.abc import Callable + from datetime import datetime + + from typing_extensions import NotRequired, Required + + from crawlee import Request + from crawlee.configuration import Configuration + from crawlee.storage_clients import StorageClient + + +class CachedRequest(TypedDict): + """Represent a cached request in the `RequestQueue`.""" + + id: str + """The ID of the request.""" + + was_already_handled: bool + """Indicates whether the request was already handled.""" + + hydrated: Request | None + """The hydrated request object.""" + + lock_expires_at: datetime | None + """The time at which the lock on the request expires.""" + + forefront: bool + """Indicates whether the request is at the forefront of the queue.""" + + +class IterateKwargs(TypedDict): + """Keyword arguments for dataset's `iterate` method.""" + + offset: NotRequired[int] + """Skips the specified number of items at the start.""" + + limit: NotRequired[int | None] + """The maximum number of items to retrieve. Unlimited if None.""" + + clean: NotRequired[bool] + """Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.""" + + desc: NotRequired[bool] + """Set to True to sort results in descending order.""" + + fields: NotRequired[list[str]] + """Fields to include in each item. Sorts fields as specified if provided.""" + + omit: NotRequired[list[str]] + """Fields to exclude from each item.""" + + unwind: NotRequired[str] + """Unwinds items by a specified array field, turning each element into a separate item.""" + + skip_empty: NotRequired[bool] + """Excludes empty items from the results if True.""" + + skip_hidden: NotRequired[bool] + """Excludes fields starting with '#' if True.""" + + +class GetDataKwargs(IterateKwargs): + """Keyword arguments for dataset's `get_data` method.""" + + flatten: NotRequired[list[str]] + """Fields to be flattened in returned items.""" + + view: NotRequired[str] + """Specifies the dataset view to be used.""" + + +class ExportToKwargs(TypedDict): + """Keyword arguments for dataset's `export_to` method.""" + + key: Required[str] + """The key under which to save the data.""" + + content_type: NotRequired[Literal['json', 'csv']] + """The format in which to export the data. 
Either 'json' or 'csv'.""" + + to_kvs_id: NotRequired[str] + """ID of the key-value store to save the exported file.""" + + to_kvs_name: NotRequired[str] + """Name of the key-value store to save the exported file.""" + + to_kvs_storage_client: NotRequired[StorageClient] + """The storage client to use for saving the exported file.""" + + to_kvs_configuration: NotRequired[Configuration] + """The configuration to use for saving the exported file.""" + + +class ExportDataJsonKwargs(TypedDict): + """Keyword arguments for dataset's `export_data_json` method.""" + + skipkeys: NotRequired[bool] + """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped + instead of raising a `TypeError`.""" + + ensure_ascii: NotRequired[bool] + """Determines if non-ASCII characters should be escaped in the output JSON string.""" + + check_circular: NotRequired[bool] + """If False (default: True), skips the circular reference check for container types. A circular reference will + result in a `RecursionError` or worse if unchecked.""" + + allow_nan: NotRequired[bool] + """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply + with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity).""" + + cls: NotRequired[type[json.JSONEncoder]] + """Allows specifying a custom JSON encoder.""" + + indent: NotRequired[int] + """Specifies the number of spaces for indentation in the pretty-printed JSON output.""" + + separators: NotRequired[tuple[str, str]] + """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ') + otherwise.""" + + default: NotRequired[Callable] + """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version + of the object or raise a `TypeError`.""" + + sort_keys: NotRequired[bool] + """Specifies whether the output JSON object should have keys sorted alphabetically.""" + + +class ExportDataCsvKwargs(TypedDict): + """Keyword arguments for dataset's `export_data_csv` method.""" + + dialect: NotRequired[str] + """Specifies a dialect to be used in CSV parsing and writing.""" + + delimiter: NotRequired[str] + """A one-character string used to separate fields. Defaults to ','.""" + + doublequote: NotRequired[bool] + """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled; + when False, the `escapechar` is used as a prefix. Defaults to True.""" + + escapechar: NotRequired[str] + """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar` + if `doublequote` is False. Defaults to None, disabling escaping.""" + + lineterminator: NotRequired[str] + """The string used to terminate lines produced by the writer. Defaults to '\\r\\n'.""" + + quotechar: NotRequired[str] + """A one-character string used to quote fields containing special characters, like the delimiter or quotechar, + or fields containing new-line characters. Defaults to '\"'.""" + + quoting: NotRequired[int] + """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of + the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`.""" + + skipinitialspace: NotRequired[bool] + """When True, spaces immediately following the delimiter are ignored. Defaults to False.""" + + strict: NotRequired[bool] + """When True, raises an exception on bad CSV input. 
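The `ExportDataJsonKwargs` fields above are forwarded by `Dataset.export_to` to the JSON serializer; a sketch that pretty-prints the exported file (dataset and store names are illustrative):

```python
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    dataset = await Dataset.open(name='results')

    # indent and sort_keys come from ExportDataJsonKwargs.
    await dataset.export_to(
        key='results.json',
        content_type='json',
        to_kvs_name='exports',
        indent=2,
        sort_keys=True,
    )


asyncio.run(main())
```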
Defaults to False.""" diff --git a/tests/e2e/project_template/utils.py b/tests/e2e/project_template/utils.py index 3bc5be4ea6..685e8c45e8 100644 --- a/tests/e2e/project_template/utils.py +++ b/tests/e2e/project_template/utils.py @@ -20,23 +20,25 @@ def patch_crawlee_version_in_project( def _patch_crawlee_version_in_requirements_txt_based_project(project_path: Path, wheel_path: Path) -> None: # Get any extras - with open(project_path / 'requirements.txt') as f: + requirements_path = project_path / 'requirements.txt' + with requirements_path.open() as f: requirements = f.read() crawlee_extras = re.findall(r'crawlee(\[.*\])', requirements)[0] or '' # Modify requirements.txt to use crawlee from wheel file instead of from Pypi - with open(project_path / 'requirements.txt') as f: + with requirements_path.open() as f: modified_lines = [] for line in f: if 'crawlee' in line: modified_lines.append(f'./{wheel_path.name}{crawlee_extras}\n') else: modified_lines.append(line) - with open(project_path / 'requirements.txt', 'w') as f: + with requirements_path.open('w') as f: f.write(''.join(modified_lines)) # Patch the dockerfile to have wheel file available - with open(project_path / 'Dockerfile') as f: + dockerfile_path = project_path / 'Dockerfile' + with dockerfile_path.open() as f: modified_lines = [] for line in f: modified_lines.append(line) @@ -49,19 +51,21 @@ def _patch_crawlee_version_in_requirements_txt_based_project(project_path: Path, f'RUN pip install ./{wheel_path.name}{crawlee_extras} --force-reinstall\n', ] ) - with open(project_path / 'Dockerfile', 'w') as f: + with dockerfile_path.open('w') as f: f.write(''.join(modified_lines)) def _patch_crawlee_version_in_pyproject_toml_based_project(project_path: Path, wheel_path: Path) -> None: """Ensure that the test is using current version of the crawlee from the source and not from Pypi.""" # Get any extras - with open(project_path / 'pyproject.toml') as f: + pyproject_path = project_path / 'pyproject.toml' + with pyproject_path.open() as f: pyproject = f.read() crawlee_extras = re.findall(r'crawlee(\[.*\])', pyproject)[0] or '' # Inject crawlee wheel file to the docker image and update project to depend on it.""" - with open(project_path / 'Dockerfile') as f: + dockerfile_path = project_path / 'Dockerfile' + with dockerfile_path.open() as f: modified_lines = [] for line in f: modified_lines.append(line) @@ -94,5 +98,5 @@ def _patch_crawlee_version_in_pyproject_toml_based_project(project_path: Path, w f'RUN {package_manager} lock\n', ] ) - with open(project_path / 'Dockerfile', 'w') as f: + with dockerfile_path.open('w') as f: f.write(''.join(modified_lines)) diff --git a/tests/unit/_autoscaling/test_autoscaled_pool.py b/tests/unit/_autoscaling/test_autoscaled_pool.py index 717b178738..b4e82fee76 100644 --- a/tests/unit/_autoscaling/test_autoscaled_pool.py +++ b/tests/unit/_autoscaling/test_autoscaled_pool.py @@ -328,6 +328,8 @@ async def run() -> None: assert done_count == 4 done_count = 0 + await asyncio.sleep(0.2) # Allow any lingering callbacks to complete + done_count = 0 # Reset again to ensure clean state await pool.run() assert done_count == 4 diff --git a/tests/unit/_utils/test_file.py b/tests/unit/_utils/test_file.py index a86291b43f..0762e1d966 100644 --- a/tests/unit/_utils/test_file.py +++ b/tests/unit/_utils/test_file.py @@ -1,6 +1,5 @@ from __future__ import annotations -import io from datetime import datetime, timezone from pathlib import Path @@ -12,7 +11,6 @@ force_remove, force_rename, is_content_type, - is_file_or_bytes, 
json_dumps, ) @@ -25,15 +23,6 @@ async def test_json_dumps() -> None: assert await json_dumps(datetime(2022, 1, 1, tzinfo=timezone.utc)) == '"2022-01-01 00:00:00+00:00"' -def test_is_file_or_bytes() -> None: - assert is_file_or_bytes(b'bytes') is True - assert is_file_or_bytes(bytearray(b'bytearray')) is True - assert is_file_or_bytes(io.BytesIO(b'some bytes')) is True - assert is_file_or_bytes(io.StringIO('string')) is True - assert is_file_or_bytes('just a regular string') is False - assert is_file_or_bytes(12345) is False - - @pytest.mark.parametrize( ('content_type_enum', 'content_type', 'expected_result'), [ @@ -115,7 +104,7 @@ async def test_force_remove(tmp_path: Path) -> None: assert test_file_path.exists() is False # Remove the file if it exists - with open(test_file_path, 'a', encoding='utf-8'): # noqa: ASYNC230 + with test_file_path.open('a', encoding='utf-8'): pass assert test_file_path.exists() is True await force_remove(test_file_path) @@ -134,11 +123,11 @@ async def test_force_rename(tmp_path: Path) -> None: # Will remove dst_dir if it exists (also covers normal case) # Create the src_dir with a file in it src_dir.mkdir() - with open(src_file, 'a', encoding='utf-8'): # noqa: ASYNC230 + with src_file.open('a', encoding='utf-8'): pass # Create the dst_dir with a file in it dst_dir.mkdir() - with open(dst_file, 'a', encoding='utf-8'): # noqa: ASYNC230 + with dst_file.open('a', encoding='utf-8'): pass assert src_file.exists() is True assert dst_file.exists() is True diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index b7ac06d124..f7aa551dd7 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -13,12 +13,10 @@ from uvicorn.config import Config from crawlee import service_locator -from crawlee.configuration import Configuration from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient from crawlee.proxy_configuration import ProxyInfo -from crawlee.storage_clients import MemoryStorageClient -from crawlee.storages import KeyValueStore, _creation_management +from crawlee.storages import Dataset, KeyValueStore, RequestQueue from tests.unit.server import TestServer, app, serve_in_thread if TYPE_CHECKING: @@ -64,20 +62,24 @@ def _prepare_test_env() -> None: service_locator._event_manager = None service_locator._storage_client = None - # Clear creation-related caches to ensure no state is carried over between tests. - monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) - # Verify that the test environment was set up correctly. 
assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) assert service_locator._configuration_was_retrieved is False assert service_locator._storage_client_was_retrieved is False assert service_locator._event_manager_was_retrieved is False + Dataset._cache_by_id.clear() + Dataset._cache_by_name.clear() + Dataset._default_instance = None + + KeyValueStore._cache_by_id.clear() + KeyValueStore._cache_by_name.clear() + KeyValueStore._default_instance = None + + RequestQueue._cache_by_id.clear() + RequestQueue._cache_by_name.clear() + RequestQueue._default_instance = None + return _prepare_test_env @@ -149,18 +151,6 @@ async def disabled_proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, Non yield proxy_info -@pytest.fixture -def memory_storage_client(tmp_path: Path) -> MemoryStorageClient: - """A fixture for testing the memory storage client and its resource clients.""" - config = Configuration( - persist_storage=True, - write_metadata=True, - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - ) - - return MemoryStorageClient.from_config(config) - - @pytest.fixture(scope='session') def header_network() -> dict: return get_available_header_network() diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 3ee386324a..19ebfa9cf7 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -356,9 +356,11 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: ): await crawler.run(requests) - mocked_predict.assert_called_once_with(requests[0]) + assert mocked_predict.call_count == 1 + assert mocked_predict.call_args[0][0].url == requests[0].url + # If `static` and `client only` results are same, `store_result` should be called with `static`. - mocked_store_result.assert_called_once_with(requests[0], expected_result_rendering_type) + mocked_store_result.assert_called_once_with(mocked_predict.call_args[0][0], expected_result_rendering_type) async def test_adaptive_crawling_result_use_state_isolation( @@ -500,10 +502,10 @@ async def test_adaptive_playwright_crawler_timeout_in_sub_crawler(test_urls: lis """Tests that timeout in static sub crawler forces fall back to browser sub crawler. Create situation where static sub crawler blocks(should time out), such error should start browser sub - crawler.""" - + crawler. + """ static_only_predictor_no_detection = _SimpleRenderingTypePredictor(detection_probability_recommendation=cycle([0])) - request_handler_timeout = timedelta(seconds=0.1) + request_handler_timeout = timedelta(seconds=1) crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( max_request_retries=1, @@ -522,9 +524,9 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: except AdaptiveContextError: mocked_static_handler() # Relax timeout for the fallback browser request to avoid flakiness in test - crawler._request_handler_timeout = timedelta(seconds=5) + crawler._request_handler_timeout = timedelta(seconds=10) # Sleep for time obviously larger than top crawler timeout. 
- await asyncio.sleep(request_handler_timeout.total_seconds() * 2) + await asyncio.sleep(request_handler_timeout.total_seconds() * 3) await crawler.run(test_urls[:1]) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index ebbe229627..fcf4971f51 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -10,7 +10,6 @@ from collections import Counter from dataclasses import dataclass from datetime import timedelta -from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, cast from unittest.mock import AsyncMock, Mock, call, patch @@ -32,16 +31,16 @@ if TYPE_CHECKING: from collections.abc import Callable, Sequence + from pathlib import Path from yarl import URL from crawlee._types import JsonSerializable - from crawlee.storage_clients._memory import DatasetClient async def test_processes_requests_from_explicit_queue() -> None: queue = await RequestQueue.open() - await queue.add_requests_batched(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await queue.add_requests(['http://a.com/', 'http://b.com/', 'http://c.com/']) crawler = BasicCrawler(request_manager=queue) calls = list[str]() @@ -57,7 +56,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_processes_requests_from_request_source_tandem() -> None: request_queue = await RequestQueue.open() - await request_queue.add_requests_batched(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await request_queue.add_requests(['http://a.com/', 'http://b.com/', 'http://c.com/']) request_list = RequestList(['http://a.com/', 'http://d.com', 'http://e.com']) @@ -537,8 +536,8 @@ async def handler(context: BasicCrawlingContext) -> None: assert visited == set(test_input.expected_urls) -async def test_session_rotation() -> None: - track_session_usage = Mock() +async def test_session_rotation(server_url: URL) -> None: + session_ids: list[str | None] = [] crawler = BasicCrawler( max_session_rotations=7, @@ -547,16 +546,20 @@ async def test_session_rotation() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - track_session_usage(context.session.id if context.session else None) + session_ids.append(context.session.id if context.session else None) raise SessionError('Test error') - await crawler.run([Request.from_url('https://someplace.com/', label='start')]) - assert track_session_usage.call_count == 7 + await crawler.run([str(server_url)]) - session_ids = {call[0][0] for call in track_session_usage.call_args_list} + # exactly 7 handler calls happened assert len(session_ids) == 7 + + # all session ids are not None assert None not in session_ids + # and each was a different session + assert len(set(session_ids)) == 7 + async def test_final_statistics() -> None: crawler = BasicCrawler(max_request_retries=3) @@ -615,6 +618,7 @@ async def test_crawler_get_storages() -> None: assert isinstance(kvs, KeyValueStore) +# THIS async def test_crawler_run_requests() -> None: crawler = BasicCrawler() seen_urls = list[str]() @@ -639,14 +643,14 @@ async def test_context_push_and_get_data() -> None: crawler = BasicCrawler() dataset = await Dataset.open() - await dataset.push_data('{"a": 1}') + await dataset.push_data({'a': 1}) assert (await crawler.get_data()).items == [{'a': 1}] @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - await context.push_data('{"b": 2}') + await context.push_data({'b': 2}) - await 
dataset.push_data('{"c": 3}') + await dataset.push_data({'c': 3}) assert (await crawler.get_data()).items == [{'a': 1}, {'c': 3}] stats = await crawler.run(['http://test.io/1']) @@ -661,7 +665,7 @@ async def test_context_push_and_get_data_handler_error() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - await context.push_data('{"b": 2}') + await context.push_data({'b': 2}) raise RuntimeError('Watch me crash') stats = await crawler.run(['https://a.com']) @@ -679,8 +683,8 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None: await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}]) await dataset.push_data({'id': 2, 'test': 'test'}) - await crawler.export_data_json(path=tmp_path / 'dataset.json') - await crawler.export_data_csv(path=tmp_path / 'dataset.csv') + await crawler.export_data(path=tmp_path / 'dataset.json') + await crawler.export_data(path=tmp_path / 'dataset.csv') assert json.load((tmp_path / 'dataset.json').open()) == [ {'id': 0, 'test': 'test'}, @@ -700,8 +704,8 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run(['http://test.io/1']) - await crawler.export_data_json(path=tmp_path / 'dataset.json') - await crawler.export_data_csv(path=tmp_path / 'dataset.csv') + await crawler.export_data(path=tmp_path / 'dataset.json') + await crawler.export_data(path=tmp_path / 'dataset.csv') assert json.load((tmp_path / 'dataset.json').open()) == [ {'id': 0, 'test': 'test'}, @@ -712,45 +716,6 @@ async def handler(context: BasicCrawlingContext) -> None: assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n' -async def test_crawler_push_and_export_data_and_json_dump_parameter(tmp_path: Path) -> None: - crawler = BasicCrawler() - - @crawler.router.default_handler - async def handler(context: BasicCrawlingContext) -> None: - await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}]) - await context.push_data({'id': 2, 'test': 'test'}) - - await crawler.run(['http://test.io/1']) - - await crawler.export_data_json(path=tmp_path / 'dataset.json', indent=3) - - with (tmp_path / 'dataset.json').open() as json_file: - exported_json_str = json_file.read() - - # Expected data in JSON format with 3 spaces indent - expected_data = [ - {'id': 0, 'test': 'test'}, - {'id': 1, 'test': 'test'}, - {'id': 2, 'test': 'test'}, - ] - expected_json_str = json.dumps(expected_data, indent=3) - - # Assert that the exported JSON string matches the expected JSON string - assert exported_json_str == expected_json_str - - -async def test_crawler_push_data_over_limit() -> None: - crawler = BasicCrawler() - - @crawler.router.default_handler - async def handler(context: BasicCrawlingContext) -> None: - # Push a roughly 15MB payload - this should be enough to break the 9MB limit - await context.push_data({'hello': 'world' * 3 * 1024 * 1024}) - - stats = await crawler.run(['http://example.tld/1']) - assert stats.requests_failed == 1 - - async def test_context_update_kv_store() -> None: crawler = BasicCrawler() @@ -765,7 +730,7 @@ async def handler(context: BasicCrawlingContext) -> None: assert (await store.get_value('foo')) == 'bar' -async def test_context_use_state(key_value_store: KeyValueStore) -> None: +async def test_context_use_state() -> None: crawler = BasicCrawler() @crawler.router.default_handler @@ -774,9 +739,10 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run(['https://hello.world']) - store = await 
crawler.get_key_value_store() + kvs = await crawler.get_key_value_store() + value = await kvs.get_value(BasicCrawler._CRAWLEE_STATE_KEY) - assert (await store.get_value(BasicCrawler._CRAWLEE_STATE_KEY)) == {'hello': 'world'} + assert value == {'hello': 'world'} async def test_context_handlers_use_state(key_value_store: KeyValueStore) -> None: @@ -940,18 +906,6 @@ async def handler(context: BasicCrawlingContext) -> None: } -async def test_respects_no_persist_storage() -> None: - configuration = Configuration(persist_storage=False) - crawler = BasicCrawler(configuration=configuration) - - @crawler.router.default_handler - async def handler(context: BasicCrawlingContext) -> None: - await context.push_data({'something': 'something'}) - - datasets_path = Path(configuration.storage_dir) / 'datasets' / 'default' - assert not datasets_path.exists() or list(datasets_path.iterdir()) == [] - - @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI') @pytest.mark.parametrize( ('statistics_log_format'), @@ -1091,9 +1045,9 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_sets_services() -> None: custom_configuration = Configuration() custom_event_manager = LocalEventManager.from_config(custom_configuration) - custom_storage_client = MemoryStorageClient.from_config(custom_configuration) + custom_storage_client = MemoryStorageClient() - crawler = BasicCrawler( + _ = BasicCrawler( configuration=custom_configuration, event_manager=custom_event_manager, storage_client=custom_storage_client, @@ -1103,12 +1057,9 @@ async def test_sets_services() -> None: assert service_locator.get_event_manager() is custom_event_manager assert service_locator.get_storage_client() is custom_storage_client - dataset = await crawler.get_dataset(name='test') - assert cast('DatasetClient', dataset._resource_client)._memory_storage_client is custom_storage_client - async def test_allows_storage_client_overwrite_before_run(monkeypatch: pytest.MonkeyPatch) -> None: - custom_storage_client = MemoryStorageClient.from_config() + custom_storage_client = MemoryStorageClient() crawler = BasicCrawler( storage_client=custom_storage_client, @@ -1118,7 +1069,7 @@ async def test_allows_storage_client_overwrite_before_run(monkeypatch: pytest.Mo async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'foo': 'bar'}) - other_storage_client = MemoryStorageClient.from_config() + other_storage_client = MemoryStorageClient() service_locator.set_storage_client(other_storage_client) with monkeypatch.context() as monkey: @@ -1128,8 +1079,6 @@ async def handler(context: BasicCrawlingContext) -> None: assert spy.call_count >= 1 dataset = await crawler.get_dataset() - assert cast('DatasetClient', dataset._resource_client)._memory_storage_client is other_storage_client - data = await dataset.get_data() assert data.items == [{'foo': 'bar'}] @@ -1396,23 +1345,30 @@ async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None: assert spy.call_count == 1 -async def test_reduced_logs_from_timed_out_request_handler( - monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture -) -> None: +async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.INFO) - crawler = BasicCrawler(configure_logging=False, request_handler_timeout=timedelta(seconds=1)) + crawler = BasicCrawler( + configure_logging=False, + request_handler_timeout=timedelta(seconds=1), + ) @crawler.router.default_handler async 
def handler(context: BasicCrawlingContext) -> None: + # Intentionally add a delay longer than the timeout to trigger the timeout mechanism await asyncio.sleep(10) # INJECTED DELAY - await crawler.run([Request.from_url('http://a.com/')]) + # Capture all logs from the 'crawlee' logger at INFO level or higher + with caplog.at_level(logging.INFO, logger='crawlee'): + await crawler.run([Request.from_url('http://a.com/')]) + # Check for the timeout message in any of the logs + found_timeout_message = False for record in caplog.records: - if record.funcName == '_handle_failed_request': + if record.message and 'timed out after 1.0 seconds' in record.message: full_message = (record.message or '') + (record.exc_text or '') assert Counter(full_message)['\n'] < 10 assert '# INJECTED DELAY' in full_message + found_timeout_message = True break - else: - raise AssertionError('Expected log message about request handler error was not found.') + + assert found_timeout_message, 'Expected log message about request handler error was not found.' diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py index 7f00ff2166..807e14e9cc 100644 --- a/tests/unit/crawlers/_http/test_http_crawler.py +++ b/tests/unit/crawlers/_http/test_http_crawler.py @@ -544,7 +544,8 @@ async def request_handler(context: HttpCrawlingContext) -> None: async def test_error_snapshot_through_statistics(server_url: URL) -> None: - crawler = HttpCrawler(statistics=Statistics.with_default_state(save_error_snapshots=True)) + statistics = Statistics.with_default_state(save_error_snapshots=True) + crawler = HttpCrawler(statistics=statistics) @crawler.router.default_handler async def request_handler(context: HttpCrawlingContext) -> None: diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py new file mode 100644 index 0000000000..bdc1a361a9 --- /dev/null +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -0,0 +1,317 @@ +from __future__ import annotations + +import asyncio +import json +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +from crawlee._consts import METADATA_FILENAME +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient +from crawlee.storage_clients._file_system import FileSystemDatasetClient +from crawlee.storage_clients.models import DatasetItemsListPage + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +@pytest.fixture +def configuration(tmp_path: Path) -> Configuration: + return Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + ) + + +@pytest.fixture +async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]: + """A fixture for a file system dataset client.""" + client = await FileSystemStorageClient().open_dataset_client( + name='test_dataset', + configuration=configuration, + ) + yield client + await client.drop() + + +async def test_open_creates_new_dataset(configuration: Configuration) -> None: + """Test that open() creates a new dataset with proper metadata when it doesn't exist.""" + client = await FileSystemStorageClient().open_dataset_client( + name='new_dataset', + configuration=configuration, + ) + + # Verify correct client type and properties + assert isinstance(client, FileSystemDatasetClient) + assert client.metadata.id is not None + assert 
client.metadata.name == 'new_dataset' + assert client.metadata.item_count == 0 + assert isinstance(client.metadata.created_at, datetime) + assert isinstance(client.metadata.accessed_at, datetime) + assert isinstance(client.metadata.modified_at, datetime) + + # Verify files were created + assert client.path_to_dataset.exists() + assert client.path_to_metadata.exists() + + # Verify metadata content + with client.path_to_metadata.open() as f: + metadata = json.load(f) + assert metadata['id'] == client.metadata.id + assert metadata['name'] == 'new_dataset' + assert metadata['item_count'] == 0 + + +async def test_dataset_client_purge_on_start(configuration: Configuration) -> None: + """Test that purge_on_start=True clears existing data in the dataset.""" + configuration.purge_on_start = True + + # Create dataset and add data + dataset_client1 = await FileSystemStorageClient().open_dataset_client( + configuration=configuration, + ) + await dataset_client1.push_data({'item': 'initial data'}) + + # Verify data was added + items = await dataset_client1.get_data() + assert len(items.items) == 1 + + # Reopen + dataset_client2 = await FileSystemStorageClient().open_dataset_client( + configuration=configuration, + ) + + # Verify data was purged + items = await dataset_client2.get_data() + assert len(items.items) == 0 + + +async def test_dataset_client_no_purge_on_start(configuration: Configuration) -> None: + """Test that purge_on_start=False keeps existing data in the dataset.""" + configuration.purge_on_start = False + + # Create dataset and add data + dataset_client1 = await FileSystemStorageClient().open_dataset_client( + name='test-no-purge-dataset', + configuration=configuration, + ) + await dataset_client1.push_data({'item': 'preserved data'}) + + # Reopen + dataset_client2 = await FileSystemStorageClient().open_dataset_client( + name='test-no-purge-dataset', + configuration=configuration, + ) + + # Verify data was preserved + items = await dataset_client2.get_data() + assert len(items.items) == 1 + assert items.items[0]['item'] == 'preserved data' + + +async def test_push_data_single_item(dataset_client: FileSystemDatasetClient) -> None: + """Test pushing a single item to the dataset.""" + item = {'key': 'value', 'number': 42} + await dataset_client.push_data(item) + + # Verify item count was updated + assert dataset_client.metadata.item_count == 1 + + all_files = list(dataset_client.path_to_dataset.glob('*.json')) + assert len(all_files) == 2 # 1 data file + 1 metadata file + + # Verify item was persisted + data_files = [item for item in all_files if item.name != METADATA_FILENAME] + assert len(data_files) == 1 + + # Verify file content + with Path(data_files[0]).open() as f: + saved_item = json.load(f) + assert saved_item == item + + +async def test_push_data_multiple_items(dataset_client: FileSystemDatasetClient) -> None: + """Test pushing multiple items to the dataset.""" + items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}] + await dataset_client.push_data(items) + + # Verify item count was updated + assert dataset_client.metadata.item_count == 3 + + all_files = list(dataset_client.path_to_dataset.glob('*.json')) + assert len(all_files) == 4 # 3 data files + 1 metadata file + + # Verify items were saved to files + data_files = [f for f in all_files if f.name != METADATA_FILENAME] + assert len(data_files) == 3 + + +async def test_get_data_empty_dataset(dataset_client: FileSystemDatasetClient) -> None: + """Test getting data from an empty dataset 
returns empty list.""" + result = await dataset_client.get_data() + + assert isinstance(result, DatasetItemsListPage) + assert result.count == 0 + assert result.total == 0 + assert result.items == [] + + +async def test_get_data_with_items(dataset_client: FileSystemDatasetClient) -> None: + """Test getting data from a dataset returns all items in order with correct properties.""" + # Add some items + items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}] + await dataset_client.push_data(items) + + # Get all items + result = await dataset_client.get_data() + + assert result.count == 3 + assert result.total == 3 + assert len(result.items) == 3 + assert result.items[0]['id'] == 1 + assert result.items[1]['id'] == 2 + assert result.items[2]['id'] == 3 + + +async def test_get_data_with_pagination(dataset_client: FileSystemDatasetClient) -> None: + """Test getting data with offset and limit parameters for pagination implementation.""" + # Add some items + items = [{'id': i} for i in range(1, 11)] # 10 items + await dataset_client.push_data(items) + + # Test offset + result = await dataset_client.get_data(offset=3) + assert result.count == 7 + assert result.offset == 3 + assert result.items[0]['id'] == 4 + + # Test limit + result = await dataset_client.get_data(limit=5) + assert result.count == 5 + assert result.limit == 5 + assert result.items[-1]['id'] == 5 + + # Test both offset and limit + result = await dataset_client.get_data(offset=2, limit=3) + assert result.count == 3 + assert result.offset == 2 + assert result.limit == 3 + assert result.items[0]['id'] == 3 + assert result.items[-1]['id'] == 5 + + +async def test_get_data_descending_order(dataset_client: FileSystemDatasetClient) -> None: + """Test getting data in descending order reverses the item order.""" + # Add some items + items = [{'id': i} for i in range(1, 6)] # 5 items + await dataset_client.push_data(items) + + # Get items in descending order + result = await dataset_client.get_data(desc=True) + + assert result.desc is True + assert result.items[0]['id'] == 5 + assert result.items[-1]['id'] == 1 + + +async def test_get_data_skip_empty(dataset_client: FileSystemDatasetClient) -> None: + """Test getting data with skip_empty option filters out empty items when True.""" + # Add some items including an empty one + items = [ + {'id': 1, 'name': 'Item 1'}, + {}, # Empty item + {'id': 3, 'name': 'Item 3'}, + ] + await dataset_client.push_data(items) + + # Get all items + result = await dataset_client.get_data() + assert result.count == 3 + + # Get non-empty items + result = await dataset_client.get_data(skip_empty=True) + assert result.count == 2 + assert all(item != {} for item in result.items) + + +async def test_iterate(dataset_client: FileSystemDatasetClient) -> None: + """Test iterating over dataset items yields each item in the original order.""" + # Add some items + items = [{'id': i} for i in range(1, 6)] # 5 items + await dataset_client.push_data(items) + + # Iterate over all items + collected_items = [item async for item in dataset_client.iterate_items()] + + assert len(collected_items) == 5 + assert collected_items[0]['id'] == 1 + assert collected_items[-1]['id'] == 5 + + +async def test_iterate_with_options(dataset_client: FileSystemDatasetClient) -> None: + """Test iterating with offset, limit and desc parameters works the same as with get_data().""" + # Add some items + items = [{'id': i} for i in range(1, 11)] # 10 items + await dataset_client.push_data(items) + + # Test with 
offset and limit + collected_items = [item async for item in dataset_client.iterate_items(offset=3, limit=3)] + + assert len(collected_items) == 3 + assert collected_items[0]['id'] == 4 + assert collected_items[-1]['id'] == 6 + + # Test with descending order + collected_items = [] + async for item in dataset_client.iterate_items(desc=True, limit=3): + collected_items.append(item) + + assert len(collected_items) == 3 + assert collected_items[0]['id'] == 10 + assert collected_items[-1]['id'] == 8 + + +async def test_drop(dataset_client: FileSystemDatasetClient) -> None: + """Test dropping a dataset removes the entire dataset directory from disk.""" + await dataset_client.push_data({'test': 'data'}) + + assert dataset_client.path_to_dataset.exists() + + # Drop the dataset + await dataset_client.drop() + + assert not dataset_client.path_to_dataset.exists() + + +async def test_metadata_updates(dataset_client: FileSystemDatasetClient) -> None: + """Test that metadata timestamps are updated correctly after read and write operations.""" + # Record initial timestamps + initial_created = dataset_client.metadata.created_at + initial_accessed = dataset_client.metadata.accessed_at + initial_modified = dataset_client.metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates accessed_at + await dataset_client.get_data() + + # Verify timestamps + assert dataset_client.metadata.created_at == initial_created + assert dataset_client.metadata.accessed_at > initial_accessed + assert dataset_client.metadata.modified_at == initial_modified + + accessed_after_get = dataset_client.metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates modified_at + await dataset_client.push_data({'new': 'item'}) + + # Verify timestamps again + assert dataset_client.metadata.created_at == initial_created + assert dataset_client.metadata.modified_at > initial_modified + assert dataset_client.metadata.accessed_at > accessed_after_get diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py new file mode 100644 index 0000000000..bfc91af2cc --- /dev/null +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -0,0 +1,360 @@ +from __future__ import annotations + +import asyncio +import json +from datetime import datetime +from typing import TYPE_CHECKING + +import pytest + +from crawlee._consts import METADATA_FILENAME +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient +from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + from pathlib import Path + + +@pytest.fixture +def configuration(tmp_path: Path) -> Configuration: + return Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + ) + + +@pytest.fixture +async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]: + """A fixture for a file system key-value store client.""" + client = await FileSystemStorageClient().open_key_value_store_client( + name='test_kvs', + configuration=configuration, + ) + yield client + await client.drop() + + +async def test_open_creates_new_kvs(configuration: Configuration) -> None: + """Test that open() creates a new key-value store with proper metadata and files on disk.""" + 
client = await FileSystemStorageClient().open_key_value_store_client( + name='new_kvs', + configuration=configuration, + ) + + # Verify correct client type and properties + assert isinstance(client, FileSystemKeyValueStoreClient) + assert client.metadata.id is not None + assert client.metadata.name == 'new_kvs' + assert isinstance(client.metadata.created_at, datetime) + assert isinstance(client.metadata.accessed_at, datetime) + assert isinstance(client.metadata.modified_at, datetime) + + # Verify files were created + assert client.path_to_kvs.exists() + assert client.path_to_metadata.exists() + + # Verify metadata content + with client.path_to_metadata.open() as f: + metadata = json.load(f) + assert metadata['id'] == client.metadata.id + assert metadata['name'] == 'new_kvs' + + +async def test_kvs_client_purge_on_start(configuration: Configuration) -> None: + """Test that purge_on_start=True clears existing data in the key-value store.""" + configuration.purge_on_start = True + + # Create KVS and add data + kvs_client1 = await FileSystemStorageClient().open_key_value_store_client( + configuration=configuration, + ) + await kvs_client1.set_value(key='test-key', value='initial value') + + # Verify value was set + record = await kvs_client1.get_value(key='test-key') + assert record is not None + assert record.value == 'initial value' + + # Reopen + kvs_client2 = await FileSystemStorageClient().open_key_value_store_client( + configuration=configuration, + ) + + # Verify value was purged + record = await kvs_client2.get_value(key='test-key') + assert record is None + + +async def test_kvs_client_no_purge_on_start(configuration: Configuration) -> None: + """Test that purge_on_start=False keeps existing data in the key-value store.""" + configuration.purge_on_start = False + + # Create KVS and add data + kvs_client1 = await FileSystemStorageClient().open_key_value_store_client( + name='test-no-purge-kvs', + configuration=configuration, + ) + await kvs_client1.set_value(key='test-key', value='preserved value') + + # Reopen + kvs_client2 = await FileSystemStorageClient().open_key_value_store_client( + name='test-no-purge-kvs', + configuration=configuration, + ) + + # Verify value was preserved + record = await kvs_client2.get_value(key='test-key') + assert record is not None + assert record.value == 'preserved value' + + +async def test_set_get_value_string(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test setting and getting a string value with correct file creation and metadata.""" + # Set a value + test_key = 'test-key' + test_value = 'Hello, world!' 
+ await kvs_client.set_value(key=test_key, value=test_value) + + # Check if the file was created + key_path = kvs_client.path_to_kvs / test_key + key_metadata_path = kvs_client.path_to_kvs / f'{test_key}.{METADATA_FILENAME}' + assert key_path.exists() + assert key_metadata_path.exists() + + # Check file content + content = key_path.read_text(encoding='utf-8') + assert content == test_value + + # Check record metadata + with key_metadata_path.open() as f: + metadata = json.load(f) + assert metadata['key'] == test_key + assert metadata['content_type'] == 'text/plain; charset=utf-8' + assert metadata['size'] == len(test_value.encode('utf-8')) + + # Get the value + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.key == test_key + assert record.value == test_value + assert record.content_type == 'text/plain; charset=utf-8' + assert record.size == len(test_value.encode('utf-8')) + + +async def test_set_get_value_json(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test setting and getting a JSON value with correct serialization and deserialization.""" + # Set a value + test_key = 'test-json' + test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]} + await kvs_client.set_value(key=test_key, value=test_value) + + # Get the value + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.key == test_key + assert record.value == test_value + assert 'application/json' in record.content_type + + +async def test_set_get_value_bytes(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test setting and getting binary data without corruption and with correct content type.""" + # Set a value + test_key = 'test-binary' + test_value = b'\x00\x01\x02\x03\x04' + await kvs_client.set_value(key=test_key, value=test_value) + + # Get the value + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.key == test_key + assert record.value == test_value + assert record.content_type == 'application/octet-stream' + assert record.size == len(test_value) + + +async def test_set_value_explicit_content_type(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that an explicitly provided content type overrides the automatically inferred one.""" + test_key = 'test-explicit-content-type' + test_value = 'Hello, world!' 
+ explicit_content_type = 'text/html; charset=utf-8' + + await kvs_client.set_value(key=test_key, value=test_value, content_type=explicit_content_type) + + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.content_type == explicit_content_type + + +async def test_get_nonexistent_value(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that attempting to get a non-existent key returns None.""" + record = await kvs_client.get_value(key='nonexistent-key') + assert record is None + + +async def test_overwrite_value(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that an existing value can be overwritten and the updated value is retrieved correctly.""" + test_key = 'test-overwrite' + + # Set initial value + initial_value = 'Initial value' + await kvs_client.set_value(key=test_key, value=initial_value) + + # Overwrite with new value + new_value = 'New value' + await kvs_client.set_value(key=test_key, value=new_value) + + # Verify the updated value + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.value == new_value + + +async def test_delete_value(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that deleting a value removes its files from disk and makes it irretrievable.""" + test_key = 'test-delete' + test_value = 'Delete me' + + # Set a value + await kvs_client.set_value(key=test_key, value=test_value) + + # Verify it exists + key_path = kvs_client.path_to_kvs / test_key + metadata_path = kvs_client.path_to_kvs / f'{test_key}.{METADATA_FILENAME}' + assert key_path.exists() + assert metadata_path.exists() + + # Delete the value + await kvs_client.delete_value(key=test_key) + + # Verify files were deleted + assert not key_path.exists() + assert not metadata_path.exists() + + # Verify value is no longer retrievable + record = await kvs_client.get_value(key=test_key) + assert record is None + + +async def test_delete_nonexistent_value(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that attempting to delete a non-existent key is a no-op and doesn't raise errors.""" + # Should not raise an error + await kvs_client.delete_value(key='nonexistent-key') + + +async def test_iterate_keys_empty_store(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that iterating over an empty store yields no keys.""" + keys = [key async for key in kvs_client.iterate_keys()] + assert len(keys) == 0 + + +async def test_iterate_keys(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that all keys can be iterated over and are returned in sorted order.""" + # Add some values + await kvs_client.set_value(key='key1', value='value1') + await kvs_client.set_value(key='key2', value='value2') + await kvs_client.set_value(key='key3', value='value3') + + # Iterate over keys + keys = [key.key async for key in kvs_client.iterate_keys()] + assert len(keys) == 3 + assert sorted(keys) == ['key1', 'key2', 'key3'] + + +async def test_iterate_keys_with_limit(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that the limit parameter returns only the specified number of keys.""" + # Add some values + await kvs_client.set_value(key='key1', value='value1') + await kvs_client.set_value(key='key2', value='value2') + await kvs_client.set_value(key='key3', value='value3') + + # Iterate with limit + keys = [key.key async for key in kvs_client.iterate_keys(limit=2)] + assert len(keys) == 2 + + +async def test_iterate_keys_with_exclusive_start_key(kvs_client: 
FileSystemKeyValueStoreClient) -> None: + """Test that exclusive_start_key parameter returns only keys after it alphabetically.""" + # Add some values with alphabetical keys + await kvs_client.set_value(key='a-key', value='value-a') + await kvs_client.set_value(key='b-key', value='value-b') + await kvs_client.set_value(key='c-key', value='value-c') + await kvs_client.set_value(key='d-key', value='value-d') + + # Iterate with exclusive start key + keys = [key.key async for key in kvs_client.iterate_keys(exclusive_start_key='b-key')] + assert len(keys) == 2 + assert 'c-key' in keys + assert 'd-key' in keys + assert 'a-key' not in keys + assert 'b-key' not in keys + + +async def test_drop(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that drop removes the entire store directory from disk.""" + await kvs_client.set_value(key='test', value='test-value') + + assert kvs_client.path_to_kvs.exists() + + # Drop the store + await kvs_client.drop() + + assert not kvs_client.path_to_kvs.exists() + + +async def test_metadata_updates(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that read/write operations properly update accessed_at and modified_at timestamps.""" + # Record initial timestamps + initial_created = kvs_client.metadata.created_at + initial_accessed = kvs_client.metadata.accessed_at + initial_modified = kvs_client.metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates accessed_at + await kvs_client.get_value(key='nonexistent') + + # Verify timestamps + assert kvs_client.metadata.created_at == initial_created + assert kvs_client.metadata.accessed_at > initial_accessed + assert kvs_client.metadata.modified_at == initial_modified + + accessed_after_get = kvs_client.metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates modified_at + await kvs_client.set_value(key='new-key', value='new-value') + + # Verify timestamps again + assert kvs_client.metadata.created_at == initial_created + assert kvs_client.metadata.modified_at > initial_modified + assert kvs_client.metadata.accessed_at > accessed_after_get + + +async def test_get_public_url_not_supported(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that get_public_url raises NotImplementedError for the file system implementation.""" + with pytest.raises(NotImplementedError, match='Public URLs are not supported'): + await kvs_client.get_public_url(key='any-key') + + +async def test_concurrent_operations(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that multiple concurrent set operations can be performed safely with correct results.""" + + # Create multiple tasks to set different values concurrently + async def set_value(key: str, value: str) -> None: + await kvs_client.set_value(key=key, value=value) + + tasks = [asyncio.create_task(set_value(f'concurrent-key-{i}', f'value-{i}')) for i in range(10)] + + # Wait for all tasks to complete + await asyncio.gather(*tasks) + + # Verify all values were set correctly + for i in range(10): + key = f'concurrent-key-{i}' + record = await kvs_client.get_value(key=key) + assert record is not None + assert record.value == f'value-{i}' diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py new file mode 100644 index 0000000000..10ef63a8ef --- /dev/null +++ 
b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -0,0 +1,454 @@ +from __future__ import annotations + +import asyncio +import json +from datetime import datetime +from typing import TYPE_CHECKING + +import pytest + +from crawlee import Request +from crawlee._consts import METADATA_FILENAME +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient +from crawlee.storage_clients._file_system import FileSystemRequestQueueClient + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + from pathlib import Path + + +@pytest.fixture +def configuration(tmp_path: Path) -> Configuration: + return Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + ) + + +@pytest.fixture +async def rq_client(configuration: Configuration) -> AsyncGenerator[FileSystemRequestQueueClient, None]: + """A fixture for a file system request queue client.""" + client = await FileSystemStorageClient().open_request_queue_client( + name='test_request_queue', + configuration=configuration, + ) + yield client + await client.drop() + + +async def test_open_creates_new_rq(configuration: Configuration) -> None: + """Test that open() creates a new request queue with proper metadata and files on disk.""" + client = await FileSystemStorageClient().open_request_queue_client( + name='new_request_queue', + configuration=configuration, + ) + + # Verify correct client type and properties + assert isinstance(client, FileSystemRequestQueueClient) + assert client.metadata.id is not None + assert client.metadata.name == 'new_request_queue' + assert client.metadata.handled_request_count == 0 + assert client.metadata.pending_request_count == 0 + assert client.metadata.total_request_count == 0 + assert isinstance(client.metadata.created_at, datetime) + assert isinstance(client.metadata.accessed_at, datetime) + assert isinstance(client.metadata.modified_at, datetime) + + # Verify files were created + assert client.path_to_rq.exists() + assert client.path_to_metadata.exists() + + # Verify metadata content + with client.path_to_metadata.open() as f: + metadata = json.load(f) + assert metadata['id'] == client.metadata.id + assert metadata['name'] == 'new_request_queue' + + +async def test_rq_client_purge_on_start(configuration: Configuration) -> None: + """Test that purge_on_start=True clears existing data in the request queue.""" + configuration.purge_on_start = True + + # Create request queue and add data + rq_client1 = await FileSystemStorageClient().open_request_queue_client(configuration=configuration) + await rq_client1.add_batch_of_requests([Request.from_url('https://example.com')]) + + # Verify request was added + assert rq_client1.metadata.total_request_count == 1 + + # Reopen + rq_client2 = await FileSystemStorageClient().open_request_queue_client(configuration=configuration) + + # Verify data was purged + assert rq_client2.metadata.total_request_count == 0 + + +async def test_rq_client_no_purge_on_start(configuration: Configuration) -> None: + """Test that purge_on_start=False keeps existing data in the request queue.""" + configuration.purge_on_start = False + + # Create request queue and add data + rq_client1 = await FileSystemStorageClient().open_request_queue_client( + name='test-no-purge-rq', + configuration=configuration, + ) + await rq_client1.add_batch_of_requests([Request.from_url('https://example.com')]) + + # Reopen + rq_client2 = await FileSystemStorageClient().open_request_queue_client( + name='test-no-purge-rq', + 
configuration=configuration, + ) + + # Verify data was preserved + assert rq_client2.metadata.total_request_count == 1 + + +@pytest.fixture +def rq_path(rq_client: FileSystemRequestQueueClient) -> Path: + """Return the path to the request queue directory.""" + return rq_client.path_to_rq + + +async def test_add_requests(rq_client: FileSystemRequestQueueClient) -> None: + """Test adding requests creates proper files in the filesystem.""" + # Add a batch of requests + requests = [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + Request.from_url('https://example.com/3'), + ] + + response = await rq_client.add_batch_of_requests(requests) + + # Verify response + assert len(response.processed_requests) == 3 + for i, processed_request in enumerate(response.processed_requests): + assert processed_request.unique_key == f'https://example.com/{i + 1}' + assert processed_request.was_already_present is False + assert processed_request.was_already_handled is False + + # Verify request files were created + request_files = list(rq_client.path_to_rq.glob('*.json')) + assert len(request_files) == 4 # 3 requests + metadata file + assert rq_client.path_to_metadata in request_files + + # Verify metadata was updated + assert rq_client.metadata.total_request_count == 3 + assert rq_client.metadata.pending_request_count == 3 + + # Verify content of the request files + for req_file in [f for f in request_files if f != rq_client.path_to_metadata]: + with req_file.open() as f: + content = json.load(f) + assert 'url' in content + assert content['url'].startswith('https://example.com/') + assert 'id' in content + assert 'handled_at' not in content # Not yet handled + + +async def test_add_duplicate_request(rq_client: FileSystemRequestQueueClient) -> None: + """Test adding a duplicate request.""" + request = Request.from_url('https://example.com') + + # Add the request the first time + await rq_client.add_batch_of_requests([request]) + + # Add the same request again + second_response = await rq_client.add_batch_of_requests([request]) + + # Verify response indicates it was already present + assert second_response.processed_requests[0].was_already_present is True + + # Verify only one request file exists + request_files = [f for f in rq_client.path_to_rq.glob('*.json') if f.name != METADATA_FILENAME] + assert len(request_files) == 1 + + # Verify metadata counts weren't incremented + assert rq_client.metadata.total_request_count == 1 + assert rq_client.metadata.pending_request_count == 1 + + +async def test_fetch_next_request(rq_client: FileSystemRequestQueueClient) -> None: + """Test fetching the next request from the queue.""" + # Add requests + requests = [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + ] + await rq_client.add_batch_of_requests(requests) + + # Fetch the first request + first_request = await rq_client.fetch_next_request() + assert first_request is not None + assert first_request.url == 'https://example.com/1' + + # Check that it's marked as in-progress + assert first_request.id in rq_client._in_progress + + # Fetch the second request + second_request = await rq_client.fetch_next_request() + assert second_request is not None + assert second_request.url == 'https://example.com/2' + + # There should be no more requests + empty_request = await rq_client.fetch_next_request() + assert empty_request is None + + +async def test_fetch_forefront_requests(rq_client: FileSystemRequestQueueClient) -> None: + """Test that 
forefront requests are fetched first.""" + # Add regular requests + await rq_client.add_batch_of_requests( + [ + Request.from_url('https://example.com/regular1'), + Request.from_url('https://example.com/regular2'), + ] + ) + + # Add forefront requests + await rq_client.add_batch_of_requests( + [ + Request.from_url('https://example.com/priority1'), + Request.from_url('https://example.com/priority2'), + ], + forefront=True, + ) + + # Fetch requests - they should come in priority order first + next_request1 = await rq_client.fetch_next_request() + assert next_request1 is not None + assert next_request1.url.startswith('https://example.com/priority') + + next_request2 = await rq_client.fetch_next_request() + assert next_request2 is not None + assert next_request2.url.startswith('https://example.com/priority') + + next_request3 = await rq_client.fetch_next_request() + assert next_request3 is not None + assert next_request3.url.startswith('https://example.com/regular') + + next_request4 = await rq_client.fetch_next_request() + assert next_request4 is not None + assert next_request4.url.startswith('https://example.com/regular') + + +async def test_mark_request_as_handled(rq_client: FileSystemRequestQueueClient) -> None: + """Test marking a request as handled.""" + # Add and fetch a request + await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) + request = await rq_client.fetch_next_request() + assert request is not None + + # Mark it as handled + result = await rq_client.mark_request_as_handled(request) + assert result is not None + assert result.was_already_handled is True + + # Verify it's no longer in-progress + assert request.id not in rq_client._in_progress + + # Verify metadata was updated + assert rq_client.metadata.handled_request_count == 1 + assert rq_client.metadata.pending_request_count == 0 + + # Verify the file was updated with handled_at timestamp + request_files = [f for f in rq_client.path_to_rq.glob('*.json') if f.name != METADATA_FILENAME] + assert len(request_files) == 1 + + with request_files[0].open() as f: + content = json.load(f) + assert 'handled_at' in content + assert content['handled_at'] is not None + + +async def test_reclaim_request(rq_client: FileSystemRequestQueueClient) -> None: + """Test reclaiming a request that failed processing.""" + # Add and fetch a request + await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) + request = await rq_client.fetch_next_request() + assert request is not None + + # Reclaim the request + result = await rq_client.reclaim_request(request) + assert result is not None + assert result.was_already_handled is False + + # Verify it's no longer in-progress + assert request.id not in rq_client._in_progress + + # Should be able to fetch it again + reclaimed_request = await rq_client.fetch_next_request() + assert reclaimed_request is not None + assert reclaimed_request.id == request.id + + +async def test_reclaim_request_with_forefront(rq_client: FileSystemRequestQueueClient) -> None: + """Test reclaiming a request with forefront priority.""" + # Add requests + await rq_client.add_batch_of_requests( + [ + Request.from_url('https://example.com/first'), + Request.from_url('https://example.com/second'), + ] + ) + + # Fetch the first request + first_request = await rq_client.fetch_next_request() + assert first_request is not None + assert first_request.url == 'https://example.com/first' + + # Reclaim it with forefront priority + await rq_client.reclaim_request(first_request, forefront=True) 
+ + # Verify it's in the forefront set + assert first_request.id in rq_client._forefront_requests + + # It should be returned before the second request + reclaimed_request = await rq_client.fetch_next_request() + assert reclaimed_request is not None + assert reclaimed_request.url == 'https://example.com/first' + + +async def test_is_empty(rq_client: FileSystemRequestQueueClient) -> None: + """Test checking if a queue is empty.""" + # Queue should start empty + assert await rq_client.is_empty() is True + + # Add a request + await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) + assert await rq_client.is_empty() is False + + # Fetch and handle the request + request = await rq_client.fetch_next_request() + assert request is not None + await rq_client.mark_request_as_handled(request) + + # Queue should be empty again + assert await rq_client.is_empty() is True + + +async def test_get_request(rq_client: FileSystemRequestQueueClient) -> None: + """Test getting a request by ID.""" + # Add a request + response = await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) + request_id = response.processed_requests[0].id + + # Get the request by ID + request = await rq_client.get_request(request_id) + assert request is not None + assert request.id == request_id + assert request.url == 'https://example.com' + + # Try to get a non-existent request + not_found = await rq_client.get_request('non-existent-id') + assert not_found is None + + +async def test_drop(configuration: Configuration) -> None: + """Test dropping the queue removes files from the filesystem.""" + client = await FileSystemStorageClient().open_request_queue_client( + name='drop_test', + configuration=configuration, + ) + + # Add requests to create files + await client.add_batch_of_requests( + [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + ] + ) + + # Verify the directory exists + rq_path = client.path_to_rq + assert rq_path.exists() + + # Drop the client + await client.drop() + + # Verify the directory was removed + assert not rq_path.exists() + + +async def test_file_persistence(configuration: Configuration) -> None: + """Test that requests are persisted to files and can be recovered after a 'restart'.""" + # Explicitly set purge_on_start to False to ensure files aren't deleted + configuration.purge_on_start = False + + # Create a client and add requests + client1 = await FileSystemStorageClient().open_request_queue_client( + name='persistence_test', + configuration=configuration, + ) + + await client1.add_batch_of_requests( + [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + ] + ) + + # Fetch and handle one request + request = await client1.fetch_next_request() + assert request is not None + await client1.mark_request_as_handled(request) + + # Get the storage directory path before clearing the cache + storage_path = client1.path_to_rq + assert storage_path.exists(), 'Request queue directory should exist' + + # Verify files exist + request_files = list(storage_path.glob('*.json')) + assert len(request_files) > 0, 'Request files should exist' + + # Create a new client with same name (which will load from files) + client2 = await FileSystemStorageClient().open_request_queue_client( + name='persistence_test', + configuration=configuration, + ) + + # Verify state was recovered + assert client2.metadata.total_request_count == 2 + assert client2.metadata.handled_request_count == 1 + assert 
client2.metadata.pending_request_count == 1 + + # Should be able to fetch the remaining request + remaining_request = await client2.fetch_next_request() + assert remaining_request is not None + assert remaining_request.url == 'https://example.com/2' + + # Clean up + await client2.drop() + + +async def test_metadata_updates(rq_client: FileSystemRequestQueueClient) -> None: + """Test that metadata timestamps are updated correctly after operations.""" + # Record initial timestamps + initial_created = rq_client.metadata.created_at + initial_accessed = rq_client.metadata.accessed_at + initial_modified = rq_client.metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates accessed_at + await rq_client.is_empty() + + # Verify timestamps + assert rq_client.metadata.created_at == initial_created + assert rq_client.metadata.accessed_at > initial_accessed + assert rq_client.metadata.modified_at == initial_modified + + accessed_after_get = rq_client.metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates modified_at + await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) + + # Verify timestamps again + assert rq_client.metadata.created_at == initial_created + assert rq_client.metadata.modified_at > initial_modified + assert rq_client.metadata.accessed_at > accessed_after_get diff --git a/tests/unit/storage_clients/_memory/test_creation_management.py b/tests/unit/storage_clients/_memory/test_creation_management.py deleted file mode 100644 index 88a5e9e283..0000000000 --- a/tests/unit/storage_clients/_memory/test_creation_management.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path -from unittest.mock import AsyncMock, patch - -import pytest - -from crawlee._consts import METADATA_FILENAME -from crawlee.storage_clients._memory._creation_management import persist_metadata_if_enabled - - -async def test_persist_metadata_skips_when_disabled(tmp_path: Path) -> None: - await persist_metadata_if_enabled(data={'key': 'value'}, entity_directory=str(tmp_path), write_metadata=False) - assert not list(tmp_path.iterdir()) # The directory should be empty since write_metadata is False - - -async def test_persist_metadata_creates_files_and_directories_when_enabled(tmp_path: Path) -> None: - data = {'key': 'value'} - entity_directory = Path(tmp_path, 'new_dir') - await persist_metadata_if_enabled(data=data, entity_directory=str(entity_directory), write_metadata=True) - assert entity_directory.exists() is True # Check if directory was created - assert (entity_directory / METADATA_FILENAME).is_file() # Check if file was created - - -async def test_persist_metadata_correctly_writes_data(tmp_path: Path) -> None: - data = {'key': 'value'} - entity_directory = Path(tmp_path, 'data_dir') - await persist_metadata_if_enabled(data=data, entity_directory=str(entity_directory), write_metadata=True) - metadata_path = entity_directory / METADATA_FILENAME - with open(metadata_path) as f: # noqa: ASYNC230 - content = f.read() - assert json.loads(content) == data # Check if correct data was written - - -async def test_persist_metadata_rewrites_data_with_error(tmp_path: Path) -> None: - init_data = {'key': 'very_long_value'} - update_data = {'key': 'short_value'} - error_data = {'key': 'error'} - - entity_directory = Path(tmp_path, 'data_dir') - metadata_path = entity_directory / 
METADATA_FILENAME - - # write metadata with init_data - await persist_metadata_if_enabled(data=init_data, entity_directory=str(entity_directory), write_metadata=True) - - # rewrite metadata with new_data - await persist_metadata_if_enabled(data=update_data, entity_directory=str(entity_directory), write_metadata=True) - with open(metadata_path) as f: # noqa: ASYNC230 - content = f.read() - assert json.loads(content) == update_data # Check if correct data was rewritten - - # raise interrupt between opening a file and writing - module_for_patch = 'crawlee.storage_clients._memory._creation_management.json_dumps' - with patch(module_for_patch, AsyncMock(side_effect=KeyboardInterrupt())), pytest.raises(KeyboardInterrupt): - await persist_metadata_if_enabled(data=error_data, entity_directory=str(entity_directory), write_metadata=True) - with open(metadata_path) as f: # noqa: ASYNC230 - content = f.read() - assert content == '' # The file is empty after an error diff --git a/tests/unit/storage_clients/_memory/test_dataset_client.py b/tests/unit/storage_clients/_memory/test_dataset_client.py deleted file mode 100644 index 472d11a8b3..0000000000 --- a/tests/unit/storage_clients/_memory/test_dataset_client.py +++ /dev/null @@ -1,148 +0,0 @@ -from __future__ import annotations - -import asyncio -from pathlib import Path -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - from crawlee.storage_clients._memory import DatasetClient - - -@pytest.fixture -async def dataset_client(memory_storage_client: MemoryStorageClient) -> DatasetClient: - datasets_client = memory_storage_client.datasets() - dataset_info = await datasets_client.get_or_create(name='test') - return memory_storage_client.dataset(dataset_info.id) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - dataset_client = memory_storage_client.dataset(id='nonexistent-id') - assert await dataset_client.get() is None - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.update(name='test-update') - - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.list_items() - - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.push_items([{'abc': 123}]) - await dataset_client.delete() - - -async def test_not_implemented(dataset_client: DatasetClient) -> None: - with pytest.raises(NotImplementedError, match='This method is not supported in memory storage.'): - await dataset_client.stream_items() - with pytest.raises(NotImplementedError, match='This method is not supported in memory storage.'): - await dataset_client.get_items_as_bytes() - - -async def test_get(dataset_client: DatasetClient) -> None: - await asyncio.sleep(0.1) - info = await dataset_client.get() - assert info is not None - assert info.id == dataset_client.id - assert info.accessed_at != info.created_at - - -async def test_update(dataset_client: DatasetClient) -> None: - new_dataset_name = 'test-update' - await dataset_client.push_items({'abc': 123}) - - old_dataset_info = await dataset_client.get() - assert old_dataset_info is not None - old_dataset_directory = Path(dataset_client._memory_storage_client.datasets_directory, old_dataset_info.name or '') - new_dataset_directory = Path(dataset_client._memory_storage_client.datasets_directory, new_dataset_name) - assert (old_dataset_directory / 
'000000001.json').exists() is True - assert (new_dataset_directory / '000000001.json').exists() is False - - await asyncio.sleep(0.1) - updated_dataset_info = await dataset_client.update(name=new_dataset_name) - assert (old_dataset_directory / '000000001.json').exists() is False - assert (new_dataset_directory / '000000001.json').exists() is True - # Only modified_at and accessed_at should be different - assert old_dataset_info.created_at == updated_dataset_info.created_at - assert old_dataset_info.modified_at != updated_dataset_info.modified_at - assert old_dataset_info.accessed_at != updated_dataset_info.accessed_at - - # Should fail with the same name - with pytest.raises(ValueError, match='Dataset with name "test-update" already exists.'): - await dataset_client.update(name=new_dataset_name) - - -async def test_delete(dataset_client: DatasetClient) -> None: - await dataset_client.push_items({'abc': 123}) - dataset_info = await dataset_client.get() - assert dataset_info is not None - dataset_directory = Path(dataset_client._memory_storage_client.datasets_directory, dataset_info.name or '') - assert (dataset_directory / '000000001.json').exists() is True - await dataset_client.delete() - assert (dataset_directory / '000000001.json').exists() is False - # Does not crash when called again - await dataset_client.delete() - - -async def test_push_items(dataset_client: DatasetClient) -> None: - await dataset_client.push_items('{"test": "JSON from a string"}') - await dataset_client.push_items({'abc': {'def': {'ghi': '123'}}}) - await dataset_client.push_items(['{"test-json-parse": "JSON from a string"}' for _ in range(10)]) - await dataset_client.push_items([{'test-dict': i} for i in range(10)]) - - list_page = await dataset_client.list_items() - assert list_page.items[0]['test'] == 'JSON from a string' - assert list_page.items[1]['abc']['def']['ghi'] == '123' - assert list_page.items[11]['test-json-parse'] == 'JSON from a string' - assert list_page.items[21]['test-dict'] == 9 - assert list_page.count == 22 - - -async def test_list_items(dataset_client: DatasetClient) -> None: - item_count = 100 - used_offset = 10 - used_limit = 50 - await dataset_client.push_items([{'id': i} for i in range(item_count)]) - # Test without any parameters - list_default = await dataset_client.list_items() - assert list_default.count == item_count - assert list_default.offset == 0 - assert list_default.items[0]['id'] == 0 - assert list_default.desc is False - # Test offset - list_offset_10 = await dataset_client.list_items(offset=used_offset) - assert list_offset_10.count == item_count - used_offset - assert list_offset_10.offset == used_offset - assert list_offset_10.total == item_count - assert list_offset_10.items[0]['id'] == used_offset - # Test limit - list_limit_50 = await dataset_client.list_items(limit=used_limit) - assert list_limit_50.count == used_limit - assert list_limit_50.limit == used_limit - assert list_limit_50.total == item_count - # Test desc - list_desc_true = await dataset_client.list_items(desc=True) - assert list_desc_true.items[0]['id'] == 99 - assert list_desc_true.desc is True - - -async def test_iterate_items(dataset_client: DatasetClient) -> None: - item_count = 100 - await dataset_client.push_items([{'id': i} for i in range(item_count)]) - actual_items = [] - async for item in dataset_client.iterate_items(): - assert 'id' in item - actual_items.append(item) - assert len(actual_items) == item_count - assert actual_items[0]['id'] == 0 - assert actual_items[99]['id'] == 99 - - -async 
def test_reuse_dataset(dataset_client: DatasetClient, memory_storage_client: MemoryStorageClient) -> None: - item_count = 10 - await dataset_client.push_items([{'id': i} for i in range(item_count)]) - - memory_storage_client.datasets_handled = [] # purge datasets loaded to test create_dataset_from_directory - datasets_client = memory_storage_client.datasets() - dataset_info = await datasets_client.get_or_create(name='test') - assert dataset_info.item_count == item_count diff --git a/tests/unit/storage_clients/_memory/test_dataset_collection_client.py b/tests/unit/storage_clients/_memory/test_dataset_collection_client.py deleted file mode 100644 index d71b7e8f68..0000000000 --- a/tests/unit/storage_clients/_memory/test_dataset_collection_client.py +++ /dev/null @@ -1,45 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - from crawlee.storage_clients._memory import DatasetCollectionClient - - -@pytest.fixture -def datasets_client(memory_storage_client: MemoryStorageClient) -> DatasetCollectionClient: - return memory_storage_client.datasets() - - -async def test_get_or_create(datasets_client: DatasetCollectionClient) -> None: - dataset_name = 'test' - # A new dataset gets created - dataset_info = await datasets_client.get_or_create(name=dataset_name) - assert dataset_info.name == dataset_name - - # Another get_or_create call returns the same dataset - dataset_info_existing = await datasets_client.get_or_create(name=dataset_name) - assert dataset_info.id == dataset_info_existing.id - assert dataset_info.name == dataset_info_existing.name - assert dataset_info.created_at == dataset_info_existing.created_at - - -async def test_list(datasets_client: DatasetCollectionClient) -> None: - dataset_list_1 = await datasets_client.list() - assert dataset_list_1.count == 0 - - dataset_info = await datasets_client.get_or_create(name='dataset') - dataset_list_2 = await datasets_client.list() - - assert dataset_list_2.count == 1 - assert dataset_list_2.items[0].name == dataset_info.name - - # Test sorting behavior - newer_dataset_info = await datasets_client.get_or_create(name='newer-dataset') - dataset_list_sorting = await datasets_client.list() - assert dataset_list_sorting.count == 2 - assert dataset_list_sorting.items[0].name == dataset_info.name - assert dataset_list_sorting.items[1].name == newer_dataset_info.name diff --git a/tests/unit/storage_clients/_memory/test_key_value_store_client.py b/tests/unit/storage_clients/_memory/test_key_value_store_client.py deleted file mode 100644 index 26d1f8f974..0000000000 --- a/tests/unit/storage_clients/_memory/test_key_value_store_client.py +++ /dev/null @@ -1,443 +0,0 @@ -from __future__ import annotations - -import asyncio -import base64 -import json -from datetime import datetime, timezone -from pathlib import Path -from typing import TYPE_CHECKING - -import pytest - -from crawlee._consts import METADATA_FILENAME -from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.data_processing import maybe_parse_body -from crawlee._utils.file import json_dumps -from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata - -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - from crawlee.storage_clients._memory import KeyValueStoreClient - -TINY_PNG = base64.b64decode( - 
s='iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVQYV2NgYAAAAAMAAWgmWQ0AAAAASUVORK5CYII=', -) -TINY_BYTES = b'\x12\x34\x56\x78\x90\xab\xcd\xef' -TINY_DATA = {'a': 'b'} -TINY_TEXT = 'abcd' - - -@pytest.fixture -async def key_value_store_client(memory_storage_client: MemoryStorageClient) -> KeyValueStoreClient: - key_value_stores_client = memory_storage_client.key_value_stores() - kvs_info = await key_value_stores_client.get_or_create(name='test') - return memory_storage_client.key_value_store(kvs_info.id) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - kvs_client = memory_storage_client.key_value_store(id='nonexistent-id') - assert await kvs_client.get() is None - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.update(name='test-update') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.list_keys() - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.set_record('test', {'abc': 123}) - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.get_record('test') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.get_record_as_bytes('test') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.delete_record('test') - - await kvs_client.delete() - - -async def test_not_implemented(key_value_store_client: KeyValueStoreClient) -> None: - with pytest.raises(NotImplementedError, match='This method is not supported in memory storage.'): - await key_value_store_client.stream_record('test') - - -async def test_get(key_value_store_client: KeyValueStoreClient) -> None: - await asyncio.sleep(0.1) - info = await key_value_store_client.get() - assert info is not None - assert info.id == key_value_store_client.id - assert info.accessed_at != info.created_at - - -async def test_update(key_value_store_client: KeyValueStoreClient) -> None: - new_kvs_name = 'test-update' - await key_value_store_client.set_record('test', {'abc': 123}) - old_kvs_info = await key_value_store_client.get() - assert old_kvs_info is not None - old_kvs_directory = Path( - key_value_store_client._memory_storage_client.key_value_stores_directory, old_kvs_info.name or '' - ) - new_kvs_directory = Path(key_value_store_client._memory_storage_client.key_value_stores_directory, new_kvs_name) - assert (old_kvs_directory / 'test.json').exists() is True - assert (new_kvs_directory / 'test.json').exists() is False - - await asyncio.sleep(0.1) - updated_kvs_info = await key_value_store_client.update(name=new_kvs_name) - assert (old_kvs_directory / 'test.json').exists() is False - assert (new_kvs_directory / 'test.json').exists() is True - # Only modified_at and accessed_at should be different - assert old_kvs_info.created_at == updated_kvs_info.created_at - assert old_kvs_info.modified_at != updated_kvs_info.modified_at - assert old_kvs_info.accessed_at != updated_kvs_info.accessed_at - - # Should fail with the same name - with pytest.raises(ValueError, match='Key-value store with name "test-update" already exists.'): - await key_value_store_client.update(name=new_kvs_name) - - -async def test_delete(key_value_store_client: KeyValueStoreClient) -> None: - await key_value_store_client.set_record('test', 
{'abc': 123}) - kvs_info = await key_value_store_client.get() - assert kvs_info is not None - kvs_directory = Path(key_value_store_client._memory_storage_client.key_value_stores_directory, kvs_info.name or '') - assert (kvs_directory / 'test.json').exists() is True - await key_value_store_client.delete() - assert (kvs_directory / 'test.json').exists() is False - # Does not crash when called again - await key_value_store_client.delete() - - -async def test_list_keys_empty(key_value_store_client: KeyValueStoreClient) -> None: - keys = await key_value_store_client.list_keys() - assert len(keys.items) == 0 - assert keys.count == 0 - assert keys.is_truncated is False - - -async def test_list_keys(key_value_store_client: KeyValueStoreClient) -> None: - record_count = 4 - used_limit = 2 - used_exclusive_start_key = 'a' - await key_value_store_client.set_record('b', 'test') - await key_value_store_client.set_record('a', 'test') - await key_value_store_client.set_record('d', 'test') - await key_value_store_client.set_record('c', 'test') - - # Default settings - keys = await key_value_store_client.list_keys() - assert keys.items[0].key == 'a' - assert keys.items[3].key == 'd' - assert keys.count == record_count - assert keys.is_truncated is False - # Test limit - keys_limit_2 = await key_value_store_client.list_keys(limit=used_limit) - assert keys_limit_2.count == record_count - assert keys_limit_2.limit == used_limit - assert keys_limit_2.items[1].key == 'b' - # Test exclusive start key - keys_exclusive_start = await key_value_store_client.list_keys(exclusive_start_key=used_exclusive_start_key, limit=2) - assert keys_exclusive_start.exclusive_start_key == used_exclusive_start_key - assert keys_exclusive_start.is_truncated is True - assert keys_exclusive_start.next_exclusive_start_key == 'c' - assert keys_exclusive_start.items[0].key == 'b' - assert keys_exclusive_start.items[-1].key == keys_exclusive_start.next_exclusive_start_key - - -async def test_get_and_set_record(tmp_path: Path, key_value_store_client: KeyValueStoreClient) -> None: - # Test setting dict record - dict_record_key = 'test-dict' - await key_value_store_client.set_record(dict_record_key, {'test': 123}) - dict_record_info = await key_value_store_client.get_record(dict_record_key) - assert dict_record_info is not None - assert 'application/json' in str(dict_record_info.content_type) - assert dict_record_info.value['test'] == 123 - - # Test setting str record - str_record_key = 'test-str' - await key_value_store_client.set_record(str_record_key, 'test') - str_record_info = await key_value_store_client.get_record(str_record_key) - assert str_record_info is not None - assert 'text/plain' in str(str_record_info.content_type) - assert str_record_info.value == 'test' - - # Test setting explicit json record but use str as value, i.e. 
json dumps is skipped - explicit_json_key = 'test-json' - await key_value_store_client.set_record(explicit_json_key, '{"test": "explicit string"}', 'application/json') - bytes_record_info = await key_value_store_client.get_record(explicit_json_key) - assert bytes_record_info is not None - assert 'application/json' in str(bytes_record_info.content_type) - assert bytes_record_info.value['test'] == 'explicit string' - - # Test using bytes - bytes_key = 'test-json' - bytes_value = b'testing bytes set_record' - await key_value_store_client.set_record(bytes_key, bytes_value, 'unknown') - bytes_record_info = await key_value_store_client.get_record(bytes_key) - assert bytes_record_info is not None - assert 'unknown' in str(bytes_record_info.content_type) - assert bytes_record_info.value == bytes_value - assert bytes_record_info.value.decode('utf-8') == bytes_value.decode('utf-8') - - # Test using file descriptor - with open(tmp_path / 'test.json', 'w+', encoding='utf-8') as f: # noqa: ASYNC230 - f.write('Test') - with pytest.raises(NotImplementedError, match='File-like values are not supported in local memory storage'): - await key_value_store_client.set_record('file', f) - - -async def test_get_record_as_bytes(key_value_store_client: KeyValueStoreClient) -> None: - record_key = 'test' - record_value = 'testing' - await key_value_store_client.set_record(record_key, record_value) - record_info = await key_value_store_client.get_record_as_bytes(record_key) - assert record_info is not None - assert record_info.value == record_value.encode('utf-8') - - -async def test_delete_record(key_value_store_client: KeyValueStoreClient) -> None: - record_key = 'test' - await key_value_store_client.set_record(record_key, 'test') - await key_value_store_client.delete_record(record_key) - # Does not crash when called again - await key_value_store_client.delete_record(record_key) - - -@pytest.mark.parametrize( - ('input_data', 'expected_output'), - [ - ( - {'key': 'image', 'value': TINY_PNG, 'contentType': None}, - {'filename': 'image', 'key': 'image', 'contentType': 'application/octet-stream'}, - ), - ( - {'key': 'image', 'value': TINY_PNG, 'contentType': 'image/png'}, - {'filename': 'image.png', 'key': 'image', 'contentType': 'image/png'}, - ), - ( - {'key': 'image.png', 'value': TINY_PNG, 'contentType': None}, - {'filename': 'image.png', 'key': 'image.png', 'contentType': 'application/octet-stream'}, - ), - ( - {'key': 'image.png', 'value': TINY_PNG, 'contentType': 'image/png'}, - {'filename': 'image.png', 'key': 'image.png', 'contentType': 'image/png'}, - ), - ( - {'key': 'data', 'value': TINY_DATA, 'contentType': None}, - {'filename': 'data.json', 'key': 'data', 'contentType': 'application/json'}, - ), - ( - {'key': 'data', 'value': TINY_DATA, 'contentType': 'application/json'}, - {'filename': 'data.json', 'key': 'data', 'contentType': 'application/json'}, - ), - ( - {'key': 'data.json', 'value': TINY_DATA, 'contentType': None}, - {'filename': 'data.json', 'key': 'data.json', 'contentType': 'application/json'}, - ), - ( - {'key': 'data.json', 'value': TINY_DATA, 'contentType': 'application/json'}, - {'filename': 'data.json', 'key': 'data.json', 'contentType': 'application/json'}, - ), - ( - {'key': 'text', 'value': TINY_TEXT, 'contentType': None}, - {'filename': 'text.txt', 'key': 'text', 'contentType': 'text/plain'}, - ), - ( - {'key': 'text', 'value': TINY_TEXT, 'contentType': 'text/plain'}, - {'filename': 'text.txt', 'key': 'text', 'contentType': 'text/plain'}, - ), - ( - {'key': 'text.txt', 'value': 
TINY_TEXT, 'contentType': None}, - {'filename': 'text.txt', 'key': 'text.txt', 'contentType': 'text/plain'}, - ), - ( - {'key': 'text.txt', 'value': TINY_TEXT, 'contentType': 'text/plain'}, - {'filename': 'text.txt', 'key': 'text.txt', 'contentType': 'text/plain'}, - ), - ], -) -async def test_writes_correct_metadata( - memory_storage_client: MemoryStorageClient, - input_data: dict, - expected_output: dict, -) -> None: - key_value_store_name = crypto_random_object_id() - - # Get KVS client - kvs_info = await memory_storage_client.key_value_stores().get_or_create(name=key_value_store_name) - kvs_client = memory_storage_client.key_value_store(kvs_info.id) - - # Write the test input item to the store - await kvs_client.set_record( - key=input_data['key'], - value=input_data['value'], - content_type=input_data['contentType'], - ) - - # Check that everything was written correctly, both the data and metadata - storage_path = Path(memory_storage_client.key_value_stores_directory, key_value_store_name) - item_path = Path(storage_path, expected_output['filename']) - item_metadata_path = storage_path / f'{expected_output["filename"]}.__metadata__.json' - - assert item_path.exists() - assert item_metadata_path.exists() - - # Test the actual value of the item - with open(item_path, 'rb') as item_file: # noqa: ASYNC230 - actual_value = maybe_parse_body(item_file.read(), expected_output['contentType']) - assert actual_value == input_data['value'] - - # Test the actual metadata of the item - with open(item_metadata_path, encoding='utf-8') as metadata_file: # noqa: ASYNC230 - json_content = json.load(metadata_file) - metadata = KeyValueStoreRecordMetadata(**json_content) - assert metadata.key == expected_output['key'] - assert expected_output['contentType'] in metadata.content_type - - -@pytest.mark.parametrize( - ('input_data', 'expected_output'), - [ - ( - {'filename': 'image', 'value': TINY_PNG, 'metadata': None}, - {'key': 'image', 'filename': 'image', 'contentType': 'application/octet-stream'}, - ), - ( - {'filename': 'image.png', 'value': TINY_PNG, 'metadata': None}, - {'key': 'image', 'filename': 'image.png', 'contentType': 'image/png'}, - ), - ( - { - 'filename': 'image', - 'value': TINY_PNG, - 'metadata': {'key': 'image', 'contentType': 'application/octet-stream'}, - }, - {'key': 'image', 'contentType': 'application/octet-stream'}, - ), - ( - {'filename': 'image', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'image/png'}}, - {'key': 'image', 'filename': 'image', 'contentType': 'image/png'}, - ), - ( - { - 'filename': 'image.png', - 'value': TINY_PNG, - 'metadata': {'key': 'image.png', 'contentType': 'application/octet-stream'}, - }, - {'key': 'image.png', 'contentType': 'application/octet-stream'}, - ), - ( - {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image.png', 'contentType': 'image/png'}}, - {'key': 'image.png', 'contentType': 'image/png'}, - ), - ( - {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'image/png'}}, - {'key': 'image', 'contentType': 'image/png'}, - ), - ( - {'filename': 'input', 'value': TINY_BYTES, 'metadata': None}, - {'key': 'input', 'contentType': 'application/octet-stream'}, - ), - ( - {'filename': 'input.json', 'value': TINY_DATA, 'metadata': None}, - {'key': 'input', 'contentType': 'application/json'}, - ), - ( - {'filename': 'input.txt', 'value': TINY_TEXT, 'metadata': None}, - {'key': 'input', 'contentType': 'text/plain'}, - ), - ( - {'filename': 'input.bin', 'value': TINY_BYTES, 
'metadata': None}, - {'key': 'input', 'contentType': 'application/octet-stream'}, - ), - ( - { - 'filename': 'input', - 'value': TINY_BYTES, - 'metadata': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - {'key': 'input', 'contentType': 'application/octet-stream'}, - ), - ( - { - 'filename': 'input.json', - 'value': TINY_DATA, - 'metadata': {'key': 'input', 'contentType': 'application/json'}, - }, - {'key': 'input', 'contentType': 'application/json'}, - ), - ( - {'filename': 'input.txt', 'value': TINY_TEXT, 'metadata': {'key': 'input', 'contentType': 'text/plain'}}, - {'key': 'input', 'contentType': 'text/plain'}, - ), - ( - { - 'filename': 'input.bin', - 'value': TINY_BYTES, - 'metadata': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - {'key': 'input', 'contentType': 'application/octet-stream'}, - ), - ], -) -async def test_reads_correct_metadata( - memory_storage_client: MemoryStorageClient, - input_data: dict, - expected_output: dict, -) -> None: - key_value_store_name = crypto_random_object_id() - - # Ensure the directory for the store exists - storage_path = Path(memory_storage_client.key_value_stores_directory, key_value_store_name) - storage_path.mkdir(exist_ok=True, parents=True) - - store_metadata = KeyValueStoreMetadata( - id=crypto_random_object_id(), - name='', - accessed_at=datetime.now(timezone.utc), - created_at=datetime.now(timezone.utc), - modified_at=datetime.now(timezone.utc), - user_id='1', - ) - - # Write the store metadata to disk - storage_metadata_path = storage_path / METADATA_FILENAME - with open(storage_metadata_path, mode='wb') as f: # noqa: ASYNC230 - f.write(store_metadata.model_dump_json().encode('utf-8')) - - # Write the test input item to the disk - item_path = storage_path / input_data['filename'] - with open(item_path, 'wb') as item_file: # noqa: ASYNC230 - if isinstance(input_data['value'], bytes): - item_file.write(input_data['value']) - elif isinstance(input_data['value'], str): - item_file.write(input_data['value'].encode('utf-8')) - else: - s = await json_dumps(input_data['value']) - item_file.write(s.encode('utf-8')) - - # Optionally write the metadata to disk if there is some - if input_data['metadata'] is not None: - storage_metadata_path = storage_path / f'{input_data["filename"]}.__metadata__.json' - with open(storage_metadata_path, 'w', encoding='utf-8') as metadata_file: # noqa: ASYNC230 - s = await json_dumps( - { - 'key': input_data['metadata']['key'], - 'contentType': input_data['metadata']['contentType'], - } - ) - metadata_file.write(s) - - # Create the key-value store client to load the items from disk - store_details = await memory_storage_client.key_value_stores().get_or_create(name=key_value_store_name) - key_value_store_client = memory_storage_client.key_value_store(store_details.id) - - # Read the item from the store and check if it is as expected - actual_record = await key_value_store_client.get_record(expected_output['key']) - assert actual_record is not None - - assert actual_record.key == expected_output['key'] - assert actual_record.content_type == expected_output['contentType'] - assert actual_record.value == input_data['value'] diff --git a/tests/unit/storage_clients/_memory/test_key_value_store_collection_client.py b/tests/unit/storage_clients/_memory/test_key_value_store_collection_client.py deleted file mode 100644 index 41b289eb06..0000000000 --- a/tests/unit/storage_clients/_memory/test_key_value_store_collection_client.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import 
annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - from crawlee.storage_clients._memory import KeyValueStoreCollectionClient - - -@pytest.fixture -def key_value_stores_client(memory_storage_client: MemoryStorageClient) -> KeyValueStoreCollectionClient: - return memory_storage_client.key_value_stores() - - -async def test_get_or_create(key_value_stores_client: KeyValueStoreCollectionClient) -> None: - kvs_name = 'test' - # A new kvs gets created - kvs_info = await key_value_stores_client.get_or_create(name=kvs_name) - assert kvs_info.name == kvs_name - - # Another get_or_create call returns the same kvs - kvs_info_existing = await key_value_stores_client.get_or_create(name=kvs_name) - assert kvs_info.id == kvs_info_existing.id - assert kvs_info.name == kvs_info_existing.name - assert kvs_info.created_at == kvs_info_existing.created_at - - -async def test_list(key_value_stores_client: KeyValueStoreCollectionClient) -> None: - assert (await key_value_stores_client.list()).count == 0 - kvs_info = await key_value_stores_client.get_or_create(name='kvs') - kvs_list = await key_value_stores_client.list() - assert kvs_list.count == 1 - assert kvs_list.items[0].name == kvs_info.name - - # Test sorting behavior - newer_kvs_info = await key_value_stores_client.get_or_create(name='newer-kvs') - kvs_list_sorting = await key_value_stores_client.list() - assert kvs_list_sorting.count == 2 - assert kvs_list_sorting.items[0].name == kvs_info.name - assert kvs_list_sorting.items[1].name == newer_kvs_info.name diff --git a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py new file mode 100644 index 0000000000..c25074e5c0 --- /dev/null +++ b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py @@ -0,0 +1,279 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime +from typing import TYPE_CHECKING + +import pytest + +from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients._memory import MemoryDatasetClient +from crawlee.storage_clients.models import DatasetItemsListPage + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +@pytest.fixture +async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]: + """Fixture that provides a fresh memory dataset client for each test.""" + client = await MemoryStorageClient().open_dataset_client(name='test_dataset') + yield client + await client.drop() + + +async def test_open_creates_new_dataset() -> None: + """Test that open() creates a new dataset with proper metadata and adds it to the cache.""" + client = await MemoryStorageClient().open_dataset_client(name='new_dataset') + + # Verify correct client type and properties + assert isinstance(client, MemoryDatasetClient) + assert client.metadata.id is not None + assert client.metadata.name == 'new_dataset' + assert client.metadata.item_count == 0 + assert isinstance(client.metadata.created_at, datetime) + assert isinstance(client.metadata.accessed_at, datetime) + assert isinstance(client.metadata.modified_at, datetime) + + +async def test_dataset_client_purge_on_start() -> None: + """Test that purge_on_start=True clears existing data in the dataset.""" + configuration = Configuration(purge_on_start=True) + + # Create dataset and add data + dataset_client1 = await 
MemoryStorageClient().open_dataset_client( + name='test_purge_dataset', + configuration=configuration, + ) + await dataset_client1.push_data({'item': 'initial data'}) + + # Verify data was added + items = await dataset_client1.get_data() + assert len(items.items) == 1 + + # Reopen + dataset_client2 = await MemoryStorageClient().open_dataset_client( + name='test_purge_dataset', + configuration=configuration, + ) + + # Verify data was purged + items = await dataset_client2.get_data() + assert len(items.items) == 0 + + +async def test_open_with_id_and_name() -> None: + """Test that open() can be used with both id and name parameters.""" + client = await MemoryStorageClient().open_dataset_client( + id='some-id', + name='some-name', + ) + assert client.metadata.id == 'some-id' + assert client.metadata.name == 'some-name' + + +async def test_push_data_single_item(dataset_client: MemoryDatasetClient) -> None: + """Test pushing a single item to the dataset and verifying it was stored correctly.""" + item = {'key': 'value', 'number': 42} + await dataset_client.push_data(item) + + # Verify item count was updated + assert dataset_client.metadata.item_count == 1 + + # Verify item was stored + result = await dataset_client.get_data() + assert result.count == 1 + assert result.items[0] == item + + +async def test_push_data_multiple_items(dataset_client: MemoryDatasetClient) -> None: + """Test pushing multiple items to the dataset and verifying they were stored correctly.""" + items = [ + {'id': 1, 'name': 'Item 1'}, + {'id': 2, 'name': 'Item 2'}, + {'id': 3, 'name': 'Item 3'}, + ] + await dataset_client.push_data(items) + + # Verify item count was updated + assert dataset_client.metadata.item_count == 3 + + # Verify items were stored + result = await dataset_client.get_data() + assert result.count == 3 + assert result.items == items + + +async def test_get_data_empty_dataset(dataset_client: MemoryDatasetClient) -> None: + """Test that getting data from an empty dataset returns empty results with correct metadata.""" + result = await dataset_client.get_data() + + assert isinstance(result, DatasetItemsListPage) + assert result.count == 0 + assert result.total == 0 + assert result.items == [] + + +async def test_get_data_with_items(dataset_client: MemoryDatasetClient) -> None: + """Test that all items pushed to the dataset can be retrieved with correct metadata.""" + # Add some items + items = [ + {'id': 1, 'name': 'Item 1'}, + {'id': 2, 'name': 'Item 2'}, + {'id': 3, 'name': 'Item 3'}, + ] + await dataset_client.push_data(items) + + # Get all items + result = await dataset_client.get_data() + + assert result.count == 3 + assert result.total == 3 + assert len(result.items) == 3 + assert result.items[0]['id'] == 1 + assert result.items[1]['id'] == 2 + assert result.items[2]['id'] == 3 + + +async def test_get_data_with_pagination(dataset_client: MemoryDatasetClient) -> None: + """Test that offset and limit parameters work correctly for dataset pagination.""" + # Add some items + items = [{'id': i} for i in range(1, 11)] # 10 items + await dataset_client.push_data(items) + + # Test offset + result = await dataset_client.get_data(offset=3) + assert result.count == 7 + assert result.offset == 3 + assert result.items[0]['id'] == 4 + + # Test limit + result = await dataset_client.get_data(limit=5) + assert result.count == 5 + assert result.limit == 5 + assert result.items[-1]['id'] == 5 + + # Test both offset and limit + result = await dataset_client.get_data(offset=2, limit=3) + assert result.count == 3 + assert 
result.offset == 2 + assert result.limit == 3 + assert result.items[0]['id'] == 3 + assert result.items[-1]['id'] == 5 + + +async def test_get_data_descending_order(dataset_client: MemoryDatasetClient) -> None: + """Test that the desc parameter correctly reverses the order of returned items.""" + # Add some items + items = [{'id': i} for i in range(1, 6)] # 5 items + await dataset_client.push_data(items) + + # Get items in descending order + result = await dataset_client.get_data(desc=True) + + assert result.desc is True + assert result.items[0]['id'] == 5 + assert result.items[-1]['id'] == 1 + + +async def test_get_data_skip_empty(dataset_client: MemoryDatasetClient) -> None: + """Test that the skip_empty parameter correctly filters out empty items.""" + # Add some items including an empty one + items = [ + {'id': 1, 'name': 'Item 1'}, + {}, # Empty item + {'id': 3, 'name': 'Item 3'}, + ] + await dataset_client.push_data(items) + + # Get all items + result = await dataset_client.get_data() + assert result.count == 3 + + # Get non-empty items + result = await dataset_client.get_data(skip_empty=True) + assert result.count == 2 + assert all(item != {} for item in result.items) + + +async def test_iterate(dataset_client: MemoryDatasetClient) -> None: + """Test that iterate_items yields each item in the dataset in the correct order.""" + # Add some items + items = [{'id': i} for i in range(1, 6)] # 5 items + await dataset_client.push_data(items) + + # Iterate over all items + collected_items = [item async for item in dataset_client.iterate_items()] + + assert len(collected_items) == 5 + assert collected_items[0]['id'] == 1 + assert collected_items[-1]['id'] == 5 + + +async def test_iterate_with_options(dataset_client: MemoryDatasetClient) -> None: + """Test that iterate_items respects offset, limit, and desc parameters.""" + # Add some items + items = [{'id': i} for i in range(1, 11)] # 10 items + await dataset_client.push_data(items) + + # Test with offset and limit + collected_items = [item async for item in dataset_client.iterate_items(offset=3, limit=3)] + + assert len(collected_items) == 3 + assert collected_items[0]['id'] == 4 + assert collected_items[-1]['id'] == 6 + + # Test with descending order + collected_items = [] + async for item in dataset_client.iterate_items(desc=True, limit=3): + collected_items.append(item) + + assert len(collected_items) == 3 + assert collected_items[0]['id'] == 10 + assert collected_items[-1]['id'] == 8 + + +async def test_drop(dataset_client: MemoryDatasetClient) -> None: + """Test that drop removes the dataset from cache and resets its state.""" + await dataset_client.push_data({'test': 'data'}) + + # Drop the dataset + await dataset_client.drop() + + # Verify the dataset is empty + assert dataset_client.metadata.item_count == 0 + result = await dataset_client.get_data() + assert result.count == 0 + + +async def test_metadata_updates(dataset_client: MemoryDatasetClient) -> None: + """Test that read/write operations properly update accessed_at and modified_at timestamps.""" + # Record initial timestamps + initial_created = dataset_client.metadata.created_at + initial_accessed = dataset_client.metadata.accessed_at + initial_modified = dataset_client.metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates accessed_at + await dataset_client.get_data() + + # Verify timestamps + assert dataset_client.metadata.created_at == initial_created + assert 
dataset_client.metadata.accessed_at > initial_accessed + assert dataset_client.metadata.modified_at == initial_modified + + accessed_after_get = dataset_client.metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates modified_at + await dataset_client.push_data({'new': 'item'}) + + # Verify timestamps again + assert dataset_client.metadata.created_at == initial_created + assert dataset_client.metadata.modified_at > initial_modified + assert dataset_client.metadata.accessed_at > accessed_after_get diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py new file mode 100644 index 0000000000..5d8789f6c3 --- /dev/null +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime +from typing import TYPE_CHECKING, Any + +import pytest + +from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients._memory import MemoryKeyValueStoreClient +from crawlee.storage_clients.models import KeyValueStoreRecordMetadata + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +@pytest.fixture +async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]: + """Fixture that provides a fresh memory key-value store client for each test.""" + client = await MemoryStorageClient().open_key_value_store_client(name='test_kvs') + yield client + await client.drop() + + +async def test_open_creates_new_kvs() -> None: + """Test that open() creates a new key-value store with proper metadata and adds it to the cache.""" + client = await MemoryStorageClient().open_key_value_store_client(name='new_kvs') + + # Verify correct client type and properties + assert isinstance(client, MemoryKeyValueStoreClient) + assert client.metadata.id is not None + assert client.metadata.name == 'new_kvs' + assert isinstance(client.metadata.created_at, datetime) + assert isinstance(client.metadata.accessed_at, datetime) + assert isinstance(client.metadata.modified_at, datetime) + + +async def test_kvs_client_purge_on_start() -> None: + """Test that purge_on_start=True clears existing data in the KVS.""" + configuration = Configuration(purge_on_start=True) + + # Create KVS and add data + kvs_client1 = await MemoryStorageClient().open_key_value_store_client( + name='test_purge_kvs', + configuration=configuration, + ) + await kvs_client1.set_value(key='test-key', value='initial value') + + # Verify value was set + record = await kvs_client1.get_value(key='test-key') + assert record is not None + assert record.value == 'initial value' + + # Reopen + kvs_client2 = await MemoryStorageClient().open_key_value_store_client( + name='test_purge_kvs', + configuration=configuration, + ) + + # Verify value was purged + record = await kvs_client2.get_value(key='test-key') + assert record is None + + +async def test_open_with_id_and_name() -> None: + """Test that open() can be used with both id and name parameters.""" + client = await MemoryStorageClient().open_key_value_store_client( + id='some-id', + name='some-name', + ) + assert client.metadata.id == 'some-id' + assert client.metadata.name == 'some-name' + + +@pytest.mark.parametrize( + ('key', 'value', 'expected_content_type'), + [ + pytest.param('string_key', 'string value', 'text/plain; charset=utf-8', id='string'), + 
pytest.param('dict_key', {'name': 'test', 'value': 42}, 'application/json; charset=utf-8', id='dictionary'), + pytest.param('list_key', [1, 2, 3], 'application/json; charset=utf-8', id='list'), + pytest.param('bytes_key', b'binary data', 'application/octet-stream', id='bytes'), + ], +) +async def test_set_get_value( + kvs_client: MemoryKeyValueStoreClient, + key: str, + value: Any, + expected_content_type: str, +) -> None: + """Test storing and retrieving different types of values with correct content types.""" + # Set value + await kvs_client.set_value(key=key, value=value) + + # Get and verify value + record = await kvs_client.get_value(key=key) + assert record is not None + assert record.key == key + assert record.value == value + assert record.content_type == expected_content_type + + +async def test_get_nonexistent_value(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that attempting to get a non-existent key returns None.""" + record = await kvs_client.get_value(key='nonexistent') + assert record is None + + +async def test_set_value_with_explicit_content_type(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that an explicitly provided content type overrides the automatically inferred one.""" + value = 'This could be XML' + content_type = 'application/xml' + + await kvs_client.set_value(key='xml_key', value=value, content_type=content_type) + + record = await kvs_client.get_value(key='xml_key') + assert record is not None + assert record.value == value + assert record.content_type == content_type + + +async def test_delete_value(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that a stored value can be deleted and is no longer retrievable after deletion.""" + # Set a value + await kvs_client.set_value(key='delete_me', value='to be deleted') + + # Verify it exists + record = await kvs_client.get_value(key='delete_me') + assert record is not None + + # Delete it + await kvs_client.delete_value(key='delete_me') + + # Verify it's gone + record = await kvs_client.get_value(key='delete_me') + assert record is None + + +async def test_delete_nonexistent_value(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that attempting to delete a non-existent key is a no-op and doesn't raise errors.""" + # Should not raise an error + await kvs_client.delete_value(key='nonexistent') + + +async def test_iterate_keys(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that all keys can be iterated over and are returned in sorted order with correct metadata.""" + # Set some values + items = { + 'a_key': 'value A', + 'b_key': 'value B', + 'c_key': 'value C', + 'd_key': 'value D', + } + + for key, value in items.items(): + await kvs_client.set_value(key=key, value=value) + + # Get all keys + metadata_list = [metadata async for metadata in kvs_client.iterate_keys()] + + # Verify keys are returned in sorted order + assert len(metadata_list) == 4 + assert [m.key for m in metadata_list] == sorted(items.keys()) + assert all(isinstance(m, KeyValueStoreRecordMetadata) for m in metadata_list) + + +async def test_iterate_keys_with_exclusive_start_key(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that exclusive_start_key parameter returns only keys after it alphabetically.""" + # Set some values + for key in ['a_key', 'b_key', 'c_key', 'd_key', 'e_key']: + await kvs_client.set_value(key=key, value=f'value for {key}') + + # Get keys starting after 'b_key' + metadata_list = [metadata async for metadata in kvs_client.iterate_keys(exclusive_start_key='b_key')] + + # 
Verify only keys after 'b_key' are returned + assert len(metadata_list) == 3 + assert [m.key for m in metadata_list] == ['c_key', 'd_key', 'e_key'] + + +async def test_iterate_keys_with_limit(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that the limit parameter returns only the specified number of keys.""" + # Set some values + for key in ['a_key', 'b_key', 'c_key', 'd_key', 'e_key']: + await kvs_client.set_value(key=key, value=f'value for {key}') + + # Get first 3 keys + metadata_list = [metadata async for metadata in kvs_client.iterate_keys(limit=3)] + + # Verify only the first 3 keys are returned + assert len(metadata_list) == 3 + assert [m.key for m in metadata_list] == ['a_key', 'b_key', 'c_key'] + + +async def test_drop(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that drop removes the store from cache and clears all data.""" + # Add some values to the store + await kvs_client.set_value(key='test', value='data') + + # Drop the store + await kvs_client.drop() + + # Verify the store is empty + record = await kvs_client.get_value(key='test') + assert record is None + + +async def test_get_public_url(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that get_public_url raises NotImplementedError for the memory implementation.""" + with pytest.raises(NotImplementedError): + await kvs_client.get_public_url(key='any-key') + + +async def test_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that read/write operations properly update accessed_at and modified_at timestamps.""" + # Record initial timestamps + initial_created = kvs_client.metadata.created_at + initial_accessed = kvs_client.metadata.accessed_at + initial_modified = kvs_client.metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates accessed_at + await kvs_client.get_value(key='nonexistent') + + # Verify timestamps + assert kvs_client.metadata.created_at == initial_created + assert kvs_client.metadata.accessed_at > initial_accessed + assert kvs_client.metadata.modified_at == initial_modified + + accessed_after_get = kvs_client.metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates modified_at and accessed_at + await kvs_client.set_value(key='new_key', value='new value') + + # Verify timestamps again + assert kvs_client.metadata.created_at == initial_created + assert kvs_client.metadata.modified_at > initial_modified + assert kvs_client.metadata.accessed_at > accessed_after_get diff --git a/tests/unit/storage_clients/_memory/test_memory_rq_client.py b/tests/unit/storage_clients/_memory/test_memory_rq_client.py new file mode 100644 index 0000000000..028c53ccd2 --- /dev/null +++ b/tests/unit/storage_clients/_memory/test_memory_rq_client.py @@ -0,0 +1,442 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime +from typing import TYPE_CHECKING + +import pytest + +from crawlee import Request +from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients._memory import MemoryRequestQueueClient + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +@pytest.fixture +async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]: + """Fixture that provides a fresh memory request queue client for each test.""" + client = await 
MemoryStorageClient().open_request_queue_client(name='test_rq') + yield client + await client.drop() + + +async def test_open_creates_new_rq() -> None: + """Test that open() creates a new request queue with proper metadata and adds it to the cache.""" + client = await MemoryStorageClient().open_request_queue_client(name='new_rq') + + # Verify correct client type and properties + assert isinstance(client, MemoryRequestQueueClient) + assert client.metadata.id is not None + assert client.metadata.name == 'new_rq' + assert isinstance(client.metadata.created_at, datetime) + assert isinstance(client.metadata.accessed_at, datetime) + assert isinstance(client.metadata.modified_at, datetime) + assert client.metadata.handled_request_count == 0 + assert client.metadata.pending_request_count == 0 + assert client.metadata.total_request_count == 0 + assert client.metadata.had_multiple_clients is False + + +async def test_rq_client_purge_on_start() -> None: + """Test that purge_on_start=True clears existing data in the RQ.""" + configuration = Configuration(purge_on_start=True) + + # Create RQ and add data + rq_client1 = await MemoryStorageClient().open_request_queue_client( + name='test_purge_rq', + configuration=configuration, + ) + request = Request.from_url(url='https://example.com/initial') + await rq_client1.add_batch_of_requests([request]) + + # Verify request was added + assert await rq_client1.is_empty() is False + + # Reopen + rq_client2 = await MemoryStorageClient().open_request_queue_client( + name='test_purge_rq', + configuration=configuration, + ) + + # Verify queue was purged + assert await rq_client2.is_empty() is True + + +async def test_open_with_id_and_name() -> None: + """Test that open() can be used with both id and name parameters.""" + client = await MemoryStorageClient().open_request_queue_client( + id='some-id', + name='some-name', + ) + assert client.metadata.id is not None # ID is always auto-generated + assert client.metadata.name == 'some-name' + + +async def test_add_batch_of_requests(rq_client: MemoryRequestQueueClient) -> None: + """Test adding a batch of requests to the queue.""" + requests = [ + Request.from_url(url='https://example.com/1'), + Request.from_url(url='https://example.com/2'), + Request.from_url(url='https://example.com/3'), + ] + + response = await rq_client.add_batch_of_requests(requests) + + # Verify correct response + assert len(response.processed_requests) == 3 + assert len(response.unprocessed_requests) == 0 + + # Verify each request was processed correctly + for i, req in enumerate(requests): + assert response.processed_requests[i].id == req.id + assert response.processed_requests[i].unique_key == req.unique_key + assert response.processed_requests[i].was_already_present is False + assert response.processed_requests[i].was_already_handled is False + + # Verify metadata was updated + assert rq_client.metadata.total_request_count == 3 + assert rq_client.metadata.pending_request_count == 3 + + +async def test_add_batch_of_requests_with_duplicates(rq_client: MemoryRequestQueueClient) -> None: + """Test adding requests with duplicate unique keys.""" + # Add initial requests + initial_requests = [ + Request.from_url(url='https://example.com/1', unique_key='key1'), + Request.from_url(url='https://example.com/2', unique_key='key2'), + ] + await rq_client.add_batch_of_requests(initial_requests) + + # Mark first request as handled + req1 = await rq_client.fetch_next_request() + assert req1 is not None + await rq_client.mark_request_as_handled(req1) + + # Add 
duplicate requests + duplicate_requests = [ + Request.from_url(url='https://example.com/1-dup', unique_key='key1'), # Same as first (handled) + Request.from_url(url='https://example.com/2-dup', unique_key='key2'), # Same as second (not handled) + Request.from_url(url='https://example.com/3', unique_key='key3'), # New request + ] + response = await rq_client.add_batch_of_requests(duplicate_requests) + + # Verify response + assert len(response.processed_requests) == 3 + + # First request should be marked as already handled + assert response.processed_requests[0].was_already_present is True + assert response.processed_requests[0].was_already_handled is True + + # Second request should be marked as already present but not handled + assert response.processed_requests[1].was_already_present is True + assert response.processed_requests[1].was_already_handled is False + + # Third request should be new + assert response.processed_requests[2].was_already_present is False + assert response.processed_requests[2].was_already_handled is False + + +async def test_add_batch_of_requests_to_forefront(rq_client: MemoryRequestQueueClient) -> None: + """Test adding requests to the forefront of the queue.""" + # Add initial requests + initial_requests = [ + Request.from_url(url='https://example.com/1'), + Request.from_url(url='https://example.com/2'), + ] + await rq_client.add_batch_of_requests(initial_requests) + + # Add new requests to forefront + forefront_requests = [ + Request.from_url(url='https://example.com/priority'), + ] + await rq_client.add_batch_of_requests(forefront_requests, forefront=True) + + # The priority request should be fetched first + next_request = await rq_client.fetch_next_request() + assert next_request is not None + assert next_request.url == 'https://example.com/priority' + + +async def test_fetch_next_request(rq_client: MemoryRequestQueueClient) -> None: + """Test fetching the next request from the queue.""" + # Add some requests + requests = [ + Request.from_url(url='https://example.com/1'), + Request.from_url(url='https://example.com/2'), + ] + await rq_client.add_batch_of_requests(requests) + + # Fetch first request + request1 = await rq_client.fetch_next_request() + assert request1 is not None + assert request1.url == 'https://example.com/1' + + # Fetch second request + request2 = await rq_client.fetch_next_request() + assert request2 is not None + assert request2.url == 'https://example.com/2' + + # No more requests + request3 = await rq_client.fetch_next_request() + assert request3 is None + + +async def test_fetch_skips_handled_requests(rq_client: MemoryRequestQueueClient) -> None: + """Test that fetch_next_request skips handled requests.""" + # Add requests + requests = [ + Request.from_url(url='https://example.com/1'), + Request.from_url(url='https://example.com/2'), + ] + await rq_client.add_batch_of_requests(requests) + + # Fetch and handle first request + request1 = await rq_client.fetch_next_request() + assert request1 is not None + await rq_client.mark_request_as_handled(request1) + + # Next fetch should return second request, not the handled one + request = await rq_client.fetch_next_request() + assert request is not None + assert request.url == 'https://example.com/2' + + +async def test_fetch_skips_in_progress_requests(rq_client: MemoryRequestQueueClient) -> None: + """Test that fetch_next_request skips requests that are already in progress.""" + # Add requests + requests = [ + Request.from_url(url='https://example.com/1'), + 
Request.from_url(url='https://example.com/2'), + ] + await rq_client.add_batch_of_requests(requests) + + # Fetch first request (it should be in progress now) + request1 = await rq_client.fetch_next_request() + assert request1 is not None + + # Next fetch should return second request, not the in-progress one + request2 = await rq_client.fetch_next_request() + assert request2 is not None + assert request2.url == 'https://example.com/2' + + # Third fetch should return None as all requests are in progress + request3 = await rq_client.fetch_next_request() + assert request3 is None + + +async def test_get_request(rq_client: MemoryRequestQueueClient) -> None: + """Test getting a request by ID.""" + # Add a request + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Get the request by ID + retrieved_request = await rq_client.get_request(request.id) + assert retrieved_request is not None + assert retrieved_request.id == request.id + assert retrieved_request.url == request.url + + # Try to get a non-existent request + nonexistent = await rq_client.get_request('nonexistent-id') + assert nonexistent is None + + +async def test_get_in_progress_request(rq_client: MemoryRequestQueueClient) -> None: + """Test getting an in-progress request by ID.""" + # Add a request + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Fetch the request to make it in-progress + fetched = await rq_client.fetch_next_request() + assert fetched is not None + + # Get the request by ID + retrieved = await rq_client.get_request(request.id) + assert retrieved is not None + assert retrieved.id == request.id + assert retrieved.url == request.url + + +async def test_mark_request_as_handled(rq_client: MemoryRequestQueueClient) -> None: + """Test marking a request as handled.""" + # Add a request + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Fetch the request to make it in-progress + fetched = await rq_client.fetch_next_request() + assert fetched is not None + + # Mark as handled + result = await rq_client.mark_request_as_handled(fetched) + assert result is not None + assert result.id == fetched.id + assert result.was_already_handled is True + + # Check that metadata was updated + assert rq_client.metadata.handled_request_count == 1 + assert rq_client.metadata.pending_request_count == 0 + + # Try to mark again (should fail as it's no longer in-progress) + result = await rq_client.mark_request_as_handled(fetched) + assert result is None + + +async def test_reclaim_request(rq_client: MemoryRequestQueueClient) -> None: + """Test reclaiming a request back to the queue.""" + # Add a request + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Fetch the request to make it in-progress + fetched = await rq_client.fetch_next_request() + assert fetched is not None + + # Reclaim the request + result = await rq_client.reclaim_request(fetched) + assert result is not None + assert result.id == fetched.id + assert result.was_already_handled is False + + # It should be available to fetch again + reclaimed = await rq_client.fetch_next_request() + assert reclaimed is not None + assert reclaimed.id == fetched.id + + +async def test_reclaim_request_to_forefront(rq_client: MemoryRequestQueueClient) -> None: + """Test reclaiming a request to the forefront of the queue.""" + # Add requests + 
requests = [ + Request.from_url(url='https://example.com/1'), + Request.from_url(url='https://example.com/2'), + ] + await rq_client.add_batch_of_requests(requests) + + # Fetch the second request to make it in-progress + await rq_client.fetch_next_request() # Skip the first one + request2 = await rq_client.fetch_next_request() + assert request2 is not None + assert request2.url == 'https://example.com/2' + + # Reclaim the request to forefront + await rq_client.reclaim_request(request2, forefront=True) + + # It should now be the first in the queue + next_request = await rq_client.fetch_next_request() + assert next_request is not None + assert next_request.url == 'https://example.com/2' + + +async def test_is_empty(rq_client: MemoryRequestQueueClient) -> None: + """Test checking if the queue is empty.""" + # Initially empty + assert await rq_client.is_empty() is True + + # Add a request + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Not empty now + assert await rq_client.is_empty() is False + + # Fetch and handle + fetched = await rq_client.fetch_next_request() + assert fetched is not None + await rq_client.mark_request_as_handled(fetched) + + # Empty again (all requests handled) + assert await rq_client.is_empty() is True + + +async def test_is_empty_with_in_progress(rq_client: MemoryRequestQueueClient) -> None: + """Test that in-progress requests don't affect is_empty.""" + # Add a request + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Fetch but don't handle + await rq_client.fetch_next_request() + + # Queue should still be considered non-empty + # This is because the request hasn't been handled yet + assert await rq_client.is_empty() is False + + +async def test_drop(rq_client: MemoryRequestQueueClient) -> None: + """Test that drop removes the queue from cache and clears all data.""" + # Add a request + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Drop the queue + await rq_client.drop() + + # Verify the queue is empty + assert await rq_client.is_empty() is True + + +async def test_metadata_updates(rq_client: MemoryRequestQueueClient) -> None: + """Test that operations properly update metadata timestamps.""" + # Record initial timestamps + initial_created = rq_client.metadata.created_at + initial_accessed = rq_client.metadata.accessed_at + initial_modified = rq_client.metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates modified_at and accessed_at + request = Request.from_url(url='https://example.com/test') + await rq_client.add_batch_of_requests([request]) + + # Verify timestamps + assert rq_client.metadata.created_at == initial_created + assert rq_client.metadata.modified_at > initial_modified + assert rq_client.metadata.accessed_at > initial_accessed + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Record timestamps after add + accessed_after_add = rq_client.metadata.accessed_at + modified_after_add = rq_client.metadata.modified_at + + # Check is_empty (should only update accessed_at) + await rq_client.is_empty() + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Verify only accessed_at changed + assert rq_client.metadata.modified_at == modified_after_add + assert rq_client.metadata.accessed_at > accessed_after_add + + 
+async def test_unique_key_generation(rq_client: MemoryRequestQueueClient) -> None: + """Test that unique keys are auto-generated if not provided.""" + # Add requests without explicit unique keys + requests = [ + Request.from_url(url='https://example.com/1'), + Request.from_url(url='https://example.com/1', always_enqueue=True), + ] + response = await rq_client.add_batch_of_requests(requests) + + # Both should be added as their auto-generated unique keys will differ + assert len(response.processed_requests) == 2 + assert all(not pr.was_already_present for pr in response.processed_requests) + + # Add a request with explicit unique key + request = Request.from_url(url='https://example.com/2', unique_key='explicit-key') + await rq_client.add_batch_of_requests([request]) + + # Add duplicate with same unique key + duplicate = Request.from_url(url='https://example.com/different', unique_key='explicit-key') + duplicate_response = await rq_client.add_batch_of_requests([duplicate]) + + # Should be marked as already present + assert duplicate_response.processed_requests[0].was_already_present is True diff --git a/tests/unit/storage_clients/_memory/test_memory_storage_client.py b/tests/unit/storage_clients/_memory/test_memory_storage_client.py deleted file mode 100644 index 0d043322ae..0000000000 --- a/tests/unit/storage_clients/_memory/test_memory_storage_client.py +++ /dev/null @@ -1,288 +0,0 @@ -# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed -# https://github.com/apify/crawlee-python/issues/146 - -from __future__ import annotations - -from pathlib import Path - -import pytest - -from crawlee import Request, service_locator -from crawlee._consts import METADATA_FILENAME -from crawlee.configuration import Configuration -from crawlee.storage_clients import MemoryStorageClient -from crawlee.storage_clients.models import BatchRequestsOperationResponse - - -async def test_write_metadata(tmp_path: Path) -> None: - dataset_name = 'test' - dataset_no_metadata_name = 'test-no-metadata' - ms = MemoryStorageClient.from_config( - Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - write_metadata=True, - ), - ) - ms_no_metadata = MemoryStorageClient.from_config( - Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - write_metadata=False, - ) - ) - datasets_client = ms.datasets() - datasets_no_metadata_client = ms_no_metadata.datasets() - await datasets_client.get_or_create(name=dataset_name) - await datasets_no_metadata_client.get_or_create(name=dataset_no_metadata_name) - assert Path(ms.datasets_directory, dataset_name, METADATA_FILENAME).exists() is True - assert Path(ms_no_metadata.datasets_directory, dataset_no_metadata_name, METADATA_FILENAME).exists() is False - - -@pytest.mark.parametrize( - 'persist_storage', - [ - True, - False, - ], -) -async def test_persist_storage(persist_storage: bool, tmp_path: Path) -> None: # noqa: FBT001 - ms = MemoryStorageClient.from_config( - Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - persist_storage=persist_storage, - ) - ) - - # Key value stores - kvs_client = ms.key_value_stores() - kvs_info = await kvs_client.get_or_create(name='kvs') - await ms.key_value_store(kvs_info.id).set_record('test', {'x': 1}, 'application/json') - - path = Path(ms.key_value_stores_directory) / (kvs_info.name or '') / 'test.json' - assert path.exists() is persist_storage - - # Request queues - rq_client = ms.request_queues() - rq_info = await rq_client.get_or_create(name='rq') - - 
request = Request.from_url('http://lorem.com') - await ms.request_queue(rq_info.id).add_request(request) - - path = Path(ms.request_queues_directory) / (rq_info.name or '') / f'{request.id}.json' - assert path.exists() is persist_storage - - # Datasets - ds_client = ms.datasets() - ds_info = await ds_client.get_or_create(name='ds') - - await ms.dataset(ds_info.id).push_items([{'foo': 'bar'}]) - - -def test_persist_storage_set_to_false_via_string_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - monkeypatch.setenv('CRAWLEE_PERSIST_STORAGE', 'false') - ms = MemoryStorageClient.from_config( - Configuration(crawlee_storage_dir=str(tmp_path)), # type: ignore[call-arg] - ) - assert ms.persist_storage is False - - -def test_persist_storage_set_to_false_via_numeric_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - monkeypatch.setenv('CRAWLEE_PERSIST_STORAGE', '0') - ms = MemoryStorageClient.from_config(Configuration(crawlee_storage_dir=str(tmp_path))) # type: ignore[call-arg] - assert ms.persist_storage is False - - -def test_persist_storage_true_via_constructor_arg(tmp_path: Path) -> None: - ms = MemoryStorageClient.from_config( - Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - persist_storage=True, - ) - ) - assert ms.persist_storage is True - - -def test_default_write_metadata_behavior(tmp_path: Path) -> None: - # Default behavior - ms = MemoryStorageClient.from_config( - Configuration(crawlee_storage_dir=str(tmp_path)), # type: ignore[call-arg] - ) - assert ms.write_metadata is True - - -def test_write_metadata_set_to_false_via_env_var(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - # Test if env var changes write_metadata to False - monkeypatch.setenv('CRAWLEE_WRITE_METADATA', 'false') - ms = MemoryStorageClient.from_config( - Configuration(crawlee_storage_dir=str(tmp_path)), # type: ignore[call-arg] - ) - assert ms.write_metadata is False - - -def test_write_metadata_false_via_constructor_arg_overrides_env_var(tmp_path: Path) -> None: - # Test if constructor arg takes precedence over env var value - ms = MemoryStorageClient.from_config( - Configuration( - write_metadata=False, - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - ) - ) - assert ms.write_metadata is False - - -async def test_purge_datasets(tmp_path: Path) -> None: - ms = MemoryStorageClient.from_config( - Configuration( - write_metadata=True, - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - ) - ) - # Create default and non-default datasets - datasets_client = ms.datasets() - default_dataset_info = await datasets_client.get_or_create(name='default') - non_default_dataset_info = await datasets_client.get_or_create(name='non-default') - - # Check all folders inside datasets directory before and after purge - assert default_dataset_info.name is not None - assert non_default_dataset_info.name is not None - - default_path = Path(ms.datasets_directory, default_dataset_info.name) - non_default_path = Path(ms.datasets_directory, non_default_dataset_info.name) - - assert default_path.exists() is True - assert non_default_path.exists() is True - - await ms._purge_default_storages() - - assert default_path.exists() is False - assert non_default_path.exists() is True - - -async def test_purge_key_value_stores(tmp_path: Path) -> None: - ms = MemoryStorageClient.from_config( - Configuration( - write_metadata=True, - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - ) - ) - - # Create default and non-default key-value stores - 
kvs_client = ms.key_value_stores() - default_kvs_info = await kvs_client.get_or_create(name='default') - non_default_kvs_info = await kvs_client.get_or_create(name='non-default') - default_kvs_client = ms.key_value_store(default_kvs_info.id) - # INPUT.json should be kept - await default_kvs_client.set_record('INPUT', {'abc': 123}, 'application/json') - # test.json should not be kept - await default_kvs_client.set_record('test', {'abc': 123}, 'application/json') - - # Check all folders and files inside kvs directory before and after purge - assert default_kvs_info.name is not None - assert non_default_kvs_info.name is not None - - default_kvs_path = Path(ms.key_value_stores_directory, default_kvs_info.name) - non_default_kvs_path = Path(ms.key_value_stores_directory, non_default_kvs_info.name) - kvs_directory = Path(ms.key_value_stores_directory, 'default') - - assert default_kvs_path.exists() is True - assert non_default_kvs_path.exists() is True - - assert (kvs_directory / 'INPUT.json').exists() is True - assert (kvs_directory / 'test.json').exists() is True - - await ms._purge_default_storages() - - assert default_kvs_path.exists() is True - assert non_default_kvs_path.exists() is True - - assert (kvs_directory / 'INPUT.json').exists() is True - assert (kvs_directory / 'test.json').exists() is False - - -async def test_purge_request_queues(tmp_path: Path) -> None: - ms = MemoryStorageClient.from_config( - Configuration( - write_metadata=True, - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - ) - ) - # Create default and non-default request queues - rq_client = ms.request_queues() - default_rq_info = await rq_client.get_or_create(name='default') - non_default_rq_info = await rq_client.get_or_create(name='non-default') - - # Check all folders inside rq directory before and after purge - assert default_rq_info.name - assert non_default_rq_info.name - - default_rq_path = Path(ms.request_queues_directory, default_rq_info.name) - non_default_rq_path = Path(ms.request_queues_directory, non_default_rq_info.name) - - assert default_rq_path.exists() is True - assert non_default_rq_path.exists() is True - - await ms._purge_default_storages() - - assert default_rq_path.exists() is False - assert non_default_rq_path.exists() is True - - -async def test_not_implemented_method(tmp_path: Path) -> None: - ms = MemoryStorageClient.from_config( - Configuration( - write_metadata=True, - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - ) - ) - ddt = ms.dataset('test') - with pytest.raises(NotImplementedError, match='This method is not supported in memory storage.'): - await ddt.stream_items(item_format='json') - - with pytest.raises(NotImplementedError, match='This method is not supported in memory storage.'): - await ddt.stream_items(item_format='json') - - -async def test_default_storage_path_used(monkeypatch: pytest.MonkeyPatch) -> None: - # Reset the configuration in service locator - service_locator._configuration = None - service_locator._configuration_was_retrieved = False - - # Remove the env var for setting the storage directory - monkeypatch.delenv('CRAWLEE_STORAGE_DIR', raising=False) - - # Initialize the service locator with default configuration - msc = MemoryStorageClient.from_config() - assert msc.storage_dir == './storage' - - -async def test_storage_path_from_env_var_overrides_default(monkeypatch: pytest.MonkeyPatch) -> None: - # We expect the env var to override the default value - monkeypatch.setenv('CRAWLEE_STORAGE_DIR', './env_var_storage_dir') - 
service_locator.set_configuration(Configuration()) - ms = MemoryStorageClient.from_config() - assert ms.storage_dir == './env_var_storage_dir' - - -async def test_parametrized_storage_path_overrides_env_var() -> None: - # We expect the parametrized value to be used - ms = MemoryStorageClient.from_config( - Configuration(crawlee_storage_dir='./parametrized_storage_dir'), # type: ignore[call-arg] - ) - assert ms.storage_dir == './parametrized_storage_dir' - - -async def test_batch_requests_operation_response() -> None: - """Test that `BatchRequestsOperationResponse` creation from example responses.""" - process_request = { - 'requestId': 'EAaArVRs5qV39C9', - 'uniqueKey': 'https://example.com', - 'wasAlreadyHandled': False, - 'wasAlreadyPresent': True, - } - unprocess_request_full = {'uniqueKey': 'https://example2.com', 'method': 'GET', 'url': 'https://example2.com'} - unprocess_request_minimal = {'uniqueKey': 'https://example3.com', 'url': 'https://example3.com'} - BatchRequestsOperationResponse.model_validate( - { - 'processedRequests': [process_request], - 'unprocessedRequests': [unprocess_request_full, unprocess_request_minimal], - } - ) diff --git a/tests/unit/storage_clients/_memory/test_memory_storage_e2e.py b/tests/unit/storage_clients/_memory/test_memory_storage_e2e.py deleted file mode 100644 index c79fa66792..0000000000 --- a/tests/unit/storage_clients/_memory/test_memory_storage_e2e.py +++ /dev/null @@ -1,130 +0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import Callable - -import pytest - -from crawlee import Request, service_locator -from crawlee.storages._key_value_store import KeyValueStore -from crawlee.storages._request_queue import RequestQueue - - -@pytest.mark.parametrize('purge_on_start', [True, False]) -async def test_actor_memory_storage_client_key_value_store_e2e( - monkeypatch: pytest.MonkeyPatch, - purge_on_start: bool, # noqa: FBT001 - prepare_test_env: Callable[[], None], -) -> None: - """This test simulates two clean runs using memory storage. - The second run attempts to access data created by the first one. - We run 2 configurations with different `purge_on_start`.""" - # Configure purging env var - monkeypatch.setenv('CRAWLEE_PURGE_ON_START', f'{int(purge_on_start)}') - # Store old storage client so we have the object reference for comparison - old_client = service_locator.get_storage_client() - - old_default_kvs = await KeyValueStore.open() - old_non_default_kvs = await KeyValueStore.open(name='non-default') - # Create data in default and non-default key-value store - await old_default_kvs.set_value('test', 'default value') - await old_non_default_kvs.set_value('test', 'non-default value') - - # We simulate another clean run, we expect the memory storage to read from the local data directory - # Default storages are purged based on purge_on_start parameter. 
- prepare_test_env() - - # Check if we're using a different memory storage instance - assert old_client is not service_locator.get_storage_client() - default_kvs = await KeyValueStore.open() - assert default_kvs is not old_default_kvs - non_default_kvs = await KeyValueStore.open(name='non-default') - assert non_default_kvs is not old_non_default_kvs - default_value = await default_kvs.get_value('test') - - if purge_on_start: - assert default_value is None - else: - assert default_value == 'default value' - - assert await non_default_kvs.get_value('test') == 'non-default value' - - -@pytest.mark.parametrize('purge_on_start', [True, False]) -async def test_actor_memory_storage_client_request_queue_e2e( - monkeypatch: pytest.MonkeyPatch, - purge_on_start: bool, # noqa: FBT001 - prepare_test_env: Callable[[], None], -) -> None: - """This test simulates two clean runs using memory storage. - The second run attempts to access data created by the first one. - We run 2 configurations with different `purge_on_start`.""" - # Configure purging env var - monkeypatch.setenv('CRAWLEE_PURGE_ON_START', f'{int(purge_on_start)}') - - # Add some requests to the default queue - default_queue = await RequestQueue.open() - for i in range(6): - # [0, 3] <- nothing special - # [1, 4] <- forefront=True - # [2, 5] <- handled=True - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await default_queue.add_request( - Request.from_url( - unique_key=str(i), - url=request_url, - handled_at=datetime.now(timezone.utc) if was_handled else None, - payload=b'test', - ), - forefront=forefront, - ) - - # We simulate another clean run, we expect the memory storage to read from the local data directory - # Default storages are purged based on purge_on_start parameter. 
- prepare_test_env() - - # Add some more requests to the default queue - default_queue = await RequestQueue.open() - for i in range(6, 12): - # [6, 9] <- nothing special - # [7, 10] <- forefront=True - # [8, 11] <- handled=True - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await default_queue.add_request( - Request.from_url( - unique_key=str(i), - url=request_url, - handled_at=datetime.now(timezone.utc) if was_handled else None, - payload=b'test', - ), - forefront=forefront, - ) - - queue_info = await default_queue.get_info() - assert queue_info is not None - - # If the queue was purged between the runs, only the requests from the second run should be present, - # in the right order - if purge_on_start: - assert queue_info.total_request_count == 6 - assert queue_info.handled_request_count == 2 - - expected_pending_request_order = [10, 7, 6, 9] - # If the queue was NOT purged between the runs, all the requests should be in the queue in the right order - else: - assert queue_info.total_request_count == 12 - assert queue_info.handled_request_count == 4 - - expected_pending_request_order = [10, 7, 4, 1, 0, 3, 6, 9] - - actual_requests = list[Request]() - while req := await default_queue.fetch_next_request(): - actual_requests.append(req) - - assert [int(req.unique_key) for req in actual_requests] == expected_pending_request_order - assert [req.url for req in actual_requests] == [f'http://example.com/{req.unique_key}' for req in actual_requests] - assert [req.payload for req in actual_requests] == [b'test' for _ in actual_requests] diff --git a/tests/unit/storage_clients/_memory/test_request_queue_client.py b/tests/unit/storage_clients/_memory/test_request_queue_client.py deleted file mode 100644 index feffacbbd8..0000000000 --- a/tests/unit/storage_clients/_memory/test_request_queue_client.py +++ /dev/null @@ -1,249 +0,0 @@ -from __future__ import annotations - -import asyncio -from datetime import datetime, timezone -from pathlib import Path -from typing import TYPE_CHECKING - -import pytest - -from crawlee import Request -from crawlee._request import RequestState - -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - from crawlee.storage_clients._memory import RequestQueueClient - - -@pytest.fixture -async def request_queue_client(memory_storage_client: MemoryStorageClient) -> RequestQueueClient: - request_queues_client = memory_storage_client.request_queues() - rq_info = await request_queues_client.get_or_create(name='test') - return memory_storage_client.request_queue(rq_info.id) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - request_queue_client = memory_storage_client.request_queue(id='nonexistent-id') - assert await request_queue_client.get() is None - with pytest.raises(ValueError, match='Request queue with id "nonexistent-id" does not exist.'): - await request_queue_client.update(name='test-update') - await request_queue_client.delete() - - -async def test_get(request_queue_client: RequestQueueClient) -> None: - await asyncio.sleep(0.1) - info = await request_queue_client.get() - assert info is not None - assert info.id == request_queue_client.id - assert info.accessed_at != info.created_at - - -async def test_update(request_queue_client: RequestQueueClient) -> None: - new_rq_name = 'test-update' - request = Request.from_url('https://apify.com') - await request_queue_client.add_request(request) - old_rq_info = await request_queue_client.get() - assert old_rq_info is not 
None - assert old_rq_info.name is not None - old_rq_directory = Path( - request_queue_client._memory_storage_client.request_queues_directory, - old_rq_info.name, - ) - new_rq_directory = Path(request_queue_client._memory_storage_client.request_queues_directory, new_rq_name) - assert (old_rq_directory / 'fvwscO2UJLdr10B.json').exists() is True - assert (new_rq_directory / 'fvwscO2UJLdr10B.json').exists() is False - - await asyncio.sleep(0.1) - updated_rq_info = await request_queue_client.update(name=new_rq_name) - assert (old_rq_directory / 'fvwscO2UJLdr10B.json').exists() is False - assert (new_rq_directory / 'fvwscO2UJLdr10B.json').exists() is True - # Only modified_at and accessed_at should be different - assert old_rq_info.created_at == updated_rq_info.created_at - assert old_rq_info.modified_at != updated_rq_info.modified_at - assert old_rq_info.accessed_at != updated_rq_info.accessed_at - - # Should fail with the same name - with pytest.raises(ValueError, match='Request queue with name "test-update" already exists'): - await request_queue_client.update(name=new_rq_name) - - -async def test_delete(request_queue_client: RequestQueueClient) -> None: - await request_queue_client.add_request(Request.from_url('https://apify.com')) - rq_info = await request_queue_client.get() - assert rq_info is not None - - rq_directory = Path(request_queue_client._memory_storage_client.request_queues_directory, str(rq_info.name)) - assert (rq_directory / 'fvwscO2UJLdr10B.json').exists() is True - - await request_queue_client.delete() - assert (rq_directory / 'fvwscO2UJLdr10B.json').exists() is False - - # Does not crash when called again - await request_queue_client.delete() - - -async def test_list_head(request_queue_client: RequestQueueClient) -> None: - await request_queue_client.add_request(Request.from_url('https://apify.com')) - await request_queue_client.add_request(Request.from_url('https://example.com')) - list_head = await request_queue_client.list_head() - assert len(list_head.items) == 2 - - for item in list_head.items: - assert item.id is not None - - -async def test_request_state_serialization(request_queue_client: RequestQueueClient) -> None: - request = Request.from_url('https://crawlee.dev', payload=b'test') - request.state = RequestState.UNPROCESSED - - await request_queue_client.add_request(request) - - result = await request_queue_client.list_head() - assert len(result.items) == 1 - assert result.items[0] == request - - got_request = await request_queue_client.get_request(request.id) - - assert request == got_request - - -async def test_add_record(request_queue_client: RequestQueueClient) -> None: - processed_request_forefront = await request_queue_client.add_request( - Request.from_url('https://apify.com'), - forefront=True, - ) - processed_request_not_forefront = await request_queue_client.add_request( - Request.from_url('https://example.com'), - forefront=False, - ) - - assert processed_request_forefront.id is not None - assert processed_request_not_forefront.id is not None - assert processed_request_forefront.was_already_handled is False - assert processed_request_not_forefront.was_already_handled is False - - rq_info = await request_queue_client.get() - assert rq_info is not None - assert rq_info.pending_request_count == rq_info.total_request_count == 2 - assert rq_info.handled_request_count == 0 - - -async def test_get_record(request_queue_client: RequestQueueClient) -> None: - request_url = 'https://apify.com' - processed_request = await 
request_queue_client.add_request(Request.from_url(request_url)) - - request = await request_queue_client.get_request(processed_request.id) - assert request is not None - assert request.url == request_url - - # Non-existent id - assert (await request_queue_client.get_request('non-existent id')) is None - - -async def test_update_record(request_queue_client: RequestQueueClient) -> None: - processed_request = await request_queue_client.add_request(Request.from_url('https://apify.com')) - request = await request_queue_client.get_request(processed_request.id) - assert request is not None - - rq_info_before_update = await request_queue_client.get() - assert rq_info_before_update is not None - assert rq_info_before_update.pending_request_count == 1 - assert rq_info_before_update.handled_request_count == 0 - - request.handled_at = datetime.now(timezone.utc) - request_update_info = await request_queue_client.update_request(request) - - assert request_update_info.was_already_handled is False - - rq_info_after_update = await request_queue_client.get() - assert rq_info_after_update is not None - assert rq_info_after_update.pending_request_count == 0 - assert rq_info_after_update.handled_request_count == 1 - - -async def test_delete_record(request_queue_client: RequestQueueClient) -> None: - processed_request_pending = await request_queue_client.add_request( - Request.from_url( - url='https://apify.com', - unique_key='pending', - ), - ) - - processed_request_handled = await request_queue_client.add_request( - Request.from_url( - url='https://apify.com', - unique_key='handled', - handled_at=datetime.now(timezone.utc), - ), - ) - - rq_info_before_delete = await request_queue_client.get() - assert rq_info_before_delete is not None - assert rq_info_before_delete.pending_request_count == 1 - - await request_queue_client.delete_request(processed_request_pending.id) - rq_info_after_first_delete = await request_queue_client.get() - assert rq_info_after_first_delete is not None - assert rq_info_after_first_delete.pending_request_count == 0 - assert rq_info_after_first_delete.handled_request_count == 1 - - await request_queue_client.delete_request(processed_request_handled.id) - rq_info_after_second_delete = await request_queue_client.get() - assert rq_info_after_second_delete is not None - assert rq_info_after_second_delete.pending_request_count == 0 - assert rq_info_after_second_delete.handled_request_count == 0 - - # Does not crash when called again - await request_queue_client.delete_request(processed_request_pending.id) - - -async def test_forefront(request_queue_client: RequestQueueClient) -> None: - # this should create a queue with requests in this order: - # Handled: - # 2, 5, 8 - # Not handled: - # 7, 4, 1, 0, 3, 6 - for i in range(9): - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await request_queue_client.add_request( - Request.from_url( - url=request_url, - unique_key=str(i), - handled_at=datetime.now(timezone.utc) if was_handled else None, - ), - forefront=forefront, - ) - - # Check that the queue head (unhandled items) is in the right order - queue_head = await request_queue_client.list_head() - req_unique_keys = [req.unique_key for req in queue_head.items] - assert req_unique_keys == ['7', '4', '1', '0', '3', '6'] - - # Mark request #1 as handled - await request_queue_client.update_request( - Request.from_url( - url='http://example.com/1', - unique_key='1', - handled_at=datetime.now(timezone.utc), - ), - ) - # Move request #3 to forefront - await 
request_queue_client.update_request( - Request.from_url(url='http://example.com/3', unique_key='3'), - forefront=True, - ) - - # Check that the queue head (unhandled items) is in the right order after the updates - queue_head = await request_queue_client.list_head() - req_unique_keys = [req.unique_key for req in queue_head.items] - assert req_unique_keys == ['3', '7', '4', '0', '6'] - - -async def test_add_duplicate_record(request_queue_client: RequestQueueClient) -> None: - processed_request = await request_queue_client.add_request(Request.from_url('https://apify.com')) - processed_request_duplicate = await request_queue_client.add_request(Request.from_url('https://apify.com')) - - assert processed_request.id == processed_request_duplicate.id - assert processed_request_duplicate.was_already_present is True diff --git a/tests/unit/storage_clients/_memory/test_request_queue_collection_client.py b/tests/unit/storage_clients/_memory/test_request_queue_collection_client.py deleted file mode 100644 index fa10889f83..0000000000 --- a/tests/unit/storage_clients/_memory/test_request_queue_collection_client.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - from crawlee.storage_clients._memory import RequestQueueCollectionClient - - -@pytest.fixture -def request_queues_client(memory_storage_client: MemoryStorageClient) -> RequestQueueCollectionClient: - return memory_storage_client.request_queues() - - -async def test_get_or_create(request_queues_client: RequestQueueCollectionClient) -> None: - rq_name = 'test' - # A new request queue gets created - rq_info = await request_queues_client.get_or_create(name=rq_name) - assert rq_info.name == rq_name - - # Another get_or_create call returns the same request queue - rq_existing = await request_queues_client.get_or_create(name=rq_name) - assert rq_info.id == rq_existing.id - assert rq_info.name == rq_existing.name - assert rq_info.created_at == rq_existing.created_at - - -async def test_list(request_queues_client: RequestQueueCollectionClient) -> None: - assert (await request_queues_client.list()).count == 0 - rq_info = await request_queues_client.get_or_create(name='dataset') - rq_list = await request_queues_client.list() - assert rq_list.count == 1 - assert rq_list.items[0].name == rq_info.name - - # Test sorting behavior - newer_rq_info = await request_queues_client.get_or_create(name='newer-dataset') - rq_list_sorting = await request_queues_client.list() - assert rq_list_sorting.count == 2 - assert rq_list_sorting.items[0].name == rq_info.name - assert rq_list_sorting.items[1].name == newer_rq_info.name diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index f299aee08d..093bfdbbfc 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -1,156 +1,574 @@ +# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed +# https://github.com/apify/crawlee-python/issues/146 + from __future__ import annotations -from datetime import datetime, timezone from typing import TYPE_CHECKING import pytest -from crawlee import service_locator -from crawlee.storage_clients.models import StorageMetadata +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import Dataset, KeyValueStore if TYPE_CHECKING: from collections.abc import 
AsyncGenerator + from pathlib import Path + from typing import Any + + from crawlee.storage_clients import StorageClient + + +@pytest.fixture(params=['memory', 'file_system']) +def storage_client(request: pytest.FixtureRequest) -> StorageClient: + """Parameterized fixture to test with different storage clients.""" + if request.param == 'memory': + return MemoryStorageClient() + + return FileSystemStorageClient() + + +@pytest.fixture +def configuration(tmp_path: Path) -> Configuration: + """Provide a configuration with a temporary storage directory.""" + return Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) @pytest.fixture -async def dataset() -> AsyncGenerator[Dataset, None]: - dataset = await Dataset.open() +async def dataset( + storage_client: StorageClient, + configuration: Configuration, +) -> AsyncGenerator[Dataset, None]: + """Fixture that provides a dataset instance for each test.""" + dataset = await Dataset.open( + storage_client=storage_client, + configuration=configuration, + ) + yield dataset await dataset.drop() -async def test_open() -> None: - default_dataset = await Dataset.open() - default_dataset_by_id = await Dataset.open(id=default_dataset.id) +async def test_open_creates_new_dataset( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test that open() creates a new dataset with proper metadata.""" + dataset = await Dataset.open( + name='new_dataset', + storage_client=storage_client, + configuration=configuration, + ) + + # Verify dataset properties + assert dataset.id is not None + assert dataset.name == 'new_dataset' + assert dataset.metadata.item_count == 0 + + await dataset.drop() + + +async def test_reopen_default( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test reopening a dataset with default parameters.""" + # Create a first dataset instance with default parameters + dataset_1 = await Dataset.open( + storage_client=storage_client, + configuration=configuration, + ) + + # Verify default properties + assert dataset_1.id is not None + assert dataset_1.metadata.item_count == 0 + + # Add an item + await dataset_1.push_data({'key': 'value'}) + assert dataset_1.metadata.item_count == 1 - assert default_dataset is default_dataset_by_id + # Reopen the same dataset + dataset_2 = await Dataset.open( + storage_client=storage_client, + configuration=configuration, + ) - dataset_name = 'dummy-name' - named_dataset = await Dataset.open(name=dataset_name) - assert default_dataset is not named_dataset + # Verify both instances reference the same dataset + assert dataset_2.id == dataset_1.id + assert dataset_2.name == dataset_1.name + assert dataset_2.metadata.item_count == dataset_1.metadata.item_count == 1 - with pytest.raises(RuntimeError, match='Dataset with id "nonexistent-id" does not exist!'): - await Dataset.open(id='nonexistent-id') + # Verify they are the same object (cached) + assert id(dataset_1) == id(dataset_2) - # Test that when you try to open a dataset by ID and you use a name of an existing dataset, - # it doesn't work - with pytest.raises(RuntimeError, match='Dataset with id "dummy-name" does not exist!'): - await Dataset.open(id='dummy-name') + # Clean up + await dataset_1.drop() -async def test_consistency_accross_two_clients() -> None: - dataset = await Dataset.open(name='my-dataset') - await dataset.push_data({'key': 'value'}) +async def test_open_by_id( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + 
"""Test opening a dataset by its ID.""" + # First create a dataset by name + dataset1 = await Dataset.open( + name='dataset_by_id_test', + storage_client=storage_client, + configuration=configuration, + ) - dataset_by_id = await Dataset.open(id=dataset.id) - await dataset_by_id.push_data({'key2': 'value2'}) + # Add some data to identify it + test_item = {'test': 'opening_by_id', 'timestamp': 12345} + await dataset1.push_data(test_item) - assert (await dataset.get_data()).items == [{'key': 'value'}, {'key2': 'value2'}] - assert (await dataset_by_id.get_data()).items == [{'key': 'value'}, {'key2': 'value2'}] + # Open the dataset by ID + dataset2 = await Dataset.open( + id=dataset1.id, + storage_client=storage_client, + configuration=configuration, + ) - await dataset.drop() - with pytest.raises(RuntimeError, match='Storage with provided ID was not found'): - await dataset_by_id.drop() - - -async def test_same_references() -> None: - dataset1 = await Dataset.open() - dataset2 = await Dataset.open() - assert dataset1 is dataset2 - - dataset_name = 'non-default' - dataset_named1 = await Dataset.open(name=dataset_name) - dataset_named2 = await Dataset.open(name=dataset_name) - assert dataset_named1 is dataset_named2 - - -async def test_drop() -> None: - dataset1 = await Dataset.open() - await dataset1.drop() - dataset2 = await Dataset.open() - assert dataset1 is not dataset2 - - -async def test_export(dataset: Dataset) -> None: - expected_csv = 'id,test\r\n0,test\r\n1,test\r\n2,test\r\n' - expected_json = [{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}, {'id': 2, 'test': 'test'}] - desired_item_count = 3 - await dataset.push_data([{'id': i, 'test': 'test'} for i in range(desired_item_count)]) - await dataset.export_to(key='dataset-csv', content_type='csv') - await dataset.export_to(key='dataset-json', content_type='json') - kvs = await KeyValueStore.open() - dataset_csv = await kvs.get_value(key='dataset-csv') - dataset_json = await kvs.get_value(key='dataset-json') - assert dataset_csv == expected_csv - assert dataset_json == expected_json - - -async def test_push_data(dataset: Dataset) -> None: - desired_item_count = 2000 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info.item_count == desired_item_count - list_page = await dataset.get_data(limit=desired_item_count) - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 - - -async def test_push_data_empty(dataset: Dataset) -> None: - await dataset.push_data([]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info.item_count == 0 - - -async def test_push_data_singular(dataset: Dataset) -> None: - await dataset.push_data({'id': 1}) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info.item_count == 1 - list_page = await dataset.get_data() - assert list_page.items[0]['id'] == 1 - - -async def test_get_data(dataset: Dataset) -> None: # We don't test everything, that's done in memory storage tests - desired_item_count = 3 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - list_page = await dataset.get_data() - assert list_page.count == desired_item_count - assert list_page.desc is False - assert list_page.offset == 0 - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 + # Verify it's the same dataset + assert dataset2.id 
== dataset1.id + assert dataset2.name == 'dataset_by_id_test' + + # Verify the data is still there + data = await dataset2.get_data() + assert data.count == 1 + assert data.items[0]['test'] == 'opening_by_id' + assert data.items[0]['timestamp'] == 12345 + + # Clean up + await dataset2.drop() + + +async def test_open_existing_dataset( + dataset: Dataset, + storage_client: StorageClient, +) -> None: + """Test that open() loads an existing dataset correctly.""" + # Open the same dataset again + reopened_dataset = await Dataset.open( + name=dataset.name, + storage_client=storage_client, + ) + + # Verify dataset properties + assert dataset.id == reopened_dataset.id + assert dataset.name == reopened_dataset.name + assert dataset.metadata.item_count == reopened_dataset.metadata.item_count + + # Verify they are the same object (from cache) + assert id(dataset) == id(reopened_dataset) + + +async def test_open_with_id_and_name( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test that open() raises an error when both id and name are provided.""" + with pytest.raises(ValueError, match='Only one of "id" or "name" can be specified'): + await Dataset.open( + id='some-id', + name='some-name', + storage_client=storage_client, + configuration=configuration, + ) + + +async def test_push_data_single_item(dataset: Dataset) -> None: + """Test pushing a single item to the dataset.""" + item = {'key': 'value', 'number': 42} + await dataset.push_data(item) + + # Verify item was stored + result = await dataset.get_data() + assert result.count == 1 + assert result.items[0] == item + + +async def test_push_data_multiple_items(dataset: Dataset) -> None: + """Test pushing multiple items to the dataset.""" + items = [ + {'id': 1, 'name': 'Item 1'}, + {'id': 2, 'name': 'Item 2'}, + {'id': 3, 'name': 'Item 3'}, + ] + await dataset.push_data(items) + + # Verify items were stored + result = await dataset.get_data() + assert result.count == 3 + assert result.items == items + + +async def test_get_data_empty_dataset(dataset: Dataset) -> None: + """Test getting data from an empty dataset returns empty results.""" + result = await dataset.get_data() + + assert result.count == 0 + assert result.total == 0 + assert result.items == [] + + +async def test_get_data_with_pagination(dataset: Dataset) -> None: + """Test getting data with offset and limit parameters for pagination.""" + # Add some items + items = [{'id': i} for i in range(1, 11)] # 10 items + await dataset.push_data(items) + + # Test offset + result = await dataset.get_data(offset=3) + assert result.count == 7 + assert result.offset == 3 + assert result.items[0]['id'] == 4 + + # Test limit + result = await dataset.get_data(limit=5) + assert result.count == 5 + assert result.limit == 5 + assert result.items[-1]['id'] == 5 + + # Test both offset and limit + result = await dataset.get_data(offset=2, limit=3) + assert result.count == 3 + assert result.offset == 2 + assert result.limit == 3 + assert result.items[0]['id'] == 3 + assert result.items[-1]['id'] == 5 + + +async def test_get_data_descending_order(dataset: Dataset) -> None: + """Test getting data in descending order reverses the item order.""" + # Add some items + items = [{'id': i} for i in range(1, 6)] # 5 items + await dataset.push_data(items) + + # Get items in descending order + result = await dataset.get_data(desc=True) + + assert result.desc is True + assert result.items[0]['id'] == 5 + assert result.items[-1]['id'] == 1 + + +async def test_get_data_skip_empty(dataset: 
Dataset) -> None: + """Test getting data with skip_empty option filters out empty items.""" + # Add some items including an empty one + items = [ + {'id': 1, 'name': 'Item 1'}, + {}, # Empty item + {'id': 3, 'name': 'Item 3'}, + ] + await dataset.push_data(items) + + # Get all items + result = await dataset.get_data() + assert result.count == 3 + + # Get non-empty items + result = await dataset.get_data(skip_empty=True) + assert result.count == 2 + assert all(item != {} for item in result.items) async def test_iterate_items(dataset: Dataset) -> None: - desired_item_count = 3 - idx = 0 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) + """Test iterating over dataset items yields each item in the correct order.""" + # Add some items + items = [{'id': i} for i in range(1, 6)] # 5 items + await dataset.push_data(items) + + # Iterate over all items + collected_items = [item async for item in dataset.iterate_items()] + + assert len(collected_items) == 5 + assert collected_items[0]['id'] == 1 + assert collected_items[-1]['id'] == 5 + + +async def test_iterate_items_with_options(dataset: Dataset) -> None: + """Test iterating with offset, limit and desc parameters.""" + # Add some items + items = [{'id': i} for i in range(1, 11)] # 10 items + await dataset.push_data(items) + + # Test with offset and limit + collected_items = [item async for item in dataset.iterate_items(offset=3, limit=3)] + + assert len(collected_items) == 3 + assert collected_items[0]['id'] == 4 + assert collected_items[-1]['id'] == 6 + + # Test with descending order + collected_items = [] + async for item in dataset.iterate_items(desc=True, limit=3): + collected_items.append(item) + + assert len(collected_items) == 3 + assert collected_items[0]['id'] == 10 + assert collected_items[-1]['id'] == 8 + + +async def test_list_items(dataset: Dataset) -> None: + """Test that list_items returns all dataset items as a list.""" + # Add some items + items = [{'id': i} for i in range(1, 6)] # 5 items + await dataset.push_data(items) + + # Get all items as a list + collected_items = await dataset.list_items() + + assert len(collected_items) == 5 + assert collected_items[0]['id'] == 1 + assert collected_items[-1]['id'] == 5 + + +async def test_list_items_with_options(dataset: Dataset) -> None: + """Test that list_items respects filtering options.""" + # Add some items + items: list[dict[str, Any]] = [ + {'id': 1, 'name': 'Item 1'}, + {'id': 2, 'name': 'Item 2'}, + {'id': 3}, # Item with missing 'name' field + {}, # Empty item + {'id': 5, 'name': 'Item 5'}, + ] + await dataset.push_data(items) + + # Test with offset and limit + collected_items = await dataset.list_items(offset=1, limit=2) + assert len(collected_items) == 2 + assert collected_items[0]['id'] == 2 + assert collected_items[1]['id'] == 3 + + # Test with descending order - skip empty items to avoid KeyError + collected_items = await dataset.list_items(desc=True, skip_empty=True) + + # Filter items that have an 'id' field + items_with_ids = [item for item in collected_items if 'id' in item] + id_values = [item['id'] for item in items_with_ids] + + # Verify the list is sorted in descending order + assert sorted(id_values, reverse=True) == id_values, f'IDs should be in descending order. 
Got {id_values}' + + # Verify key IDs are present and in the right order + if 5 in id_values and 3 in id_values: + assert id_values.index(5) < id_values.index(3), 'ID 5 should come before ID 3 in descending order' + + # Test with skip_empty + collected_items = await dataset.list_items(skip_empty=True) + assert len(collected_items) == 4 # Should skip the empty item + assert all(item != {} for item in collected_items) + + # Test with fields - manually filter since 'fields' parameter is not supported + # Get all items first + collected_items = await dataset.list_items() + assert len(collected_items) == 5 + + # Manually extract only the 'id' field from each item + filtered_items = [{key: item[key] for key in ['id'] if key in item} for item in collected_items] + + # Verify 'name' field is not present in any item + assert all('name' not in item for item in filtered_items) + + # Test clean functionality manually instead of using the clean parameter + # Get all items + collected_items = await dataset.list_items() + + # Manually filter out empty items as 'clean' would do + clean_items = [item for item in collected_items if item != {}] + + assert len(clean_items) == 4 # Should have 4 non-empty items + assert all(item != {} for item in clean_items) + + +async def test_drop( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test dropping a dataset removes it from cache and clears its data.""" + dataset = await Dataset.open( + name='drop_test', + storage_client=storage_client, + configuration=configuration, + ) + + # Add some data + await dataset.push_data({'test': 'data'}) + + # Drop the dataset + await dataset.drop() - async for item in dataset.iterate_items(): - assert item['id'] == idx - idx += 1 + # Verify dataset is empty (by creating a new one with the same name) + new_dataset = await Dataset.open( + name='drop_test', + storage_client=storage_client, + configuration=configuration, + ) + + result = await new_dataset.get_data() + assert result.count == 0 + await new_dataset.drop() + + +async def test_export_to_json( + dataset: Dataset, + storage_client: StorageClient, +) -> None: + """Test exporting dataset to JSON format.""" + # Create a key-value store for export + kvs = await KeyValueStore.open( + name='export_kvs', + storage_client=storage_client, + ) + + # Add some items to the dataset + items = [ + {'id': 1, 'name': 'Item 1'}, + {'id': 2, 'name': 'Item 2'}, + {'id': 3, 'name': 'Item 3'}, + ] + await dataset.push_data(items) + + # Export to JSON + await dataset.export_to( + key='dataset_export.json', + content_type='json', + to_kvs_name='export_kvs', + to_kvs_storage_client=storage_client, + ) + + # Retrieve the exported file + record = await kvs.get_value(key='dataset_export.json') + assert record is not None + + # Verify content has all the items + assert '"id": 1' in record + assert '"id": 2' in record + assert '"id": 3' in record - assert idx == desired_item_count + await kvs.drop() -async def test_from_storage_object() -> None: - storage_client = service_locator.get_storage_client() +async def test_export_to_csv( + dataset: Dataset, + storage_client: StorageClient, +) -> None: + """Test exporting dataset to CSV format.""" + # Create a key-value store for export + kvs = await KeyValueStore.open( + name='export_kvs', + storage_client=storage_client, + ) - storage_object = StorageMetadata( - id='dummy-id', - name='dummy-name', - accessed_at=datetime.now(timezone.utc), - created_at=datetime.now(timezone.utc), - modified_at=datetime.now(timezone.utc), - 
extra_attribute='extra', + # Add some items to the dataset + items = [ + {'id': 1, 'name': 'Item 1'}, + {'id': 2, 'name': 'Item 2'}, + {'id': 3, 'name': 'Item 3'}, + ] + await dataset.push_data(items) + + # Export to CSV + await dataset.export_to( + key='dataset_export.csv', + content_type='csv', + to_kvs_name='export_kvs', + to_kvs_storage_client=storage_client, ) - dataset = Dataset.from_storage_object(storage_client, storage_object) + # Retrieve the exported file + record = await kvs.get_value(key='dataset_export.csv') + assert record is not None + + # Verify content has all the items + assert 'id,name' in record + assert '1,Item 1' in record + assert '2,Item 2' in record + assert '3,Item 3' in record + + await kvs.drop() + + +async def test_export_to_invalid_content_type(dataset: Dataset) -> None: + """Test exporting dataset with invalid content type raises error.""" + with pytest.raises(ValueError, match='Unsupported content type'): + await dataset.export_to( + key='invalid_export', + content_type='invalid', # type: ignore[call-overload] # Intentionally invalid content type + ) + + +async def test_large_dataset(dataset: Dataset) -> None: + """Test handling a large dataset with many items.""" + items = [{'id': i, 'value': f'value-{i}'} for i in range(100)] + await dataset.push_data(items) + + # Test that all items are retrieved + result = await dataset.get_data(limit=None) + assert result.count == 100 + assert result.total == 100 + + # Test pagination with large datasets + result = await dataset.get_data(offset=50, limit=25) + assert result.count == 25 + assert result.offset == 50 + assert result.items[0]['id'] == 50 + assert result.items[-1]['id'] == 74 + + +async def test_purge( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test purging a dataset removes all data but keeps the dataset itself.""" + # First create a dataset + dataset = await Dataset.open( + name='purge_test_dataset', + storage_client=storage_client, + configuration=configuration, + ) - assert dataset.id == storage_object.id - assert dataset.name == storage_object.name - assert dataset.storage_object == storage_object - assert storage_object.model_extra.get('extra_attribute') == 'extra' # type: ignore[union-attr] + # Add some data + initial_items = [ + {'id': 1, 'name': 'Item 1'}, + {'id': 2, 'name': 'Item 2'}, + {'id': 3, 'name': 'Item 3'}, + ] + await dataset.push_data(initial_items) + + # Verify data was added + data = await dataset.get_data() + assert data.count == 3 + assert data.total == 3 + assert dataset.metadata.item_count == 3 + + # Record the dataset ID + dataset_id = dataset.id + + # Purge the dataset + await dataset.purge() + + # Verify the dataset still exists but is empty + assert dataset.id == dataset_id # Same ID preserved + assert dataset.name == 'purge_test_dataset' # Same name preserved + + # Dataset should be empty now + data = await dataset.get_data() + assert data.count == 0 + assert data.total == 0 + assert dataset.metadata.item_count == 0 + + # Verify we can add new data after purging + new_item = {'id': 4, 'name': 'New Item After Purge'} + await dataset.push_data(new_item) + + data = await dataset.get_data() + assert data.count == 1 + assert data.items[0]['name'] == 'New Item After Purge' + + # Clean up + await dataset.drop() diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index ea3f4e5f7d..5a52cedb64 100644 --- a/tests/unit/storages/test_key_value_store.py +++ 
b/tests/unit/storages/test_key_value_store.py @@ -1,229 +1,487 @@ +# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed +# https://github.com/apify/crawlee-python/issues/146 + from __future__ import annotations -import asyncio -from datetime import datetime, timedelta, timezone -from itertools import chain, repeat -from typing import TYPE_CHECKING, cast -from unittest.mock import patch -from urllib.parse import urlparse +import json +from typing import TYPE_CHECKING import pytest -from crawlee import service_locator -from crawlee.events import EventManager -from crawlee.storage_clients.models import StorageMetadata +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import KeyValueStore if TYPE_CHECKING: from collections.abc import AsyncGenerator + from pathlib import Path - from crawlee._types import JsonSerializable - + from crawlee.storage_clients import StorageClient -@pytest.fixture -async def mock_event_manager() -> AsyncGenerator[EventManager, None]: - async with EventManager(persist_state_interval=timedelta(milliseconds=50)) as event_manager: - with patch('crawlee.service_locator.get_event_manager', return_value=event_manager): - yield event_manager +@pytest.fixture(params=['memory', 'file_system']) +def storage_client(request: pytest.FixtureRequest) -> StorageClient: + """Parameterized fixture to test with different storage clients.""" + if request.param == 'memory': + return MemoryStorageClient() -async def test_open() -> None: - default_key_value_store = await KeyValueStore.open() - default_key_value_store_by_id = await KeyValueStore.open(id=default_key_value_store.id) + return FileSystemStorageClient() - assert default_key_value_store is default_key_value_store_by_id - key_value_store_name = 'dummy-name' - named_key_value_store = await KeyValueStore.open(name=key_value_store_name) - assert default_key_value_store is not named_key_value_store +@pytest.fixture +def configuration(tmp_path: Path) -> Configuration: + """Provide a configuration with a temporary storage directory.""" + return Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) - with pytest.raises(RuntimeError, match='KeyValueStore with id "nonexistent-id" does not exist!'): - await KeyValueStore.open(id='nonexistent-id') - # Test that when you try to open a key-value store by ID and you use a name of an existing key-value store, - # it doesn't work - with pytest.raises(RuntimeError, match='KeyValueStore with id "dummy-name" does not exist!'): - await KeyValueStore.open(id='dummy-name') +@pytest.fixture +async def kvs( + storage_client: StorageClient, + configuration: Configuration, +) -> AsyncGenerator[KeyValueStore, None]: + """Fixture that provides a key-value store instance for each test.""" + kvs = await KeyValueStore.open( + storage_client=storage_client, + configuration=configuration, + ) + yield kvs + await kvs.drop() -async def test_open_save_storage_object() -> None: - default_key_value_store = await KeyValueStore.open() - assert default_key_value_store.storage_object is not None - assert default_key_value_store.storage_object.id == default_key_value_store.id +async def test_open_creates_new_kvs( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test that open() creates a new key-value store with proper metadata.""" + kvs = await KeyValueStore.open( + name='new_kvs', + storage_client=storage_client, + 
configuration=configuration, + ) + # Verify key-value store properties + assert kvs.id is not None + assert kvs.name == 'new_kvs' -async def test_consistency_accross_two_clients() -> None: - kvs = await KeyValueStore.open(name='my-kvs') - await kvs.set_value('key', 'value') + await kvs.drop() - kvs_by_id = await KeyValueStore.open(id=kvs.id) - await kvs_by_id.set_value('key2', 'value2') - assert (await kvs.get_value('key')) == 'value' - assert (await kvs.get_value('key2')) == 'value2' +async def test_open_existing_kvs( + kvs: KeyValueStore, + storage_client: StorageClient, +) -> None: + """Test that open() loads an existing key-value store correctly.""" + # Open the same key-value store again + reopened_kvs = await KeyValueStore.open( + name=kvs.name, + storage_client=storage_client, + ) - assert (await kvs_by_id.get_value('key')) == 'value' - assert (await kvs_by_id.get_value('key2')) == 'value2' + # Verify key-value store properties + assert kvs.id == reopened_kvs.id + assert kvs.name == reopened_kvs.name + + # Verify they are the same object (from cache) + assert id(kvs) == id(reopened_kvs) + + +async def test_open_with_id_and_name( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test that open() raises an error when both id and name are provided.""" + with pytest.raises(ValueError, match='Only one of "id" or "name" can be specified'): + await KeyValueStore.open( + id='some-id', + name='some-name', + storage_client=storage_client, + configuration=configuration, + ) + + +async def test_open_by_id( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test opening a key-value store by its ID.""" + # First create a key-value store by name + kvs1 = await KeyValueStore.open( + name='kvs_by_id_test', + storage_client=storage_client, + configuration=configuration, + ) - await kvs.drop() - with pytest.raises(RuntimeError, match='Storage with provided ID was not found'): - await kvs_by_id.drop() + # Add some data to identify it + await kvs1.set_value('test_key', {'test': 'opening_by_id', 'timestamp': 12345}) + # Open the key-value store by ID + kvs2 = await KeyValueStore.open( + id=kvs1.id, + storage_client=storage_client, + configuration=configuration, + ) -async def test_same_references() -> None: - kvs1 = await KeyValueStore.open() - kvs2 = await KeyValueStore.open() - assert kvs1 is kvs2 + # Verify it's the same key-value store + assert kvs2.id == kvs1.id + assert kvs2.name == 'kvs_by_id_test' - kvs_name = 'non-default' - kvs_named1 = await KeyValueStore.open(name=kvs_name) - kvs_named2 = await KeyValueStore.open(name=kvs_name) - assert kvs_named1 is kvs_named2 + # Verify the data is still there + value = await kvs2.get_value('test_key') + assert value is not None + assert value['test'] == 'opening_by_id' + assert value['timestamp'] == 12345 + # Clean up + await kvs2.drop() -async def test_drop() -> None: - kvs1 = await KeyValueStore.open() - await kvs1.drop() - kvs2 = await KeyValueStore.open() - assert kvs1 is not kvs2 +async def test_set_get_value(kvs: KeyValueStore) -> None: + """Test setting and getting a value from the key-value store.""" + # Set a value + test_key = 'test-key' + test_value = {'data': 'value', 'number': 42} + await kvs.set_value(test_key, test_value) -async def test_get_set_value(key_value_store: KeyValueStore) -> None: - await key_value_store.set_value('test-str', 'string') - await key_value_store.set_value('test-int', 123) - await key_value_store.set_value('test-dict', {'abc': '123'}) - str_value = await 
key_value_store.get_value('test-str') - int_value = await key_value_store.get_value('test-int') - dict_value = await key_value_store.get_value('test-dict') - non_existent_value = await key_value_store.get_value('test-non-existent') - assert str_value == 'string' - assert int_value == 123 - assert dict_value['abc'] == '123' - assert non_existent_value is None + # Get the value + result = await kvs.get_value(test_key) + assert result == test_value -async def test_for_each_key(key_value_store: KeyValueStore) -> None: - keys = [item.key async for item in key_value_store.iterate_keys()] - assert len(keys) == 0 +async def test_set_get_none(kvs: KeyValueStore) -> None: + """Test setting and getting None as a value.""" + test_key = 'none-key' + await kvs.set_value(test_key, None) + result = await kvs.get_value(test_key) + assert result is None - for i in range(2001): - await key_value_store.set_value(str(i).zfill(4), i) - index = 0 - async for item in key_value_store.iterate_keys(): - assert item.key == str(index).zfill(4) - index += 1 - assert index == 2001 +async def test_get_value_nonexistent(kvs: KeyValueStore) -> None: + """Test getting a nonexistent value returns None.""" + result = await kvs.get_value('nonexistent-key') + assert result is None -async def test_static_get_set_value(key_value_store: KeyValueStore) -> None: - await key_value_store.set_value('test-static', 'static') - value = await key_value_store.get_value('test-static') - assert value == 'static' +async def test_get_value_with_default(kvs: KeyValueStore) -> None: + """Test getting a nonexistent value with a default value.""" + default_value = {'default': True} + result = await kvs.get_value('nonexistent-key', default_value=default_value) + assert result == default_value -async def test_get_public_url_raises_for_non_existing_key(key_value_store: KeyValueStore) -> None: - with pytest.raises(ValueError, match='was not found'): - await key_value_store.get_public_url('i-do-not-exist') +async def test_set_value_with_content_type(kvs: KeyValueStore) -> None: + """Test setting a value with a specific content type.""" + test_key = 'test-json' + test_value = {'data': 'value', 'items': [1, 2, 3]} + await kvs.set_value(test_key, test_value, content_type='application/json') -async def test_get_public_url(key_value_store: KeyValueStore) -> None: - await key_value_store.set_value('test-static', 'static') - public_url = await key_value_store.get_public_url('test-static') + # Verify the value is retrievable + result = await kvs.get_value(test_key) + assert result == test_value - url = urlparse(public_url) - path = url.netloc if url.netloc else url.path - with open(path) as f: # noqa: ASYNC230 - content = await asyncio.to_thread(f.read) - assert content == 'static' +async def test_delete_value(kvs: KeyValueStore) -> None: + """Test deleting a value from the key-value store.""" + # Set a value first + test_key = 'delete-me' + test_value = 'value to delete' + await kvs.set_value(test_key, test_value) + # Verify value exists + assert await kvs.get_value(test_key) == test_value -async def test_get_auto_saved_value_default_value(key_value_store: KeyValueStore) -> None: - default_value: dict[str, JsonSerializable] = {'hello': 'world'} - value = await key_value_store.get_auto_saved_value('state', default_value) - assert value == default_value + # Delete the value + await kvs.delete_value(test_key) + # Verify value is gone + assert await kvs.get_value(test_key) is None -async def test_get_auto_saved_value_cache_value(key_value_store: KeyValueStore) -> 
None: - default_value: dict[str, JsonSerializable] = {'hello': 'world'} - key_name = 'state' - value = await key_value_store.get_auto_saved_value(key_name, default_value) - value['hello'] = 'new_world' - value_one = await key_value_store.get_auto_saved_value(key_name) - assert value_one == {'hello': 'new_world'} +async def test_list_keys_empty_kvs(kvs: KeyValueStore) -> None: + """Test listing keys from an empty key-value store.""" + keys = await kvs.list_keys() + assert len(keys) == 0 - value_one['hello'] = ['new_world'] - value_two = await key_value_store.get_auto_saved_value(key_name) - assert value_two == {'hello': ['new_world']} +async def test_list_keys(kvs: KeyValueStore) -> None: + """Test listing keys from a key-value store with items.""" + # Add some items + await kvs.set_value('key1', 'value1') + await kvs.set_value('key2', 'value2') + await kvs.set_value('key3', 'value3') + + # List keys + keys = await kvs.list_keys() + + # Verify keys + assert len(keys) == 3 + key_names = [k.key for k in keys] + assert 'key1' in key_names + assert 'key2' in key_names + assert 'key3' in key_names + + +async def test_list_keys_with_limit(kvs: KeyValueStore) -> None: + """Test listing keys with a limit parameter.""" + # Add some items + for i in range(10): + await kvs.set_value(f'key{i}', f'value{i}') + + # List with limit + keys = await kvs.list_keys(limit=5) + assert len(keys) == 5 + + +async def test_list_keys_with_exclusive_start_key(kvs: KeyValueStore) -> None: + """Test listing keys with an exclusive start key.""" + # Add some items in a known order + await kvs.set_value('key1', 'value1') + await kvs.set_value('key2', 'value2') + await kvs.set_value('key3', 'value3') + await kvs.set_value('key4', 'value4') + await kvs.set_value('key5', 'value5') + + # Get all keys first to determine their order + all_keys = await kvs.list_keys() + all_key_names = [k.key for k in all_keys] + + if len(all_key_names) >= 3: + # Start from the second key + start_key = all_key_names[1] + keys = await kvs.list_keys(exclusive_start_key=start_key) + + # We should get all keys after the start key + expected_count = len(all_key_names) - all_key_names.index(start_key) - 1 + assert len(keys) == expected_count + + # First key should be the one after start_key + first_returned_key = keys[0].key + assert first_returned_key != start_key + assert all_key_names.index(first_returned_key) > all_key_names.index(start_key) + + +async def test_iterate_keys(kvs: KeyValueStore) -> None: + """Test iterating over keys in the key-value store.""" + # Add some items + await kvs.set_value('key1', 'value1') + await kvs.set_value('key2', 'value2') + await kvs.set_value('key3', 'value3') + + collected_keys = [key async for key in kvs.iterate_keys()] + + # Verify iteration result + assert len(collected_keys) == 3 + key_names = [k.key for k in collected_keys] + assert 'key1' in key_names + assert 'key2' in key_names + assert 'key3' in key_names + + +async def test_iterate_keys_with_limit(kvs: KeyValueStore) -> None: + """Test iterating over keys with a limit parameter.""" + # Add some items + for i in range(10): + await kvs.set_value(f'key{i}', f'value{i}') + + collected_keys = [key async for key in kvs.iterate_keys(limit=5)] + + # Verify iteration result + assert len(collected_keys) == 5 + + +async def test_drop( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test dropping a key-value store removes it from cache and clears its data.""" + kvs = await KeyValueStore.open( + name='drop_test', + 
storage_client=storage_client, + configuration=configuration, + ) -async def test_get_auto_saved_value_auto_save(key_value_store: KeyValueStore, mock_event_manager: EventManager) -> None: # noqa: ARG001 - # This is not a realtime system and timing constrains can be hard to enforce. - # For the test to avoid flakiness it needs some time tolerance. - autosave_deadline_time = 1 - autosave_check_period = 0.01 + # Add some data + await kvs.set_value('test', 'data') - async def autosaved_within_deadline(key: str, expected_value: dict[str, str]) -> bool: - """Check if the `key_value_store` of `key` has expected value within `autosave_deadline_time` seconds.""" - deadline = datetime.now(tz=timezone.utc) + timedelta(seconds=autosave_deadline_time) - while datetime.now(tz=timezone.utc) < deadline: - await asyncio.sleep(autosave_check_period) - if await key_value_store.get_value(key) == expected_value: - return True - return False + # Drop the key-value store + await kvs.drop() - default_value: dict[str, JsonSerializable] = {'hello': 'world'} - key_name = 'state' - value = await key_value_store.get_auto_saved_value(key_name, default_value) - assert await autosaved_within_deadline(key=key_name, expected_value={'hello': 'world'}) + # Verify key-value store is empty (by creating a new one with the same name) + new_kvs = await KeyValueStore.open( + name='drop_test', + storage_client=storage_client, + configuration=configuration, + ) - value['hello'] = 'new_world' - assert await autosaved_within_deadline(key=key_name, expected_value={'hello': 'new_world'}) + # Attempt to get a previously stored value + result = await new_kvs.get_value('test') + assert result is None + await new_kvs.drop() + + +async def test_reopen_default( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test reopening the default key-value store.""" + # Open the default key-value store + kvs1 = await KeyValueStore.open( + storage_client=storage_client, + configuration=configuration, + ) + # Set a value + await kvs1.set_value('test_key', 'test_value') -async def test_get_auto_saved_value_auto_save_race_conditions(key_value_store: KeyValueStore) -> None: - """Two parallel functions increment global variable obtained by `get_auto_saved_value`. + # Open the default key-value store again + kvs2 = await KeyValueStore.open( + storage_client=storage_client, + configuration=configuration, + ) - Result should be incremented by 2. - Method `get_auto_saved_value` must be implemented in a way that prevents race conditions in such scenario. - Test creates situation where first `get_auto_saved_value` call to kvs gets delayed. 
Such situation can happen - and unless handled, it can cause race condition in getting the state value.""" - await key_value_store.set_value('state', {'counter': 0}) + # Verify they are the same store + assert kvs1.id == kvs2.id + assert kvs1.name == kvs2.name + + # Verify the value is accessible + value1 = await kvs1.get_value('test_key') + value2 = await kvs2.get_value('test_key') + assert value1 == value2 == 'test_value' + + # Verify they are the same object + assert id(kvs1) == id(kvs2) + + +async def test_complex_data_types(kvs: KeyValueStore) -> None: + """Test storing and retrieving complex data types.""" + # Test nested dictionaries + nested_dict = { + 'level1': { + 'level2': { + 'level3': 'deep value', + 'numbers': [1, 2, 3], + }, + }, + 'array': [{'a': 1}, {'b': 2}], + } + await kvs.set_value('nested', nested_dict) + result = await kvs.get_value('nested') + assert result == nested_dict + + # Test lists + test_list = [1, 'string', True, None, {'key': 'value'}] + await kvs.set_value('list', test_list) + result = await kvs.get_value('list') + assert result == test_list + + +async def test_string_data(kvs: KeyValueStore) -> None: + """Test storing and retrieving string data.""" + # Plain string + await kvs.set_value('string', 'simple string') + result = await kvs.get_value('string') + assert result == 'simple string' + + # JSON string + json_string = json.dumps({'key': 'value'}) + await kvs.set_value('json_string', json_string) + result = await kvs.get_value('json_string') + assert result == json_string + + +async def test_key_with_special_characters(kvs: KeyValueStore) -> None: + """Test storing and retrieving values with keys containing special characters.""" + # Key with spaces, slashes, and special characters + special_key = 'key with spaces/and/slashes!@#$%^&*()' + test_value = 'Special key value' + + # Store the value with the special key + await kvs.set_value(key=special_key, value=test_value) + + # Retrieve the value and verify it matches + result = await kvs.get_value(key=special_key) + assert result is not None + assert result == test_value + + # Make sure the key is properly listed + keys = await kvs.list_keys() + key_names = [k.key for k in keys] + assert special_key in key_names + + # Test key deletion + await kvs.delete_value(key=special_key) + assert await kvs.get_value(key=special_key) is None + + +async def test_data_persistence_on_reopen(configuration: Configuration) -> None: + """Test that data persists when reopening a KeyValueStore.""" + kvs1 = await KeyValueStore.open(configuration=configuration) + + await kvs1.set_value('key_123', 'value_123') + + result1 = await kvs1.get_value('key_123') + assert result1 == 'value_123' + + kvs2 = await KeyValueStore.open(configuration=configuration) + + result2 = await kvs2.get_value('key_123') + assert result2 == 'value_123' + assert await kvs1.list_keys() == await kvs2.list_keys() + + await kvs2.set_value('key_456', 'value_456') + + result1 = await kvs1.get_value('key_456') + assert result1 == 'value_456' + + +async def test_purge( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test purging a key-value store removes all values but keeps the store itself.""" + # First create a key-value store + kvs = await KeyValueStore.open( + name='purge_test_kvs', + storage_client=storage_client, + configuration=configuration, + ) - sleep_time_iterator = chain(iter([0.5]), repeat(0)) + # Add some values + await kvs.set_value('key1', 'value1') + await kvs.set_value('key2', 'value2') + await 
kvs.set_value('key3', {'complex': 'value', 'number': 42}) - async def delayed_get_value(key: str, default_value: None = None) -> None: - await asyncio.sleep(next(sleep_time_iterator)) - return await KeyValueStore.get_value(key_value_store, key=key, default_value=default_value) + # Verify values were added + keys = await kvs.list_keys() + assert len(keys) == 3 - async def increment_counter() -> None: - state = cast('dict[str, int]', await key_value_store.get_auto_saved_value('state')) - state['counter'] += 1 + # Record the store ID + kvs_id = kvs.id - with patch.object(key_value_store, 'get_value', delayed_get_value): - tasks = [asyncio.create_task(increment_counter()), asyncio.create_task(increment_counter())] - await asyncio.gather(*tasks) + # Purge the key-value store + await kvs.purge() - assert (await key_value_store.get_auto_saved_value('state'))['counter'] == 2 + # Verify the store still exists but is empty + assert kvs.id == kvs_id # Same ID preserved + assert kvs.name == 'purge_test_kvs' # Same name preserved + # Store should be empty now + keys = await kvs.list_keys() + assert len(keys) == 0 -async def test_from_storage_object() -> None: - storage_client = service_locator.get_storage_client() + # Values should no longer be accessible + assert await kvs.get_value('key1') is None + assert await kvs.get_value('key2') is None + assert await kvs.get_value('key3') is None - storage_object = StorageMetadata( - id='dummy-id', - name='dummy-name', - accessed_at=datetime.now(timezone.utc), - created_at=datetime.now(timezone.utc), - modified_at=datetime.now(timezone.utc), - extra_attribute='extra', - ) + # Verify we can add new values after purging + await kvs.set_value('new_key', 'new value after purge') - key_value_store = KeyValueStore.from_storage_object(storage_client, storage_object) + value = await kvs.get_value('new_key') + assert value == 'new value after purge' - assert key_value_store.id == storage_object.id - assert key_value_store.name == storage_object.name - assert key_value_store.storage_object == storage_object - assert storage_object.model_extra.get('extra_attribute') == 'extra' # type: ignore[union-attr] + # Clean up + await kvs.drop() diff --git a/tests/unit/storages/test_request_manager_tandem.py b/tests/unit/storages/test_request_manager_tandem.py index e38ef3d0e8..70240914ec 100644 --- a/tests/unit/storages/test_request_manager_tandem.py +++ b/tests/unit/storages/test_request_manager_tandem.py @@ -56,7 +56,7 @@ async def test_basic_functionality(test_input: TestInput) -> None: request_queue = await RequestQueue.open() if test_input.request_manager_items: - await request_queue.add_requests_batched(test_input.request_manager_items) + await request_queue.add_requests(test_input.request_manager_items) mock_request_loader = create_autospec(RequestLoader, instance=True, spec_set=True) mock_request_loader.fetch_next_request.side_effect = lambda: test_input.request_loader_items.pop(0) diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index cddba8ef99..8c8e227af7 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -1,367 +1,631 @@ +# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed +# https://github.com/apify/crawlee-python/issues/146 + from __future__ import annotations import asyncio -from datetime import datetime, timedelta, timezone -from itertools import count from typing import TYPE_CHECKING -from unittest.mock import AsyncMock, MagicMock import 
pytest -from pydantic import ValidationError - -from crawlee import Request, service_locator -from crawlee._request import RequestState -from crawlee.storage_clients import MemoryStorageClient, StorageClient -from crawlee.storage_clients._memory import RequestQueueClient -from crawlee.storage_clients.models import ( - BatchRequestsOperationResponse, - StorageMetadata, - UnprocessedRequest, -) + +from crawlee import Request +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient from crawlee.storages import RequestQueue if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Sequence + from collections.abc import AsyncGenerator + from pathlib import Path + + +@pytest.fixture(params=['memory', 'file_system']) +def storage_client(request: pytest.FixtureRequest) -> StorageClient: + """Parameterized fixture to test with different storage clients.""" + if request.param == 'memory': + return MemoryStorageClient() + + return FileSystemStorageClient() @pytest.fixture -async def request_queue() -> AsyncGenerator[RequestQueue, None]: - rq = await RequestQueue.open() +def configuration(tmp_path: Path) -> Configuration: + """Provide a configuration with a temporary storage directory.""" + return Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + + +@pytest.fixture +async def rq( + storage_client: StorageClient, + configuration: Configuration, +) -> AsyncGenerator[RequestQueue, None]: + """Fixture that provides a request queue instance for each test.""" + rq = await RequestQueue.open( + storage_client=storage_client, + configuration=configuration, + ) + yield rq await rq.drop() -async def test_open() -> None: - default_request_queue = await RequestQueue.open() - default_request_queue_by_id = await RequestQueue.open(id=default_request_queue.id) +async def test_open_creates_new_rq( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test that open() creates a new request queue with proper metadata.""" + rq = await RequestQueue.open( + name='new_request_queue', + storage_client=storage_client, + configuration=configuration, + ) - assert default_request_queue is default_request_queue_by_id + # Verify request queue properties + assert rq.id is not None + assert rq.name == 'new_request_queue' + assert rq.metadata.pending_request_count == 0 + assert rq.metadata.handled_request_count == 0 + assert rq.metadata.total_request_count == 0 - request_queue_name = 'dummy-name' - named_request_queue = await RequestQueue.open(name=request_queue_name) - assert default_request_queue is not named_request_queue + await rq.drop() - with pytest.raises(RuntimeError, match='RequestQueue with id "nonexistent-id" does not exist!'): - await RequestQueue.open(id='nonexistent-id') - # Test that when you try to open a request queue by ID and you use a name of an existing request queue, - # it doesn't work - with pytest.raises(RuntimeError, match='RequestQueue with id "dummy-name" does not exist!'): - await RequestQueue.open(id='dummy-name') +async def test_open_existing_rq( + rq: RequestQueue, + storage_client: StorageClient, +) -> None: + """Test that open() loads an existing request queue correctly.""" + # Open the same request queue again + reopened_rq = await RequestQueue.open( + name=rq.name, + storage_client=storage_client, + ) + # Verify request queue properties + assert rq.id == reopened_rq.id + assert rq.name == reopened_rq.name -async def 
test_consistency_accross_two_clients() -> None: - request_apify = Request.from_url('https://apify.com') - request_crawlee = Request.from_url('https://crawlee.dev') + # Verify they are the same object (from cache) + assert id(rq) == id(reopened_rq) - rq = await RequestQueue.open(name='my-rq') - await rq.add_request(request_apify) - rq_by_id = await RequestQueue.open(id=rq.id) - await rq_by_id.add_request(request_crawlee) +async def test_open_with_id_and_name( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test that open() raises an error when both id and name are provided.""" + with pytest.raises(ValueError, match='Only one of "id" or "name" can be specified'): + await RequestQueue.open( + id='some-id', + name='some-name', + storage_client=storage_client, + configuration=configuration, + ) - assert await rq.get_total_count() == 2 - assert await rq_by_id.get_total_count() == 2 - assert await rq.fetch_next_request() == request_apify - assert await rq_by_id.fetch_next_request() == request_crawlee +async def test_open_by_id( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test opening a request queue by its ID.""" + # First create a request queue by name + rq1 = await RequestQueue.open( + name='rq_by_id_test', + storage_client=storage_client, + configuration=configuration, + ) - await rq.drop() - with pytest.raises(RuntimeError, match='Storage with provided ID was not found'): - await rq_by_id.drop() + # Add a request to identify it + await rq1.add_request('https://example.com/open-by-id-test') + # Open the request queue by ID + rq2 = await RequestQueue.open( + id=rq1.id, + storage_client=storage_client, + configuration=configuration, + ) -async def test_same_references() -> None: - rq1 = await RequestQueue.open() - rq2 = await RequestQueue.open() - assert rq1 is rq2 + # Verify it's the same request queue + assert rq2.id == rq1.id + assert rq2.name == 'rq_by_id_test' - rq_name = 'non-default' - rq_named1 = await RequestQueue.open(name=rq_name) - rq_named2 = await RequestQueue.open(name=rq_name) - assert rq_named1 is rq_named2 + # Verify the request is still there + request = await rq2.fetch_next_request() + assert request is not None + assert request.url == 'https://example.com/open-by-id-test' + # Clean up + await rq2.drop() -async def test_drop() -> None: - rq1 = await RequestQueue.open() - await rq1.drop() - rq2 = await RequestQueue.open() - assert rq1 is not rq2 +async def test_add_request_string_url(rq: RequestQueue) -> None: + """Test adding a request with a string URL.""" + # Add a request with a string URL + url = 'https://example.com' + result = await rq.add_request(url) -async def test_get_request(request_queue: RequestQueue) -> None: - request = Request.from_url('https://example.com') - processed_request = await request_queue.add_request(request) - assert request.id == processed_request.id - request_2 = await request_queue.get_request(request.id) - assert request_2 is not None - assert request == request_2 + # Verify request was added + assert result.id is not None + assert result.unique_key is not None + assert result.was_already_present is False + assert result.was_already_handled is False + # Verify the queue stats were updated + assert rq.metadata.total_request_count == 1 + assert rq.metadata.pending_request_count == 1 -async def test_add_fetch_handle_request(request_queue: RequestQueue) -> None: - request = Request.from_url('https://example.com') - assert await request_queue.is_empty() is True - 
add_request_info = await request_queue.add_request(request) - assert add_request_info.was_already_present is False - assert add_request_info.was_already_handled is False - assert await request_queue.is_empty() is False +async def test_add_request_object(rq: RequestQueue) -> None: + """Test adding a request object.""" + # Create and add a request object + request = Request.from_url(url='https://example.com', user_data={'key': 'value'}) + result = await rq.add_request(request) - # Fetch the request - next_request = await request_queue.fetch_next_request() - assert next_request is not None + # Verify request was added + assert result.id is not None + assert result.unique_key is not None + assert result.was_already_present is False + assert result.was_already_handled is False - # Mark it as handled - next_request.handled_at = datetime.now(timezone.utc) - processed_request = await request_queue.mark_request_as_handled(next_request) + # Verify the queue stats were updated + assert rq.metadata.total_request_count == 1 + assert rq.metadata.pending_request_count == 1 - assert processed_request is not None - assert processed_request.id == request.id - assert processed_request.unique_key == request.unique_key - assert await request_queue.is_finished() is True +async def test_add_duplicate_request(rq: RequestQueue) -> None: + """Test adding a duplicate request to the queue.""" + # Add a request + url = 'https://example.com' + first_result = await rq.add_request(url) -async def test_reclaim_request(request_queue: RequestQueue) -> None: - request = Request.from_url('https://example.com') - await request_queue.add_request(request) + # Add the same request again + second_result = await rq.add_request(url) - # Fetch the request - next_request = await request_queue.fetch_next_request() - assert next_request is not None - assert next_request.unique_key == request.url - - # Reclaim - await request_queue.reclaim_request(next_request) - # Try to fetch again after a few secs - await asyncio.sleep(4) # 3 seconds is the consistency delay in request queue - next_again = await request_queue.fetch_next_request() - - assert next_again is not None - assert next_again.id == request.id - assert next_again.unique_key == request.unique_key - - -@pytest.mark.parametrize( - 'requests', - [ - [Request.from_url('https://apify.com')], - ['https://crawlee.dev'], - [Request.from_url(f'https://example.com/{i}') for i in range(10)], - [f'https://example.com/{i}' for i in range(15)], - ], - ids=['single-request', 'single-url', 'multiple-requests', 'multiple-urls'], -) -async def test_add_batched_requests( - request_queue: RequestQueue, - requests: Sequence[str | Request], -) -> None: - request_count = len(requests) + # Verify the second request was detected as duplicate + assert second_result.was_already_present is True + assert second_result.unique_key == first_result.unique_key - # Add the requests to the RQ in batches - await request_queue.add_requests_batched(requests, wait_for_all_requests_to_be_added=True) + # Verify the queue stats weren't incremented twice + assert rq.metadata.total_request_count == 1 + assert rq.metadata.pending_request_count == 1 - # Ensure the batch was processed correctly - assert await request_queue.get_total_count() == request_count - # Fetch and validate each request in the queue - for original_request in requests: - next_request = await request_queue.fetch_next_request() - assert next_request is not None +async def test_add_requests_batch(rq: RequestQueue) -> None: + """Test adding multiple requests 
in a batch.""" + # Create a batch of requests + urls = [ + 'https://example.com/page1', + 'https://example.com/page2', + 'https://example.com/page3', + ] - expected_url = original_request if isinstance(original_request, str) else original_request.url - assert next_request.url == expected_url + # Add the requests + await rq.add_requests(urls) - # Confirm the queue is empty after processing all requests - assert await request_queue.is_empty() is True + # Wait for all background tasks to complete + await asyncio.sleep(0.1) + # Verify the queue stats + assert rq.metadata.total_request_count == 3 + assert rq.metadata.pending_request_count == 3 -async def test_invalid_user_data_serialization() -> None: - with pytest.raises(ValidationError): - Request.from_url( - 'https://crawlee.dev', - user_data={ - 'foo': datetime(year=2020, month=7, day=4, tzinfo=timezone.utc), - 'bar': {datetime(year=2020, month=4, day=7, tzinfo=timezone.utc)}, - }, - ) +async def test_add_requests_batch_with_forefront(rq: RequestQueue) -> None: + """Test adding multiple requests in a batch with forefront option.""" + # Add some initial requests + await rq.add_request('https://example.com/page1') + await rq.add_request('https://example.com/page2') + + # Add a batch of priority requests at the forefront -async def test_user_data_serialization(request_queue: RequestQueue) -> None: - request = Request.from_url( - 'https://crawlee.dev', - user_data={ - 'hello': 'world', - 'foo': 42, - }, + await rq.add_requests( + [ + 'https://example.com/priority1', + 'https://example.com/priority2', + 'https://example.com/priority3', + ], + forefront=True, ) - await request_queue.add_request(request) + # Wait for all background tasks to complete + await asyncio.sleep(0.1) + + # Fetch requests - they should come out in priority order first + next_request1 = await rq.fetch_next_request() + assert next_request1 is not None + assert next_request1.url.startswith('https://example.com/priority') + + next_request2 = await rq.fetch_next_request() + assert next_request2 is not None + assert next_request2.url.startswith('https://example.com/priority') + + next_request3 = await rq.fetch_next_request() + assert next_request3 is not None + assert next_request3.url.startswith('https://example.com/priority') - dequeued_request = await request_queue.fetch_next_request() - assert dequeued_request is not None + # Now we should get the original requests + next_request4 = await rq.fetch_next_request() + assert next_request4 is not None + assert next_request4.url == 'https://example.com/page1' - assert dequeued_request.user_data['hello'] == 'world' - assert dequeued_request.user_data['foo'] == 42 + next_request5 = await rq.fetch_next_request() + assert next_request5 is not None + assert next_request5.url == 'https://example.com/page2' + # Queue should be empty now + next_request6 = await rq.fetch_next_request() + assert next_request6 is None -async def test_complex_user_data_serialization(request_queue: RequestQueue) -> None: - request = Request.from_url('https://crawlee.dev') - request.user_data['hello'] = 'world' - request.user_data['foo'] = 42 - request.crawlee_data.max_retries = 1 - request.crawlee_data.state = RequestState.ERROR_HANDLER - await request_queue.add_request(request) +async def test_add_requests_with_forefront(rq: RequestQueue) -> None: + """Test adding requests to the front of the queue.""" + # Add some initial requests + await rq.add_request('https://example.com/page1') + await rq.add_request('https://example.com/page2') - dequeued_request = 
await request_queue.fetch_next_request() - assert dequeued_request is not None + # Add a priority request at the forefront + await rq.add_request('https://example.com/priority', forefront=True) + + # Fetch the next request - should be the priority one + next_request = await rq.fetch_next_request() + assert next_request is not None + assert next_request.url == 'https://example.com/priority' + + +async def test_add_requests_mixed_forefront(rq: RequestQueue) -> None: + """Test the ordering when adding requests with mixed forefront values.""" + # Add normal requests + await rq.add_request('https://example.com/normal1') + await rq.add_request('https://example.com/normal2') + + # Add a batch with forefront=True + await rq.add_requests( + ['https://example.com/priority1', 'https://example.com/priority2'], + forefront=True, + ) - data = dequeued_request.model_dump(by_alias=True) - assert data['userData']['hello'] == 'world' - assert data['userData']['foo'] == 42 - assert data['userData']['__crawlee'] == { - 'maxRetries': 1, - 'state': RequestState.ERROR_HANDLER, - } + # Add another normal request + await rq.add_request('https://example.com/normal3') + # Add another priority request + await rq.add_request('https://example.com/priority3', forefront=True) -async def test_deduplication_of_requests_with_custom_unique_key() -> None: - with pytest.raises(ValueError, match='`always_enqueue` cannot be used with a custom `unique_key`'): - Request.from_url('https://apify.com', unique_key='apify', always_enqueue=True) + # Wait for background tasks + await asyncio.sleep(0.1) + # The expected order should be: + # 1. priority3 (most recent forefront) + # 2. priority1 (from batch, forefront) + # 3. priority2 (from batch, forefront) + # 4. normal1 (oldest normal) + # 5. normal2 + # 6. 
normal3 (newest normal) -async def test_deduplication_of_requests_with_invalid_custom_unique_key() -> None: - request_1 = Request.from_url('https://apify.com', always_enqueue=True) - request_2 = Request.from_url('https://apify.com', always_enqueue=True) + requests = [] + while True: + req = await rq.fetch_next_request() + if req is None: + break + requests.append(req) + await rq.mark_request_as_handled(req) - rq = await RequestQueue.open(name='my-rq') - await rq.add_request(request_1) - await rq.add_request(request_2) + assert len(requests) == 6 + assert requests[0].url == 'https://example.com/priority3' - assert await rq.get_total_count() == 2 + # The next two should be from the forefront batch (exact order within batch may vary) + batch_urls = {requests[1].url, requests[2].url} + assert 'https://example.com/priority1' in batch_urls + assert 'https://example.com/priority2' in batch_urls - assert await rq.fetch_next_request() == request_1 - assert await rq.fetch_next_request() == request_2 + # Then the normal requests in order + assert requests[3].url == 'https://example.com/normal1' + assert requests[4].url == 'https://example.com/normal2' + assert requests[5].url == 'https://example.com/normal3' -async def test_deduplication_of_requests_with_valid_custom_unique_key() -> None: - request_1 = Request.from_url('https://apify.com') - request_2 = Request.from_url('https://apify.com') +async def test_fetch_next_request_and_mark_handled(rq: RequestQueue) -> None: + """Test fetching and marking requests as handled.""" + # Add some requests + await rq.add_request('https://example.com/page1') + await rq.add_request('https://example.com/page2') - rq = await RequestQueue.open(name='my-rq') - await rq.add_request(request_1) - await rq.add_request(request_2) + # Fetch first request + request1 = await rq.fetch_next_request() + assert request1 is not None + assert request1.url == 'https://example.com/page1' - assert await rq.get_total_count() == 1 + # Mark the request as handled + result = await rq.mark_request_as_handled(request1) + assert result is not None + assert result.was_already_handled is True - assert await rq.fetch_next_request() == request_1 + # Fetch next request + request2 = await rq.fetch_next_request() + assert request2 is not None + assert request2.url == 'https://example.com/page2' + # Mark the second request as handled + await rq.mark_request_as_handled(request2) -async def test_cache_requests(request_queue: RequestQueue) -> None: - request_1 = Request.from_url('https://apify.com') - request_2 = Request.from_url('https://crawlee.dev') + # Verify counts + assert rq.metadata.total_request_count == 2 + assert rq.metadata.handled_request_count == 2 + assert rq.metadata.pending_request_count == 0 - await request_queue.add_request(request_1) - await request_queue.add_request(request_2) + # Verify queue is empty + empty_request = await rq.fetch_next_request() + assert empty_request is None - assert request_queue._requests_cache.currsize == 2 - fetched_request = await request_queue.fetch_next_request() +async def test_get_request_by_id(rq: RequestQueue) -> None: + """Test retrieving a request by its ID.""" + # Add a request + added_result = await rq.add_request('https://example.com') + request_id = added_result.id - assert fetched_request is not None - assert fetched_request.id == request_1.id + # Retrieve the request by ID + retrieved_request = await rq.get_request(request_id) + assert retrieved_request is not None + assert retrieved_request.id == request_id + assert retrieved_request.url == 
'https://example.com' - # After calling fetch_next_request request_1 moved to the end of the cache store. - cached_items = [request_queue._requests_cache.popitem()[0] for _ in range(2)] - assert cached_items == [request_2.id, request_1.id] +async def test_get_non_existent_request(rq: RequestQueue) -> None: + """Test retrieving a request that doesn't exist.""" + non_existent_request = await rq.get_request('non-existent-id') + assert non_existent_request is None + + +async def test_reclaim_request(rq: RequestQueue) -> None: + """Test reclaiming a request that failed processing.""" + # Add a request + await rq.add_request('https://example.com') + + # Fetch the request + request = await rq.fetch_next_request() + assert request is not None + + # Reclaim the request + result = await rq.reclaim_request(request) + assert result is not None + assert result.was_already_handled is False + + # Verify we can fetch it again + reclaimed_request = await rq.fetch_next_request() + assert reclaimed_request is not None + assert reclaimed_request.id == request.id + assert reclaimed_request.url == 'https://example.com' + + +async def test_reclaim_request_with_forefront(rq: RequestQueue) -> None: + """Test reclaiming a request to the front of the queue.""" + # Add requests + await rq.add_request('https://example.com/first') + await rq.add_request('https://example.com/second') + + # Fetch the first request + first_request = await rq.fetch_next_request() + assert first_request is not None + assert first_request.url == 'https://example.com/first' + + # Reclaim it to the forefront + await rq.reclaim_request(first_request, forefront=True) + + # The reclaimed request should be returned first (before the second request) + next_request = await rq.fetch_next_request() + assert next_request is not None + assert next_request.url == 'https://example.com/first' -async def test_from_storage_object() -> None: - storage_client = service_locator.get_storage_client() - storage_object = StorageMetadata( - id='dummy-id', - name='dummy-name', - accessed_at=datetime.now(timezone.utc), - created_at=datetime.now(timezone.utc), - modified_at=datetime.now(timezone.utc), - extra_attribute='extra', +async def test_is_empty(rq: RequestQueue) -> None: + """Test checking if a request queue is empty.""" + # Initially the queue should be empty + assert await rq.is_empty() is True + + # Add a request + await rq.add_request('https://example.com') + assert await rq.is_empty() is False + + # Fetch and handle the request + request = await rq.fetch_next_request() + + assert request is not None + await rq.mark_request_as_handled(request) + + # Queue should be empty again + assert await rq.is_empty() is True + + +async def test_is_finished(rq: RequestQueue) -> None: + """Test checking if a request queue is finished.""" + # Initially the queue should be finished (empty and no background tasks) + assert await rq.is_finished() is True + + # Add a request + await rq.add_request('https://example.com') + assert await rq.is_finished() is False + + # Add requests in the background + await rq.add_requests( + ['https://example.com/1', 'https://example.com/2'], + wait_for_all_requests_to_be_added=False, + ) + + # Queue shouldn't be finished while background tasks are running + assert await rq.is_finished() is False + + # Wait for background tasks to finish + await asyncio.sleep(0.2) + + # Process all requests + while True: + request = await rq.fetch_next_request() + if request is None: + break + await rq.mark_request_as_handled(request) + + # Now queue should be 
finished + assert await rq.is_finished() is True + + +async def test_mark_non_existent_request_as_handled(rq: RequestQueue) -> None: + """Test marking a non-existent request as handled.""" + # Create a request that hasn't been added to the queue + request = Request.from_url(url='https://example.com', id='non-existent-id') + + # Attempt to mark it as handled + result = await rq.mark_request_as_handled(request) + assert result is None + + +async def test_reclaim_non_existent_request(rq: RequestQueue) -> None: + """Test reclaiming a non-existent request.""" + # Create a request that hasn't been added to the queue + request = Request.from_url(url='https://example.com', id='non-existent-id') + + # Attempt to reclaim it + result = await rq.reclaim_request(request) + assert result is None + + +async def test_drop( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test dropping a request queue removes it from cache and clears its data.""" + rq = await RequestQueue.open( + name='drop_test', + storage_client=storage_client, + configuration=configuration, + ) + + # Add a request + await rq.add_request('https://example.com') + + # Drop the request queue + await rq.drop() + + # Verify request queue is empty (by creating a new one with the same name) + new_rq = await RequestQueue.open( + name='drop_test', + storage_client=storage_client, + configuration=configuration, ) - request_queue = RequestQueue.from_storage_object(storage_client, storage_object) - - assert request_queue.id == storage_object.id - assert request_queue.name == storage_object.name - assert request_queue.storage_object == storage_object - assert storage_object.model_extra.get('extra_attribute') == 'extra' # type: ignore[union-attr] - - -async def test_add_batched_requests_with_retry(request_queue: RequestQueue) -> None: - """Test that unprocessed requests are retried. - - Unprocessed requests should not count in `get_total_count` - Test creates situation where in `batch_add_requests` call in first batch 3 requests are unprocessed. - On each following `batch_add_requests` call the last request in batch remains unprocessed. - In this test `batch_add_requests` is called once with batch of 10 requests. With retries only 1 request should - remain unprocessed.""" - - batch_add_requests_call_counter = count(start=1) - service_locator.get_storage_client() - initial_request_count = 10 - expected_added_requests = 9 - requests = [f'https://example.com/{i}' for i in range(initial_request_count)] - - class MockedRequestQueueClient(RequestQueueClient): - """Patched memory storage client that simulates unprocessed requests.""" - - async def _batch_add_requests_without_last_n( - self, batch: Sequence[Request], n: int = 0 - ) -> BatchRequestsOperationResponse: - response = await super().batch_add_requests(batch[:-n]) - response.unprocessed_requests = [ - UnprocessedRequest(url=r.url, unique_key=r.unique_key, method=r.method) for r in batch[-n:] - ] - return response - - async def batch_add_requests( - self, - requests: Sequence[Request], - *, - forefront: bool = False, # noqa: ARG002 - ) -> BatchRequestsOperationResponse: - """Mocked client behavior that simulates unprocessed requests. - - It processes all except last three at first run, then all except last none. - Overall if tried with the same batch it will process all except the last one. 
- """ - call_count = next(batch_add_requests_call_counter) - if call_count == 1: - # Process all but last three - return await self._batch_add_requests_without_last_n(requests, n=3) - # Process all but last - return await self._batch_add_requests_without_last_n(requests, n=1) - - mocked_storage_client = AsyncMock(spec=StorageClient) - mocked_storage_client.request_queue = MagicMock( - return_value=MockedRequestQueueClient(id='default', memory_storage_client=MemoryStorageClient.from_config()) + # Verify the queue is empty + assert await new_rq.is_empty() is True + assert new_rq.metadata.total_request_count == 0 + assert new_rq.metadata.pending_request_count == 0 + await new_rq.drop() + + +async def test_reopen_default( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test reopening the default request queue.""" + # First clean up any class-level caches + RequestQueue._cache_by_id.clear() + RequestQueue._cache_by_name.clear() + RequestQueue._default_instance = None + + # Open the default request queue + rq1 = await RequestQueue.open( + storage_client=storage_client, + configuration=configuration, ) - request_queue = RequestQueue(id='default', name='some_name', storage_client=mocked_storage_client) + # If a request queue already exists (due to previous test run), purge it to start fresh + try: + await rq1.purge() + except Exception: + # If purge fails, try dropping and recreating + await rq1.drop() + rq1 = await RequestQueue.open( + storage_client=storage_client, + configuration=configuration, + ) + + # Verify we're starting fresh + assert rq1.metadata.pending_request_count == 0 - # Add the requests to the RQ in batches - await request_queue.add_requests_batched( - requests, wait_for_all_requests_to_be_added=True, wait_time_between_batches=timedelta(0) + # Add a request + await rq1.add_request('https://example.com/') + + # Verify the request was added + assert rq1.metadata.pending_request_count == 1 + + # Open the default request queue again + rq2 = await RequestQueue.open( + storage_client=storage_client, + configuration=configuration, ) - # Ensure the batch was processed correctly - assert await request_queue.get_total_count() == expected_added_requests - # Fetch and validate each request in the queue - for original_request in requests[:expected_added_requests]: - next_request = await request_queue.fetch_next_request() - assert next_request is not None + # Verify they are the same queue + assert rq1.id == rq2.id + assert rq1.name == rq2.name + assert rq1.metadata.total_request_count == rq2.metadata.total_request_count + assert rq1.metadata.pending_request_count == rq2.metadata.pending_request_count + assert rq1.metadata.handled_request_count == rq2.metadata.handled_request_count + + # Verify the request is accessible + request = await rq2.fetch_next_request() + assert request is not None + assert request.url == 'https://example.com/' + + # Clean up after the test + await rq1.drop() - expected_url = original_request if isinstance(original_request, str) else original_request.url - assert next_request.url == expected_url - # Confirm the queue is empty after processing all requests - assert await request_queue.is_empty() is True +async def test_purge( + storage_client: StorageClient, + configuration: Configuration, +) -> None: + """Test purging a request queue removes all requests but keeps the queue itself.""" + # First create a request queue + rq = await RequestQueue.open( + name='purge_test_queue', + storage_client=storage_client, + 
configuration=configuration, + ) + + # Add some requests + await rq.add_requests( + [ + 'https://example.com/page1', + 'https://example.com/page2', + 'https://example.com/page3', + ] + ) + + # Verify requests were added + assert rq.metadata.total_request_count == 3 + assert rq.metadata.pending_request_count == 3 + + # Record the queue ID + queue_id = rq.id + + # Purge the queue + await rq.purge() + + # Verify the queue still exists but is empty + assert rq.id == queue_id # Same ID preserved + assert rq.name == 'purge_test_queue' # Same name preserved + + # Queue should be empty now + assert rq.metadata.total_request_count == 0 + assert rq.metadata.pending_request_count == 0 + assert rq.metadata.handled_request_count == 0 + assert await rq.is_empty() is True + + # Verify we can add new requests after purging + await rq.add_request('https://example.com/new-after-purge') + + request = await rq.fetch_next_request() + assert request is not None + assert request.url == 'https://example.com/new-after-purge' + + # Clean up + await rq.drop() diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index 73e17d50d9..f89401e5be 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -9,6 +9,7 @@ from crawlee.configuration import Configuration from crawlee.crawlers import HttpCrawler, HttpCrawlingContext from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient if TYPE_CHECKING: from pathlib import Path @@ -35,14 +36,15 @@ def test_global_configuration_works_reversed() -> None: async def test_storage_not_persisted_when_disabled(tmp_path: Path, server_url: URL) -> None: - config = Configuration( - persist_storage=False, - write_metadata=False, + configuration = Configuration( crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] ) - storage_client = MemoryStorageClient.from_config(config) + storage_client = MemoryStorageClient() - crawler = HttpCrawler(storage_client=storage_client) + crawler = HttpCrawler( + configuration=configuration, + storage_client=storage_client, + ) @crawler.router.default_handler async def default_handler(context: HttpCrawlingContext) -> None: @@ -56,14 +58,16 @@ async def default_handler(context: HttpCrawlingContext) -> None: async def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None: - config = Configuration( - persist_storage=True, - write_metadata=True, + configuration = Configuration( crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] ) - storage_client = MemoryStorageClient.from_config(config) - crawler = HttpCrawler(storage_client=storage_client) + storage_client = FileSystemStorageClient() + + crawler = HttpCrawler( + configuration=configuration, + storage_client=storage_client, + ) @crawler.router.default_handler async def default_handler(context: HttpCrawlingContext) -> None: diff --git a/tests/unit/test_service_locator.py b/tests/unit/test_service_locator.py index 50da5ddb86..a4ed0620dd 100644 --- a/tests/unit/test_service_locator.py +++ b/tests/unit/test_service_locator.py @@ -6,7 +6,7 @@ from crawlee.configuration import Configuration from crawlee.errors import ServiceConflictError from crawlee.events import LocalEventManager -from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient def test_default_configuration() -> None: @@ -72,21 +72,21 @@ def test_event_manager_conflict() -> None: def 
test_default_storage_client() -> None: default_storage_client = service_locator.get_storage_client() - assert isinstance(default_storage_client, MemoryStorageClient) + assert isinstance(default_storage_client, FileSystemStorageClient) def test_custom_storage_client() -> None: - custom_storage_client = MemoryStorageClient.from_config() + custom_storage_client = MemoryStorageClient() service_locator.set_storage_client(custom_storage_client) storage_client = service_locator.get_storage_client() assert storage_client is custom_storage_client def test_storage_client_overwrite() -> None: - custom_storage_client = MemoryStorageClient.from_config() + custom_storage_client = MemoryStorageClient() service_locator.set_storage_client(custom_storage_client) - another_custom_storage_client = MemoryStorageClient.from_config() + another_custom_storage_client = MemoryStorageClient() service_locator.set_storage_client(another_custom_storage_client) assert custom_storage_client != another_custom_storage_client @@ -95,7 +95,7 @@ def test_storage_client_overwrite() -> None: def test_storage_client_conflict() -> None: service_locator.get_storage_client() - custom_storage_client = MemoryStorageClient.from_config() + custom_storage_client = MemoryStorageClient() with pytest.raises(ServiceConflictError, match='StorageClient is already in use.'): service_locator.set_storage_client(custom_storage_client) diff --git a/website/generate_module_shortcuts.py b/website/generate_module_shortcuts.py index 5a18e8d3f3..61acc68ade 100755 --- a/website/generate_module_shortcuts.py +++ b/website/generate_module_shortcuts.py @@ -5,6 +5,7 @@ import importlib import inspect import json +from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -55,5 +56,5 @@ def resolve_shortcuts(shortcuts: dict) -> None: resolve_shortcuts(shortcuts) -with open('module_shortcuts.json', 'w', encoding='utf-8') as shortcuts_file: +with Path('module_shortcuts.json').open('w', encoding='utf-8') as shortcuts_file: json.dump(shortcuts, shortcuts_file, indent=4, sort_keys=True) From dd9be6e809eae147ac15a891959c947d37faaf9e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sat, 10 May 2025 12:30:14 +0200 Subject: [PATCH 02/43] Cleanup --- src/crawlee/_consts.py | 1 + src/crawlee/_types.py | 226 ++++++++++++------ src/crawlee/_utils/data_processing.py | 41 ---- src/crawlee/_utils/file.py | 161 ++++++------- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- .../_base/_request_queue_client.py | 8 +- .../_file_system/_dataset_client.py | 4 +- .../_file_system/_key_value_store_client.py | 5 +- .../_file_system/_request_queue_client.py | 4 +- .../storage_clients/_file_system/_utils.py | 49 ---- .../_memory/_key_value_store_client.py | 3 - .../_memory/_request_queue_client.py | 6 +- src/crawlee/storage_clients/models.py | 170 +------------ src/crawlee/storages/_dataset.py | 3 +- src/crawlee/storages/_types.py | 167 ------------- tests/unit/_utils/test_data_processing.py | 51 ---- tests/unit/_utils/test_file.py | 127 +--------- .../_file_system/test_fs_dataset_client.py | 35 +++ .../_file_system/test_fs_kvs_client.py | 35 +++ .../_file_system/test_fs_rq_client.py | 35 +++ 20 files changed, 343 insertions(+), 790 deletions(-) delete mode 100644 src/crawlee/_utils/data_processing.py delete mode 100644 src/crawlee/storage_clients/_file_system/_utils.py delete mode 100644 src/crawlee/storages/_types.py delete mode 100644 tests/unit/_utils/test_data_processing.py diff --git a/src/crawlee/_consts.py b/src/crawlee/_consts.py index 
d8d40087b0..9345e53e98 100644 --- a/src/crawlee/_consts.py +++ b/src/crawlee/_consts.py @@ -1,3 +1,4 @@ from __future__ import annotations METADATA_FILENAME = '__metadata__.json' +"""The name of the metadata file for storage clients.""" diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 289b705ee2..5aade878eb 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -3,27 +3,41 @@ import dataclasses from collections.abc import Iterator, Mapping from dataclasses import dataclass -from enum import Enum -from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal, Optional, Protocol, TypeVar, Union, cast, overload +from typing import ( + TYPE_CHECKING, + Annotated, + Any, + Callable, + Literal, + Optional, + Protocol, + TypedDict, + TypeVar, + Union, + cast, + overload, +) from pydantic import ConfigDict, Field, PlainValidator, RootModel -from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack from crawlee._utils.docs import docs_group if TYPE_CHECKING: + import json import logging import re - from collections.abc import Coroutine, Sequence + from collections.abc import Callable, Coroutine, Sequence + + from typing_extensions import NotRequired, Required, TypeAlias, Unpack from crawlee import Glob, Request from crawlee._request import RequestOptions + from crawlee.configuration import Configuration from crawlee.http_clients import HttpResponse from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session - from crawlee.storage_clients.models import DatasetItemsListPage + from crawlee.storage_clients import StorageClient from crawlee.storages import KeyValueStore - from crawlee.storages._types import ExportToKwargs, GetDataKwargs # Workaround for https://github.com/pydantic/pydantic/issues/9445 J = TypeVar('J', bound='JsonSerializable') @@ -138,15 +152,6 @@ def __init__( self.max_tasks_per_minute = max_tasks_per_minute -@docs_group('Data structures') -class StorageTypes(str, Enum): - """Possible Crawlee storage types.""" - - DATASET = 'Dataset' - KEY_VALUE_STORE = 'Key-value store' - REQUEST_QUEUE = 'Request queue' - - class EnqueueLinksKwargs(TypedDict): """Keyword arguments for the `enqueue_links` methods.""" @@ -416,55 +421,6 @@ def __call__( """ -@docs_group('Functions') -class ExportToFunction(Protocol): - """A function for exporting data from a `Dataset`. - - It simplifies the process of exporting data from a `Dataset`. It opens the specified one and exports - its content to a `KeyValueStore`. - """ - - def __call__( - self, - dataset_id: str | None = None, - dataset_name: str | None = None, - **kwargs: Unpack[ExportToKwargs], - ) -> Coroutine[None, None, None]: - """Call dunder method. - - Args: - dataset_id: The ID of the `Dataset` to export data from. - dataset_name: The name of the `Dataset` to export data from. - **kwargs: Additional keyword arguments. - """ - - -@docs_group('Functions') -class GetDataFunction(Protocol): - """A function for retrieving data from a `Dataset`. - - It simplifies the process of accessing data from a `Dataset`. It opens the specified one and retrieves - data based on the provided parameters. It allows filtering and pagination. - """ - - def __call__( - self, - dataset_id: str | None = None, - dataset_name: str | None = None, - **kwargs: Unpack[GetDataKwargs], - ) -> Coroutine[None, None, DatasetItemsListPage]: - """Call dunder method. - - Args: - dataset_id: ID of the `Dataset` to get data from. - dataset_name: Name of the `Dataset` to get data from. 
- **kwargs: Additional keyword arguments. - - Returns: - A page of retrieved items. - """ - - @docs_group('Functions') class GetKeyValueStoreFunction(Protocol): """A function for accessing a `KeyValueStore`. @@ -575,18 +531,6 @@ def __bool__(self) -> bool: return bool(self.screenshot or self.html) -@docs_group('Functions') -class GetPageSnapshot(Protocol): - """A function for getting snapshot of a page.""" - - def __call__(self) -> Coroutine[None, None, PageSnapshot]: - """Get page snapshot. - - Returns: - Snapshot of a page. - """ - - @docs_group('Functions') class UseStateFunction(Protocol): """A function for managing state within the crawling context. @@ -654,3 +598,133 @@ async def get_snapshot(self) -> PageSnapshot: def __hash__(self) -> int: """Return hash of the context. Each context is considered unique.""" return id(self) + + +class GetDataKwargs(TypedDict): + """Keyword arguments for dataset's `get_data` method.""" + + offset: NotRequired[int] + """Skips the specified number of items at the start.""" + + limit: NotRequired[int | None] + """The maximum number of items to retrieve. Unlimited if None.""" + + clean: NotRequired[bool] + """Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.""" + + desc: NotRequired[bool] + """Set to True to sort results in descending order.""" + + fields: NotRequired[list[str]] + """Fields to include in each item. Sorts fields as specified if provided.""" + + omit: NotRequired[list[str]] + """Fields to exclude from each item.""" + + unwind: NotRequired[str] + """Unwinds items by a specified array field, turning each element into a separate item.""" + + skip_empty: NotRequired[bool] + """Excludes empty items from the results if True.""" + + skip_hidden: NotRequired[bool] + """Excludes fields starting with '#' if True.""" + + flatten: NotRequired[list[str]] + """Fields to be flattened in returned items.""" + + view: NotRequired[str] + """Specifies the dataset view to be used.""" + + +class ExportToKwargs(TypedDict): + """Keyword arguments for dataset's `export_to` method.""" + + key: Required[str] + """The key under which to save the data.""" + + content_type: NotRequired[Literal['json', 'csv']] + """The format in which to export the data. Either 'json' or 'csv'.""" + + to_kvs_id: NotRequired[str] + """ID of the key-value store to save the exported file.""" + + to_kvs_name: NotRequired[str] + """Name of the key-value store to save the exported file.""" + + to_kvs_storage_client: NotRequired[StorageClient] + """The storage client to use for saving the exported file.""" + + to_kvs_configuration: NotRequired[Configuration] + """The configuration to use for saving the exported file.""" + + +class ExportDataJsonKwargs(TypedDict): + """Keyword arguments for dataset's `export_data_json` method.""" + + skipkeys: NotRequired[bool] + """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped + instead of raising a `TypeError`.""" + + ensure_ascii: NotRequired[bool] + """Determines if non-ASCII characters should be escaped in the output JSON string.""" + + check_circular: NotRequired[bool] + """If False (default: True), skips the circular reference check for container types. A circular reference will + result in a `RecursionError` or worse if unchecked.""" + + allow_nan: NotRequired[bool] + """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply + with the JSON specification. 
If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity).""" + + cls: NotRequired[type[json.JSONEncoder]] + """Allows specifying a custom JSON encoder.""" + + indent: NotRequired[int] + """Specifies the number of spaces for indentation in the pretty-printed JSON output.""" + + separators: NotRequired[tuple[str, str]] + """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ') + otherwise.""" + + default: NotRequired[Callable] + """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version + of the object or raise a `TypeError`.""" + + sort_keys: NotRequired[bool] + """Specifies whether the output JSON object should have keys sorted alphabetically.""" + + +class ExportDataCsvKwargs(TypedDict): + """Keyword arguments for dataset's `export_data_csv` method.""" + + dialect: NotRequired[str] + """Specifies a dialect to be used in CSV parsing and writing.""" + + delimiter: NotRequired[str] + """A one-character string used to separate fields. Defaults to ','.""" + + doublequote: NotRequired[bool] + """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled; + when False, the `escapechar` is used as a prefix. Defaults to True.""" + + escapechar: NotRequired[str] + """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar` + if `doublequote` is False. Defaults to None, disabling escaping.""" + + lineterminator: NotRequired[str] + """The string used to terminate lines produced by the writer. Defaults to '\\r\\n'.""" + + quotechar: NotRequired[str] + """A one-character string used to quote fields containing special characters, like the delimiter or quotechar, + or fields containing new-line characters. Defaults to '\"'.""" + + quoting: NotRequired[int] + """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of + the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`.""" + + skipinitialspace: NotRequired[bool] + """When True, spaces immediately following the delimiter are ignored. Defaults to False.""" + + strict: NotRequired[bool] + """When True, raises an exception on bad CSV input. 
Defaults to False.""" diff --git a/src/crawlee/_utils/data_processing.py b/src/crawlee/_utils/data_processing.py deleted file mode 100644 index e423650952..0000000000 --- a/src/crawlee/_utils/data_processing.py +++ /dev/null @@ -1,41 +0,0 @@ -from __future__ import annotations - -import json -from enum import Enum -from typing import TYPE_CHECKING, Any, NoReturn - -from crawlee._utils.file import ContentType, is_content_type - -if TYPE_CHECKING: - from crawlee._types import StorageTypes - - -def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any: - """Extract the value of an enumeration member if it is an Enum, otherwise return the original value.""" - if isinstance(maybe_enum_member, Enum): - return maybe_enum_member.value - return maybe_enum_member - - -def maybe_parse_body(body: bytes, content_type: str) -> Any: - """Parse the response body based on the content type.""" - if is_content_type(ContentType.JSON, content_type): - s = body.decode('utf-8') - return json.loads(s) - - if is_content_type(ContentType.XML, content_type) or is_content_type(ContentType.TEXT, content_type): - return body.decode('utf-8') - - return body - - -def raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: - """Raise an error indicating that a storage with the provided key name and value already exists.""" - client_type = maybe_extract_enum_member_value(client_type) - raise ValueError(f'{client_type} with {key_name} "{value}" already exists.') - - -def raise_on_non_existing_storage(client_type: StorageTypes, id: str | None) -> NoReturn: - """Raise an error indicating that a storage with the provided id does not exist.""" - client_type = maybe_extract_enum_member_value(client_type) - raise ValueError(f'{client_type} with id "{id}" does not exist.') diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index 4de6804490..7ce1413103 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -1,106 +1,20 @@ from __future__ import annotations import asyncio -import contextlib import csv import json -import mimetypes import os -import re -import shutil -from enum import Enum -from logging import getLogger +import tempfile +from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: from collections.abc import AsyncIterator - from pathlib import Path from typing import Any, TextIO from typing_extensions import Unpack - from crawlee.storages._types import ExportDataCsvKwargs, ExportDataJsonKwargs - -logger = getLogger(__name__) - - -class ContentType(Enum): - JSON = r'^application/json' - TEXT = r'^text/' - XML = r'^application/.*xml$' - - def matches(self, content_type: str) -> bool: - """Check if the content type matches the enum's pattern.""" - return bool(re.search(self.value, content_type, re.IGNORECASE)) - - -def is_content_type(content_type_enum: ContentType, content_type: str) -> bool: - """Check if the provided content type string matches the specified ContentType.""" - return content_type_enum.matches(content_type) - - -async def force_remove(filename: str | Path) -> None: - """Remove a file, suppressing the FileNotFoundError if it does not exist. - - JS-like rm(filename, { force: true }). - - Args: - filename: The path to the file to be removed. - """ - with contextlib.suppress(FileNotFoundError): - await asyncio.to_thread(os.remove, filename) - - -async def force_rename(src_dir: str | Path, dst_dir: str | Path) -> None: - """Rename a directory, ensuring that the destination directory is removed if it exists. 
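(Illustrative aside on the kwargs TypedDicts that the `_types.py` hunk above introduces: since they are plain `TypedDict`s made of `NotRequired` fields, callers can assemble the options as ordinary dictionaries and unpack them into the corresponding `Dataset` calls. A minimal sketch, assuming an already opened `dataset` instance; the export destination argument shown in the comments is an assumption for illustration, not taken from this patch.)

import csv

from crawlee._types import ExportDataCsvKwargs, GetDataKwargs

# Options accepted by Dataset.get_data(); every field is optional (NotRequired).
get_data_kwargs: GetDataKwargs = {'offset': 0, 'limit': 100, 'desc': True, 'skip_empty': True}

# Options forwarded to Python's csv writer by the CSV export path.
csv_kwargs: ExportDataCsvKwargs = {'delimiter': ';', 'quoting': csv.QUOTE_MINIMAL}

# Usage (hypothetical call sites, shown only to indicate where the dicts are unpacked):
#     page = await dataset.get_data(**get_data_kwargs)
#     await dataset.export_data_csv('items.csv', **csv_kwargs)  # destination parameter name is assumed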
- - Args: - src_dir: The source directory path. - dst_dir: The destination directory path. - """ - # Make sure source directory exists - if await asyncio.to_thread(os.path.exists, src_dir): - # Remove destination directory if it exists - if await asyncio.to_thread(os.path.exists, dst_dir): - await asyncio.to_thread(shutil.rmtree, dst_dir, ignore_errors=True) - await asyncio.to_thread(os.rename, src_dir, dst_dir) - - -def determine_file_extension(content_type: str) -> str | None: - """Determine the file extension for a given MIME content type. - - Args: - content_type: The MIME content type string. - - Returns: - A string representing the determined file extension without a leading dot, - or None if no extension could be determined. - """ - # e.g. mimetypes.guess_extension('application/json ') does not work... - actual_content_type = content_type.split(';')[0].strip() - - # mimetypes.guess_extension returns 'xsl' in this case, because 'application/xxx' is "structured" - # ('text/xml' would be "unstructured" and return 'xml') we have to explicitly override it here - if actual_content_type == 'application/xml': - return 'xml' - - # Determine the extension from the mime type - ext = mimetypes.guess_extension(actual_content_type) - - # Remove the leading dot if extension successfully parsed - return ext[1:] if ext is not None else ext - - -async def json_dumps(obj: Any) -> str: - """Serialize an object to a JSON-formatted string with specific settings. - - Args: - obj: The object to serialize. - - Returns: - A string containing the JSON representation of the input object. - """ - return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str) + from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs def infer_mime_type(value: Any) -> str: @@ -128,6 +42,75 @@ def infer_mime_type(value: Any) -> str: return 'application/octet-stream' +async def json_dumps(obj: Any) -> str: + """Serialize an object to a JSON-formatted string with specific settings. + + Args: + obj: The object to serialize. + + Returns: + A string containing the JSON representation of the input object. 
+ """ + return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str) + + +async def atomic_write_text(path: Path, data: str) -> None: + dir_path = path.parent + + def _sync_write_text() -> str: + # create a temp file in the target dir, return its name + fd, tmp_path = tempfile.mkstemp( + suffix=f'{path.suffix}.tmp', + prefix=f'{path.name}.', + dir=str(dir_path), + ) + try: + with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: + tmp_file.write(data) + except: + Path(tmp_path).unlink(missing_ok=True) + raise + return tmp_path + + tmp_path = await asyncio.to_thread(_sync_write_text) + + try: + await asyncio.to_thread(os.replace, tmp_path, str(path)) + except (FileNotFoundError, PermissionError): + # fallback if tmp went missing + await asyncio.to_thread(path.write_text, data, encoding='utf-8') + finally: + await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) + + +async def atomic_write_bytes(path: Path, data: bytes) -> None: + dir_path = path.parent + + def _sync_write_bytes() -> str: + fd, tmp_path = tempfile.mkstemp( + suffix=f'{path.suffix}.tmp', + prefix=f'{path.name}.', + dir=str(dir_path), + ) + try: + with os.fdopen(fd, 'wb') as tmp_file: + tmp_file.write(data) + except: + Path(tmp_path).unlink(missing_ok=True) + raise + return tmp_path + + tmp_path = await asyncio.to_thread(_sync_write_bytes) + + try: + await asyncio.to_thread(os.replace, tmp_path, str(path)) + except (FileNotFoundError, PermissionError): + # fallback if tmp went missing + await asyncio.to_thread(path.write_bytes, data) + finally: + await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) + + async def export_json_to_stream( iterator: AsyncIterator[dict], dst: TextIO, diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 97f9ea6546..087e2ebca9 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -74,6 +74,7 @@ ConcurrencySettings, EnqueueLinksFunction, ExtractLinksFunction, + GetDataKwargs, HttpMethod, JsonSerializable, PushDataKwargs, @@ -87,7 +88,6 @@ from crawlee.statistics import FinalStatistics from crawlee.storage_clients import StorageClient from crawlee.storage_clients.models import DatasetItemsListPage - from crawlee.storages._types import GetDataKwargs TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) diff --git a/src/crawlee/storage_clients/_base/_request_queue_client.py b/src/crawlee/storage_clients/_base/_request_queue_client.py index b1e6ba389f..64659b4d02 100644 --- a/src/crawlee/storage_clients/_base/_request_queue_client.py +++ b/src/crawlee/storage_clients/_base/_request_queue_client.py @@ -8,13 +8,9 @@ if TYPE_CHECKING: from collections.abc import Sequence + from crawlee import Request from crawlee.configuration import Configuration - from crawlee.storage_clients.models import ( - AddRequestsResponse, - ProcessedRequest, - Request, - RequestQueueMetadata, - ) + from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata @docs_group('Abstract classes') diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index d9d9c1fda3..18f5769a54 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ 
b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -11,12 +11,12 @@ from pydantic import ValidationError from typing_extensions import override +from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id +from crawlee._utils.file import atomic_write_text, json_dumps from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata -from ._utils import METADATA_FILENAME, atomic_write_text, json_dumps - if TYPE_CHECKING: from collections.abc import AsyncIterator from typing import Any diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index d4e7334928..1730b8340b 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -12,13 +12,12 @@ from pydantic import ValidationError from typing_extensions import override +from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.file import infer_mime_type +from crawlee._utils.file import atomic_write_bytes, atomic_write_text, infer_mime_type, json_dumps from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata -from ._utils import METADATA_FILENAME, atomic_write_bytes, atomic_write_text, json_dumps - if TYPE_CHECKING: from collections.abc import AsyncIterator diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 0ade0f3846..5bb9c5133b 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -12,12 +12,12 @@ from typing_extensions import override from crawlee import Request +from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id +from crawlee._utils.file import atomic_write_text, json_dumps from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata -from ._utils import METADATA_FILENAME, atomic_write_text, json_dumps - if TYPE_CHECKING: from collections.abc import Sequence diff --git a/src/crawlee/storage_clients/_file_system/_utils.py b/src/crawlee/storage_clients/_file_system/_utils.py deleted file mode 100644 index f5068a5d8d..0000000000 --- a/src/crawlee/storage_clients/_file_system/_utils.py +++ /dev/null @@ -1,49 +0,0 @@ -from __future__ import annotations - -import asyncio -import json -import os -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from pathlib import Path - from typing import Any - -METADATA_FILENAME = '__metadata__.json' -"""The name of the metadata file for storage clients.""" - - -async def json_dumps(obj: Any) -> str: - """Serialize an object to a JSON-formatted string with specific settings. - - Args: - obj: The object to serialize. - - Returns: - A string containing the JSON representation of the input object. 
- """ - return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str) - - -async def atomic_write_text(path: Path, data: str) -> None: - tmp = path.with_suffix(path.suffix + '.tmp') - # write to .tmp - await asyncio.to_thread(tmp.write_text, data, encoding='utf-8') - - try: - await asyncio.to_thread(os.replace, tmp, path) - except FileNotFoundError: - # If the .tmp vanished, fall back to a straight write - await asyncio.to_thread(path.write_text, data, encoding='utf-8') - - -async def atomic_write_bytes(path: Path, data: bytes) -> None: - tmp = path.with_suffix(path.suffix + '.tmp') - # write to .tmp - await asyncio.to_thread(tmp.write_bytes, data) - - try: - await asyncio.to_thread(os.replace, tmp, path) - except FileNotFoundError: - # If the .tmp vanished, fall back to a straight write - await asyncio.to_thread(path.write_bytes, data) diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index 34843a380e..b527ebc013 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -2,7 +2,6 @@ import sys from datetime import datetime, timezone -from logging import getLogger from typing import TYPE_CHECKING, Any from typing_extensions import override @@ -17,8 +16,6 @@ from crawlee.configuration import Configuration -logger = getLogger(__name__) - class MemoryKeyValueStoreClient(KeyValueStoreClient): """Memory implementation of the key-value store client. diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 95035e6155..0d60011979 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -9,11 +9,7 @@ from crawlee import Request from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients._base import RequestQueueClient -from crawlee.storage_clients.models import ( - AddRequestsResponse, - ProcessedRequest, - RequestQueueMetadata, -) +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata if TYPE_CHECKING: from collections.abc import Sequence diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index 8b5f0c6d0a..17067f2a1d 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -1,14 +1,11 @@ from __future__ import annotations -import json -from datetime import datetime, timedelta -from decimal import Decimal +from datetime import datetime from typing import Annotated, Any, Generic from pydantic import BaseModel, BeforeValidator, ConfigDict, Field from typing_extensions import TypeVar -from crawlee import Request from crawlee._types import HttpMethod from crawlee._utils.docs import docs_group from crawlee._utils.urls import validate_http_url @@ -113,74 +110,8 @@ class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]): @docs_group('Data structures') -class KeyValueStoreListKeysPage(BaseModel): - """Model for listing keys in the key-value store.""" - - model_config = ConfigDict(populate_by_name=True) - - count: Annotated[int, Field(alias='count')] - """The number of keys returned on this page.""" - - limit: Annotated[int, Field(alias='limit')] - """The maximum number of keys to return.""" - - is_truncated: Annotated[bool, Field(alias='isTruncated')] - 
"""Indicates whether there are more keys to retrieve.""" - - exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey', default=None)] - """The key from which to start this page of results.""" - - next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)] - """The key from which to start the next page of results.""" - - items: Annotated[list[KeyValueStoreRecordMetadata], Field(alias='items', default_factory=list)] - """The list of KVS items metadata returned on this page.""" - - -@docs_group('Data structures') -class RequestQueueHeadState(BaseModel): - """Model for the request queue head state.""" - - model_config = ConfigDict(populate_by_name=True) - - was_limit_reached: Annotated[bool, Field(alias='wasLimitReached')] - prev_limit: Annotated[int, Field(alias='prevLimit')] - queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] - query_started_at: Annotated[datetime, Field(alias='queryStartedAt')] - had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] - - -@docs_group('Data structures') -class RequestQueueHead(BaseModel): - """Model for request queue head. - - Represents a collection of requests retrieved from the beginning of a queue, - including metadata about the queue's state and lock information for the requests. - """ - - model_config = ConfigDict(populate_by_name=True) - - limit: Annotated[int | None, Field(alias='limit', default=None)] - """The maximum number of requests that were requested from the queue.""" - - had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients', default=False)] - """Indicates whether the queue has been accessed by multiple clients (consumers).""" - - queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] - """The timestamp when the queue was last modified.""" - - lock_time: Annotated[timedelta | None, Field(alias='lockSecs', default=None)] - """The duration for which the returned requests are locked and cannot be processed by other clients.""" - - queue_has_locked_requests: Annotated[bool | None, Field(alias='queueHasLockedRequests', default=False)] - """Indicates whether the queue contains any locked requests.""" - - items: Annotated[list[Request], Field(alias='items', default_factory=list[Request])] - """The list of request objects retrieved from the beginning of the queue.""" - - -class _ListPage(BaseModel): - """Model for a single page of storage items returned from a collection list method.""" +class DatasetItemsListPage(BaseModel): + """Model for a single page of dataset items returned from a collection list method.""" model_config = ConfigDict(populate_by_name=True) @@ -199,48 +130,10 @@ class _ListPage(BaseModel): desc: Annotated[bool, Field(default=False)] """Indicates if the returned list is in descending order.""" - -@docs_group('Data structures') -class DatasetListPage(_ListPage): - """Model for a single page of dataset items returned from a collection list method.""" - - items: Annotated[list[DatasetMetadata], Field(default_factory=list)] - """The list of dataset items returned on this page.""" - - -@docs_group('Data structures') -class KeyValueStoreListPage(_ListPage): - """Model for a single page of key-value store items returned from a collection list method.""" - - items: Annotated[list[KeyValueStoreMetadata], Field(default_factory=list)] - """The list of key-value store items returned on this page.""" - - -@docs_group('Data structures') -class RequestQueueListPage(_ListPage): - """Model for a single page of request 
queue items returned from a collection list method.""" - - items: Annotated[list[RequestQueueMetadata], Field(default_factory=list)] - """The list of request queue items returned on this page.""" - - -@docs_group('Data structures') -class DatasetItemsListPage(_ListPage): - """Model for a single page of dataset items returned from a collection list method.""" - items: Annotated[list[dict], Field(default_factory=list)] """The list of dataset items returned on this page.""" -@docs_group('Data structures') -class ProlongRequestLockResponse(BaseModel): - """Response to prolong request lock calls.""" - - model_config = ConfigDict(populate_by_name=True) - - lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] - - @docs_group('Data structures') class ProcessedRequest(BaseModel): """Represents a processed request.""" @@ -281,60 +174,3 @@ class AddRequestsResponse(BaseModel): unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')] """Requests that could not be processed, typically due to validation errors or other issues.""" - - -class InternalRequest(BaseModel): - """Internal representation of a queue request with additional metadata for ordering and storage.""" - - model_config = ConfigDict(populate_by_name=True) - - id: str - - unique_key: str - - order_no: Decimal | None = None - """Order number for maintaining request sequence in queue. - Used for restoring correct request order when recovering queue from storage.""" - - handled_at: datetime | None - - request: Annotated[ - Request, - Field(alias='json_'), - BeforeValidator(lambda v: json.loads(v) if isinstance(v, str) else v), - ] - """Original Request object. The alias 'json_' is required for backward compatibility with legacy code.""" - - @classmethod - def from_request(cls, request: Request, id: str, order_no: Decimal | None) -> InternalRequest: - """Create an internal request from a `Request` object.""" - return cls( - unique_key=request.unique_key, - id=id, - handled_at=request.handled_at, - order_no=order_no, - request=request, - ) - - def to_request(self) -> Request: - """Convert the internal request back to a `Request` object.""" - return self.request - - -class CachedRequest(BaseModel): - """Pydantic model for cached request information.""" - - id: str - """The ID of the request.""" - - was_already_handled: bool - """Whether the request was already handled.""" - - hydrated: Request | None = None - """The hydrated request object (the original one).""" - - lock_expires_at: datetime | None = None - """The expiration time of the lock on the request.""" - - forefront: bool = False - """Whether the request was added to the forefront of the queue.""" diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index a46673fe65..a950a78dc9 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -19,13 +19,12 @@ from typing_extensions import Unpack + from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata - from ._types import ExportDataCsvKwargs, ExportDataJsonKwargs - logger = logging.getLogger(__name__) diff --git a/src/crawlee/storages/_types.py b/src/crawlee/storages/_types.py deleted file mode 100644 index e8c1b135e0..0000000000 --- a/src/crawlee/storages/_types.py +++ /dev/null @@ -1,167 
+0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Literal, TypedDict - -if TYPE_CHECKING: - import json - from collections.abc import Callable - from datetime import datetime - - from typing_extensions import NotRequired, Required - - from crawlee import Request - from crawlee.configuration import Configuration - from crawlee.storage_clients import StorageClient - - -class CachedRequest(TypedDict): - """Represent a cached request in the `RequestQueue`.""" - - id: str - """The ID of the request.""" - - was_already_handled: bool - """Indicates whether the request was already handled.""" - - hydrated: Request | None - """The hydrated request object.""" - - lock_expires_at: datetime | None - """The time at which the lock on the request expires.""" - - forefront: bool - """Indicates whether the request is at the forefront of the queue.""" - - -class IterateKwargs(TypedDict): - """Keyword arguments for dataset's `iterate` method.""" - - offset: NotRequired[int] - """Skips the specified number of items at the start.""" - - limit: NotRequired[int | None] - """The maximum number of items to retrieve. Unlimited if None.""" - - clean: NotRequired[bool] - """Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.""" - - desc: NotRequired[bool] - """Set to True to sort results in descending order.""" - - fields: NotRequired[list[str]] - """Fields to include in each item. Sorts fields as specified if provided.""" - - omit: NotRequired[list[str]] - """Fields to exclude from each item.""" - - unwind: NotRequired[str] - """Unwinds items by a specified array field, turning each element into a separate item.""" - - skip_empty: NotRequired[bool] - """Excludes empty items from the results if True.""" - - skip_hidden: NotRequired[bool] - """Excludes fields starting with '#' if True.""" - - -class GetDataKwargs(IterateKwargs): - """Keyword arguments for dataset's `get_data` method.""" - - flatten: NotRequired[list[str]] - """Fields to be flattened in returned items.""" - - view: NotRequired[str] - """Specifies the dataset view to be used.""" - - -class ExportToKwargs(TypedDict): - """Keyword arguments for dataset's `export_to` method.""" - - key: Required[str] - """The key under which to save the data.""" - - content_type: NotRequired[Literal['json', 'csv']] - """The format in which to export the data. Either 'json' or 'csv'.""" - - to_kvs_id: NotRequired[str] - """ID of the key-value store to save the exported file.""" - - to_kvs_name: NotRequired[str] - """Name of the key-value store to save the exported file.""" - - to_kvs_storage_client: NotRequired[StorageClient] - """The storage client to use for saving the exported file.""" - - to_kvs_configuration: NotRequired[Configuration] - """The configuration to use for saving the exported file.""" - - -class ExportDataJsonKwargs(TypedDict): - """Keyword arguments for dataset's `export_data_json` method.""" - - skipkeys: NotRequired[bool] - """If True (default: False), dict keys that are not of a basic type (str, int, float, bool, None) will be skipped - instead of raising a `TypeError`.""" - - ensure_ascii: NotRequired[bool] - """Determines if non-ASCII characters should be escaped in the output JSON string.""" - - check_circular: NotRequired[bool] - """If False (default: True), skips the circular reference check for container types. 
A circular reference will - result in a `RecursionError` or worse if unchecked.""" - - allow_nan: NotRequired[bool] - """If False (default: True), raises a ValueError for out-of-range float values (nan, inf, -inf) to strictly comply - with the JSON specification. If True, uses their JavaScript equivalents (NaN, Infinity, -Infinity).""" - - cls: NotRequired[type[json.JSONEncoder]] - """Allows specifying a custom JSON encoder.""" - - indent: NotRequired[int] - """Specifies the number of spaces for indentation in the pretty-printed JSON output.""" - - separators: NotRequired[tuple[str, str]] - """A tuple of (item_separator, key_separator). The default is (', ', ': ') if indent is None and (',', ': ') - otherwise.""" - - default: NotRequired[Callable] - """A function called for objects that can't be serialized otherwise. It should return a JSON-encodable version - of the object or raise a `TypeError`.""" - - sort_keys: NotRequired[bool] - """Specifies whether the output JSON object should have keys sorted alphabetically.""" - - -class ExportDataCsvKwargs(TypedDict): - """Keyword arguments for dataset's `export_data_csv` method.""" - - dialect: NotRequired[str] - """Specifies a dialect to be used in CSV parsing and writing.""" - - delimiter: NotRequired[str] - """A one-character string used to separate fields. Defaults to ','.""" - - doublequote: NotRequired[bool] - """Controls how instances of `quotechar` inside a field should be quoted. When True, the character is doubled; - when False, the `escapechar` is used as a prefix. Defaults to True.""" - - escapechar: NotRequired[str] - """A one-character string used to escape the delimiter if `quoting` is set to `QUOTE_NONE` and the `quotechar` - if `doublequote` is False. Defaults to None, disabling escaping.""" - - lineterminator: NotRequired[str] - """The string used to terminate lines produced by the writer. Defaults to '\\r\\n'.""" - - quotechar: NotRequired[str] - """A one-character string used to quote fields containing special characters, like the delimiter or quotechar, - or fields containing new-line characters. Defaults to '\"'.""" - - quoting: NotRequired[int] - """Controls when quotes should be generated by the writer and recognized by the reader. Can take any of - the `QUOTE_*` constants, with a default of `QUOTE_MINIMAL`.""" - - skipinitialspace: NotRequired[bool] - """When True, spaces immediately following the delimiter are ignored. Defaults to False.""" - - strict: NotRequired[bool] - """When True, raises an exception on bad CSV input. 
Defaults to False.""" diff --git a/tests/unit/_utils/test_data_processing.py b/tests/unit/_utils/test_data_processing.py deleted file mode 100644 index c67335517b..0000000000 --- a/tests/unit/_utils/test_data_processing.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from enum import Enum - -import pytest - -from crawlee._types import StorageTypes -from crawlee._utils.data_processing import ( - maybe_extract_enum_member_value, - maybe_parse_body, - raise_on_duplicate_storage, - raise_on_non_existing_storage, -) - - -def test_maybe_extract_enum_member_value() -> None: - class Color(Enum): - RED = 1 - GREEN = 2 - BLUE = 3 - - assert maybe_extract_enum_member_value(Color.RED) == 1 - assert maybe_extract_enum_member_value(Color.GREEN) == 2 - assert maybe_extract_enum_member_value(Color.BLUE) == 3 - assert maybe_extract_enum_member_value(10) == 10 - assert maybe_extract_enum_member_value('test') == 'test' - assert maybe_extract_enum_member_value(None) is None - - -def test_maybe_parse_body() -> None: - json_body = b'{"key": "value"}' - xml_body = b'ToveJani' - text_body = b'Plain text content' - binary_body = b'\x00\x01\x02' - - assert maybe_parse_body(json_body, 'application/json') == {'key': 'value'} - assert maybe_parse_body(xml_body, 'application/xml') == 'ToveJani' - assert maybe_parse_body(text_body, 'text/plain') == 'Plain text content' - assert maybe_parse_body(binary_body, 'application/octet-stream') == binary_body - assert maybe_parse_body(xml_body, 'text/xml') == 'ToveJani' - assert maybe_parse_body(text_body, 'text/plain; charset=utf-8') == 'Plain text content' - - -def test_raise_on_duplicate_storage() -> None: - with pytest.raises(ValueError, match='Dataset with name "test" already exists.'): - raise_on_duplicate_storage(StorageTypes.DATASET, 'name', 'test') - - -def test_raise_on_non_existing_storage() -> None: - with pytest.raises(ValueError, match='Dataset with id "kckxQw6j6AtrgyA09" does not exist.'): - raise_on_non_existing_storage(StorageTypes.DATASET, 'kckxQw6j6AtrgyA09') diff --git a/tests/unit/_utils/test_file.py b/tests/unit/_utils/test_file.py index 0762e1d966..c00618b600 100644 --- a/tests/unit/_utils/test_file.py +++ b/tests/unit/_utils/test_file.py @@ -1,18 +1,8 @@ from __future__ import annotations from datetime import datetime, timezone -from pathlib import Path -import pytest - -from crawlee._utils.file import ( - ContentType, - determine_file_extension, - force_remove, - force_rename, - is_content_type, - json_dumps, -) +from crawlee._utils.file import json_dumps async def test_json_dumps() -> None: @@ -21,118 +11,3 @@ async def test_json_dumps() -> None: assert await json_dumps('string') == '"string"' assert await json_dumps(123) == '123' assert await json_dumps(datetime(2022, 1, 1, tzinfo=timezone.utc)) == '"2022-01-01 00:00:00+00:00"' - - -@pytest.mark.parametrize( - ('content_type_enum', 'content_type', 'expected_result'), - [ - (ContentType.JSON, 'application/json', True), - (ContentType.JSON, 'application/json; charset=utf-8', True), - (ContentType.JSON, 'text/plain', False), - (ContentType.JSON, 'application/xml', False), - (ContentType.XML, 'application/xml', True), - (ContentType.XML, 'application/xhtml+xml', True), - (ContentType.XML, 'text/xml; charset=utf-8', False), - (ContentType.XML, 'application/json', False), - (ContentType.TEXT, 'text/plain', True), - (ContentType.TEXT, 'text/html; charset=utf-8', True), - (ContentType.TEXT, 'application/json', False), - (ContentType.TEXT, 'application/xml', False), - ], - ids=[ - 
'json_valid_simple', - 'json_valid_charset', - 'json_invalid_text', - 'json_invalid_xml', - 'xml_valid_simple', - 'xml_valid_xhtml', - 'xml_invalid_text_charset', - 'xml_invalid_json', - 'text_valid_plain', - 'text_valid_html_charset', - 'text_invalid_json', - 'text_invalid_xml', - ], -) -def test_is_content_type(content_type_enum: ContentType, content_type: str, *, expected_result: bool) -> None: - result = is_content_type(content_type_enum, content_type) - assert expected_result == result - - -def test_is_content_type_json() -> None: - assert is_content_type(ContentType.JSON, 'application/json') is True - assert is_content_type(ContentType.JSON, 'application/json; charset=utf-8') is True - assert is_content_type(ContentType.JSON, 'text/plain') is False - assert is_content_type(ContentType.JSON, 'application/xml') is False - - -def test_is_content_type_xml() -> None: - assert is_content_type(ContentType.XML, 'application/xml') is True - assert is_content_type(ContentType.XML, 'application/xhtml+xml') is True - assert is_content_type(ContentType.XML, 'text/xml; charset=utf-8') is False - assert is_content_type(ContentType.XML, 'application/json') is False - - -def test_is_content_type_text() -> None: - assert is_content_type(ContentType.TEXT, 'text/plain') is True - assert is_content_type(ContentType.TEXT, 'text/html; charset=utf-8') is True - assert is_content_type(ContentType.TEXT, 'application/json') is False - assert is_content_type(ContentType.TEXT, 'application/xml') is False - - -def test_determine_file_extension() -> None: - # Can determine common types properly - assert determine_file_extension('application/json') == 'json' - assert determine_file_extension('application/xml') == 'xml' - assert determine_file_extension('text/plain') == 'txt' - - # Can handle unusual formats - assert determine_file_extension(' application/json ') == 'json' - assert determine_file_extension('APPLICATION/JSON') == 'json' - assert determine_file_extension('application/json;charset=utf-8') == 'json' - - # Return None for non-existent content types - assert determine_file_extension('clearly not a content type') is None - assert determine_file_extension('') is None - - -async def test_force_remove(tmp_path: Path) -> None: - test_file_path = Path(tmp_path, 'test.txt') - # Does not crash/raise when the file does not exist - assert test_file_path.exists() is False - await force_remove(test_file_path) - assert test_file_path.exists() is False - - # Remove the file if it exists - with test_file_path.open('a', encoding='utf-8'): - pass - assert test_file_path.exists() is True - await force_remove(test_file_path) - assert test_file_path.exists() is False - - -async def test_force_rename(tmp_path: Path) -> None: - src_dir = Path(tmp_path, 'src') - dst_dir = Path(tmp_path, 'dst') - src_file = Path(src_dir, 'src_dir.txt') - dst_file = Path(dst_dir, 'dst_dir.txt') - # Won't crash if source directory does not exist - assert src_dir.exists() is False - await force_rename(src_dir, dst_dir) - - # Will remove dst_dir if it exists (also covers normal case) - # Create the src_dir with a file in it - src_dir.mkdir() - with src_file.open('a', encoding='utf-8'): - pass - # Create the dst_dir with a file in it - dst_dir.mkdir() - with dst_file.open('a', encoding='utf-8'): - pass - assert src_file.exists() is True - assert dst_file.exists() is True - await force_rename(src_dir, dst_dir) - assert src_dir.exists() is False - assert dst_file.exists() is False - # src_dir.txt should exist in dst_dir - assert (dst_dir / 
'src_dir.txt').exists() is True diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py index bdc1a361a9..c531ffdf41 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -64,6 +64,41 @@ async def test_open_creates_new_dataset(configuration: Configuration) -> None: assert metadata['item_count'] == 0 +async def test_open_dataset_by_id(configuration: Configuration) -> None: + """Test opening a dataset by ID after creating it by name.""" + storage_client = FileSystemStorageClient() + + # First create a dataset by name + original_client = await storage_client.open_dataset_client( + name='open-by-id-test', + configuration=configuration, + ) + + # Get the ID from the created client + dataset_id = original_client.metadata.id + + # Add some data to verify it persists + await original_client.push_data({'test_item': 'test_value'}) + + # Now try to open the same dataset using just the ID + reopened_client = await storage_client.open_dataset_client( + id=dataset_id, + configuration=configuration, + ) + + # Verify it's the same dataset + assert reopened_client.metadata.id == dataset_id + assert reopened_client.metadata.name == 'open-by-id-test' + + # Verify the data is still there + data = await reopened_client.get_data() + assert len(data.items) == 1 + assert data.items[0]['test_item'] == 'test_value' + + # Clean up + await reopened_client.drop() + + async def test_dataset_client_purge_on_start(configuration: Configuration) -> None: """Test that purge_on_start=True clears existing data in the dataset.""" configuration.purge_on_start = True diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index bfc91af2cc..aa33a0ac1e 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -61,6 +61,41 @@ async def test_open_creates_new_kvs(configuration: Configuration) -> None: assert metadata['name'] == 'new_kvs' +async def test_open_kvs_by_id(configuration: Configuration) -> None: + """Test opening a key-value store by ID after creating it by name.""" + storage_client = FileSystemStorageClient() + + # First create a key-value store by name + original_client = await storage_client.open_key_value_store_client( + name='open-by-id-test', + configuration=configuration, + ) + + # Get the ID from the created client + kvs_id = original_client.metadata.id + + # Add some data to verify it persists + await original_client.set_value(key='test-key', value='test-value') + + # Now try to open the same key-value store using just the ID + reopened_client = await storage_client.open_key_value_store_client( + id=kvs_id, + configuration=configuration, + ) + + # Verify it's the same key-value store + assert reopened_client.metadata.id == kvs_id + assert reopened_client.metadata.name == 'open-by-id-test' + + # Verify the data is still there + record = await reopened_client.get_value(key='test-key') + assert record is not None + assert record.value == 'test-value' + + # Clean up + await reopened_client.drop() + + async def test_kvs_client_purge_on_start(configuration: Configuration) -> None: """Test that purge_on_start=True clears existing data in the key-value store.""" configuration.purge_on_start = True diff --git 
a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 10ef63a8ef..200e12e3e6 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -36,6 +36,41 @@ async def rq_client(configuration: Configuration) -> AsyncGenerator[FileSystemRe await client.drop() +async def test_open_request_queue_by_id(configuration: Configuration) -> None: + """Test opening a request queue by ID after creating it by name.""" + storage_client = FileSystemStorageClient() + + # First create a request queue by name + original_client = await storage_client.open_request_queue_client( + name='open-by-id-test', + configuration=configuration, + ) + + # Get the ID from the created client + rq_id = original_client.metadata.id + + # Add a request to verify it persists + await original_client.add_batch_of_requests([Request.from_url('https://example.com/test')]) + + # Now try to open the same request queue using just the ID + reopened_client = await storage_client.open_request_queue_client( + id=rq_id, + configuration=configuration, + ) + + # Verify it's the same request queue + assert reopened_client.metadata.id == rq_id + assert reopened_client.metadata.name == 'open-by-id-test' + + # Verify the request is still there + request = await reopened_client.fetch_next_request() + assert request is not None + assert request.url == 'https://example.com/test' + + # Clean up + await reopened_client.drop() + + async def test_open_creates_new_rq(configuration: Configuration) -> None: """Test that open() creates a new request queue with proper metadata and files on disk.""" client = await FileSystemStorageClient().open_request_queue_client( From 89bfa5bba7463157920f27e7faeed517ba148a49 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 15 May 2025 12:15:53 +0200 Subject: [PATCH 03/43] Address feedback --- src/crawlee/_autoscaling/autoscaled_pool.py | 3 +- src/crawlee/_utils/file.py | 80 +++++++++++-------- src/crawlee/statistics/_error_snapshotter.py | 54 +++++++++---- .../_file_system/_dataset_client.py | 6 +- .../_file_system/_key_value_store_client.py | 8 +- .../_file_system/_request_queue_client.py | 12 +-- .../crawlers/_basic/test_basic_crawler.py | 1 - .../_file_system/test_fs_kvs_client.py | 4 +- .../_memory/test_memory_kvs_client.py | 4 +- 9 files changed, 103 insertions(+), 69 deletions(-) diff --git a/src/crawlee/_autoscaling/autoscaled_pool.py b/src/crawlee/_autoscaling/autoscaled_pool.py index 7a751d1783..5a9aa1fcff 100644 --- a/src/crawlee/_autoscaling/autoscaled_pool.py +++ b/src/crawlee/_autoscaling/autoscaled_pool.py @@ -142,8 +142,7 @@ async def run(self) -> None: logger.info('Waiting for remaining tasks to finish') - tasks_to_wait = list(run.worker_tasks) - for task in tasks_to_wait: + for task in run.worker_tasks: if not task.done(): with suppress(BaseException): await task diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index 7ce1413103..b5893e1fe6 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -6,7 +6,7 @@ import os import tempfile from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, overload if TYPE_CHECKING: from collections.abc import AsyncIterator @@ -54,59 +54,75 @@ async def json_dumps(obj: Any) -> str: return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str) -async def atomic_write_text(path: Path, data: str) -> None: - 
dir_path = path.parent +@overload +async def atomic_write( + path: Path, + data: str, + *, + is_binary: bool = False, +) -> None: ... - def _sync_write_text() -> str: - # create a temp file in the target dir, return its name - fd, tmp_path = tempfile.mkstemp( - suffix=f'{path.suffix}.tmp', - prefix=f'{path.name}.', - dir=str(dir_path), - ) - try: - with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: - tmp_file.write(data) - except: - Path(tmp_path).unlink(missing_ok=True) - raise - return tmp_path - tmp_path = await asyncio.to_thread(_sync_write_text) +@overload +async def atomic_write( + path: Path, + data: bytes, + *, + is_binary: bool = True, +) -> None: ... - try: - await asyncio.to_thread(os.replace, tmp_path, str(path)) - except (FileNotFoundError, PermissionError): - # fallback if tmp went missing - await asyncio.to_thread(path.write_text, data, encoding='utf-8') - finally: - await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) +async def atomic_write( + path: Path, + data: str | bytes, + *, + is_binary: bool = False, +) -> None: + """Write data to a file atomically to prevent data corruption or partial writes. -async def atomic_write_bytes(path: Path, data: bytes) -> None: + This function handles both text and binary data. It ensures atomic writing by creating + a temporary file and then atomically replacing the target file, which prevents data + corruption if the process is interrupted during the write operation. + + For example, if a process (crawler) is interrupted while writing a file, the file may end up in an + incomplete or corrupted state. This might be especially unwanted for metadata files. + + Args: + path: The path to the destination file. + data: The data to write to the file (string or bytes). + is_binary: If True, write in binary mode. If False (default), write in text mode. 
+ """ dir_path = path.parent - def _sync_write_bytes() -> str: + def _sync_write() -> str: + # create a temp file in the target dir, return its name fd, tmp_path = tempfile.mkstemp( suffix=f'{path.suffix}.tmp', prefix=f'{path.name}.', dir=str(dir_path), ) try: - with os.fdopen(fd, 'wb') as tmp_file: - tmp_file.write(data) - except: + if is_binary: + with os.fdopen(fd, 'wb') as tmp_file: + tmp_file.write(data) # type: ignore[arg-type] + else: + with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: + tmp_file.write(data) # type: ignore[arg-type] + except Exception: # broader exception handling Path(tmp_path).unlink(missing_ok=True) raise return tmp_path - tmp_path = await asyncio.to_thread(_sync_write_bytes) + tmp_path = await asyncio.to_thread(_sync_write) try: await asyncio.to_thread(os.replace, tmp_path, str(path)) except (FileNotFoundError, PermissionError): # fallback if tmp went missing - await asyncio.to_thread(path.write_bytes, data) + if is_binary: + await asyncio.to_thread(path.write_bytes, data) # type: ignore[arg-type] + else: + await asyncio.to_thread(path.write_text, data, encoding='utf-8') # type: ignore[arg-type] finally: await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) diff --git a/src/crawlee/statistics/_error_snapshotter.py b/src/crawlee/statistics/_error_snapshotter.py index 0d15973e2f..4404904226 100644 --- a/src/crawlee/statistics/_error_snapshotter.py +++ b/src/crawlee/statistics/_error_snapshotter.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import hashlib import re import string @@ -28,23 +29,42 @@ async def capture_snapshot( file_and_line: str, context: BasicCrawlingContext, ) -> None: - """Capture error snapshot and save it to key value store.""" - snapshot = await context.get_snapshot() - if not snapshot: - return - - base = self._get_snapshot_base_name(error_message, file_and_line) - kvs = await KeyValueStore.open(name=self._kvs_name) - - # Save HTML snapshot if present - if snapshot.html: - key_html = f'{base}.html' - await kvs.set_value(key_html, snapshot.html, content_type='text/html') - - # Save screenshot snapshot if present - if snapshot.screenshot: - key_jpg = f'{base}.jpg' - await kvs.set_value(key_jpg, snapshot.screenshot, content_type='image/jpeg') + """Capture error snapshot and save it to key value store. + + It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because + it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler` + returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with + an exception. + + Args: + error_message: Used in filename of the snapshot. + file_and_line: Used in filename of the snapshot. + context: Context that is used to get the snapshot. 
+ """ + if snapshot := await context.get_snapshot(): + kvs = await KeyValueStore.open(name=self._kvs_name) + snapshot_base_name = self._get_snapshot_base_name(error_message, file_and_line) + snapshot_save_tasks = list[asyncio.Task]() + + if snapshot.html: + snapshot_save_tasks.append( + asyncio.create_task(self._save_html(kvs, snapshot.html, base_name=snapshot_base_name)) + ) + + if snapshot.screenshot: + snapshot_save_tasks.append( + asyncio.create_task(self._save_screenshot(kvs, snapshot.screenshot, base_name=snapshot_base_name)) + ) + + await asyncio.gather(*snapshot_save_tasks) + + async def _save_html(self, kvs: KeyValueStore, html: str, base_name: str) -> None: + file_name = f'{base_name}.html' + await kvs.set_value(file_name, html, content_type='text/html') + + async def _save_screenshot(self, kvs: KeyValueStore, screenshot: bytes, base_name: str) -> None: + file_name = f'{base_name}.jpg' + await kvs.set_value(file_name, screenshot, content_type='image/jpeg') def _sanitize_filename(self, filename: str) -> str: return re.sub(f'[^{re.escape(self.ALLOWED_CHARACTERS)}]', '', filename[: self.MAX_FILENAME_LENGTH]) diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 18f5769a54..cff991efbb 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -13,7 +13,7 @@ from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.file import atomic_write_text, json_dumps +from crawlee._utils.file import atomic_write, json_dumps from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -442,7 +442,7 @@ async def _update_metadata( # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) - await atomic_write_text(self.path_to_metadata, data) + await atomic_write(self.path_to_metadata, data) async def _push_item(self, item: dict[str, Any], item_id: int) -> None: """Push a single item to the dataset. @@ -463,7 +463,7 @@ async def _push_item(self, item: dict[str, Any], item_id: int) -> None: # Dump the serialized item to the file. data = await json_dumps(item) - await atomic_write_text(file_path, data) + await atomic_write(file_path, data) async def _get_sorted_data_files(self) -> list[Path]: """Retrieve and return a sorted list of data files in the dataset directory. 
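(Illustrative aside on the write path refactored above: the new `atomic_write` helper creates a temporary file next to the target and then swaps it in with `os.replace`, and the file-system clients now route their metadata, record, and request writes through it. A minimal usage sketch under the same pattern; the storage path below is an assumption for illustration only.)

import asyncio
from pathlib import Path

from crawlee._utils.file import atomic_write, json_dumps

async def persist_metadata(path: Path, metadata: dict) -> None:
    # atomic_write creates its temp file alongside the target, so the destination directory must exist.
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialization runs off the event loop; the final os.replace is intended to prevent
    # readers from ever observing a partially written metadata file.
    data = await json_dumps(metadata)
    await atomic_write(path, data)

asyncio.run(persist_metadata(Path('storage/datasets/default/__metadata__.json'), {'item_count': 0}))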
diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 1730b8340b..d67b0c2258 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -14,7 +14,7 @@ from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.file import atomic_write_bytes, atomic_write_text, infer_mime_type, json_dumps +from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata @@ -325,10 +325,10 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No await asyncio.to_thread(self.path_to_kvs.mkdir, parents=True, exist_ok=True) # Write the value to the file. - await atomic_write_bytes(record_path, value_bytes) + await atomic_write(record_path, value_bytes, is_binary=True) # Write the record metadata to the file. - await atomic_write_text(record_metadata_filepath, record_metadata_content) + await atomic_write(record_metadata_filepath, record_metadata_content) # Update the KVS metadata to record the access and modification. await self._update_metadata(update_accessed_at=True, update_modified_at=True) @@ -445,7 +445,7 @@ async def _update_metadata( # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) - await atomic_write_text(self.path_to_metadata, data) + await atomic_write(self.path_to_metadata, data) def _encode_key(self, key: str) -> str: """Encode a key to make it safe for use in a file path.""" diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 5bb9c5133b..4cf419fa03 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -14,7 +14,7 @@ from crawlee import Request from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.file import atomic_write_text, json_dumps +from crawlee._utils.file import atomic_write, json_dumps from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata @@ -336,7 +336,7 @@ async def add_batch_of_requests( # Update the existing request file request_path = self.path_to_rq / f'{existing_request.id}.json' request_data = await json_dumps(existing_request.model_dump()) - await atomic_write_text(request_path, request_data) + await atomic_write(request_path, request_data) processed_requests.append( ProcessedRequest( @@ -362,7 +362,7 @@ async def add_batch_of_requests( request_dict['_sequence'] = sequence_number request_data = await json_dumps(request_dict) - await atomic_write_text(request_path, request_data) + await atomic_write(request_path, request_data) # Update metadata counts new_total_request_count += 1 @@ -620,7 +620,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | return None request_data = await json_dumps(request.model_dump()) - await atomic_write_text(request_path, request_data) + await atomic_write(request_path, request_data) # Update metadata timestamps await 
self._update_metadata( @@ -678,7 +678,7 @@ async def reclaim_request( return None request_data = await json_dumps(request.model_dump()) - await atomic_write_text(request_path, request_data) + await atomic_write(request_path, request_data) # Update metadata timestamps await self._update_metadata(update_modified_at=True, update_accessed_at=True) @@ -781,4 +781,4 @@ async def _update_metadata( # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) - await atomic_write_text(self.path_to_metadata, data) + await atomic_write(self.path_to_metadata, data) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index fcf4971f51..d90efba086 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -618,7 +618,6 @@ async def test_crawler_get_storages() -> None: assert isinstance(kvs, KeyValueStore) -# THIS async def test_crawler_run_requests() -> None: crawler = BasicCrawler() seen_urls = list[str]() diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index aa33a0ac1e..e2df900d0f 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -312,9 +312,9 @@ async def test_iterate_keys_with_exclusive_start_key(kvs_client: FileSystemKeyVa """Test that exclusive_start_key parameter returns only keys after it alphabetically.""" # Add some values with alphabetical keys await kvs_client.set_value(key='a-key', value='value-a') - await kvs_client.set_value(key='b-key', value='value-b') - await kvs_client.set_value(key='c-key', value='value-c') await kvs_client.set_value(key='d-key', value='value-d') + await kvs_client.set_value(key='c-key', value='value-c') + await kvs_client.set_value(key='b-key', value='value-b') # Iterate with exclusive start key keys = [key.key async for key in kvs_client.iterate_keys(exclusive_start_key='b-key')] diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py index 5d8789f6c3..0af70285c1 100644 --- a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -167,7 +167,7 @@ async def test_iterate_keys(kvs_client: MemoryKeyValueStoreClient) -> None: async def test_iterate_keys_with_exclusive_start_key(kvs_client: MemoryKeyValueStoreClient) -> None: """Test that exclusive_start_key parameter returns only keys after it alphabetically.""" # Set some values - for key in ['a_key', 'b_key', 'c_key', 'd_key', 'e_key']: + for key in ['b_key', 'c_key', 'a_key', 'e_key', 'd_key']: await kvs_client.set_value(key=key, value=f'value for {key}') # Get keys starting after 'b_key' @@ -181,7 +181,7 @@ async def test_iterate_keys_with_exclusive_start_key(kvs_client: MemoryKeyValueS async def test_iterate_keys_with_limit(kvs_client: MemoryKeyValueStoreClient) -> None: """Test that the limit parameter returns only the specified number of keys.""" # Set some values - for key in ['a_key', 'b_key', 'c_key', 'd_key', 'e_key']: + for key in ['a_key', 'e_key', 'c_key', 'b_key', 'd_key']: await kvs_client.set_value(key=key, value=f'value for {key}') # Get first 3 keys From 4050c7560616bbcaa46ab4bae0f35e6ec13074b3 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 16 May 2025 10:37:20 +0200 Subject: [PATCH 04/43] Add 
purge_if_needed method and improve some typing based on Pylance --- src/crawlee/_utils/docs.py | 8 +++-- src/crawlee/_utils/file.py | 4 +-- src/crawlee/request_loaders/_request_list.py | 4 +-- .../_request_manager_tandem.py | 4 +-- .../storage_clients/_base/_dataset_client.py | 2 +- .../storage_clients/_base/_storage_client.py | 34 ++++++++++++++++++- .../_file_system/_dataset_client.py | 13 ++++--- .../_file_system/_key_value_store_client.py | 2 +- .../_file_system/_request_queue_client.py | 2 +- .../_file_system/_storage_client.py | 30 +++++++++------- .../_memory/_dataset_client.py | 8 ++--- .../_memory/_key_value_store_client.py | 2 +- .../_memory/_request_queue_client.py | 2 +- .../_memory/_storage_client.py | 31 ++++++++++------- src/crawlee/storages/_dataset.py | 10 +++--- src/crawlee/storages/_key_value_store.py | 6 ++-- src/crawlee/storages/_request_queue.py | 10 +++--- 17 files changed, 107 insertions(+), 65 deletions(-) diff --git a/src/crawlee/_utils/docs.py b/src/crawlee/_utils/docs.py index 08d73addf1..8f0120ca99 100644 --- a/src/crawlee/_utils/docs.py +++ b/src/crawlee/_utils/docs.py @@ -1,11 +1,13 @@ from __future__ import annotations -from typing import Callable, Literal +from typing import Any, Callable, Literal, TypeVar GroupName = Literal['Classes', 'Abstract classes', 'Data structures', 'Event payloads', 'Errors', 'Functions'] +T = TypeVar('T', bound=Callable[..., Any]) -def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001 + +def docs_group(group_name: GroupName) -> Callable[[T], T]: # noqa: ARG001 """Mark a symbol for rendering and grouping in documentation. This decorator is used solely for documentation purposes and does not modify the behavior @@ -18,7 +20,7 @@ def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001 The original callable without modification. 
""" - def wrapper(func: Callable) -> Callable: + def wrapper(func: T) -> T: return func return wrapper diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index b5893e1fe6..ab7288dc3f 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -128,7 +128,7 @@ def _sync_write() -> str: async def export_json_to_stream( - iterator: AsyncIterator[dict], + iterator: AsyncIterator[dict[str, Any]], dst: TextIO, **kwargs: Unpack[ExportDataJsonKwargs], ) -> None: @@ -137,7 +137,7 @@ async def export_json_to_stream( async def export_csv_to_stream( - iterator: AsyncIterator[dict], + iterator: AsyncIterator[dict[str, Any]], dst: TextIO, **kwargs: Unpack[ExportDataCsvKwargs], ) -> None: diff --git a/src/crawlee/request_loaders/_request_list.py b/src/crawlee/request_loaders/_request_list.py index 3f545e1615..2f88327f65 100644 --- a/src/crawlee/request_loaders/_request_list.py +++ b/src/crawlee/request_loaders/_request_list.py @@ -54,13 +54,13 @@ def __init__( def name(self) -> str | None: return self._name - @override @property + @override async def handled_count(self) -> int: return self._handled_count - @override @property + @override async def total_count(self) -> int: return self._assumed_total_count diff --git a/src/crawlee/request_loaders/_request_manager_tandem.py b/src/crawlee/request_loaders/_request_manager_tandem.py index 35cc59e102..43f07709ec 100644 --- a/src/crawlee/request_loaders/_request_manager_tandem.py +++ b/src/crawlee/request_loaders/_request_manager_tandem.py @@ -32,13 +32,13 @@ def __init__(self, request_loader: RequestLoader, request_manager: RequestManage self._read_only_loader = request_loader self._read_write_manager = request_manager - @override @property + @override async def handled_count(self) -> int: return await self._read_write_manager.handled_count - @override @property + @override async def total_count(self) -> int: return (await self._read_only_loader.total_count) + (await self._read_write_manager.total_count) diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index 854e32dfce..3ae8d38f77 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -112,7 +112,7 @@ async def iterate_items( unwind: str | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict]: + ) -> AsyncIterator[dict[str, Any]]: """Iterate over the dataset items with filtering options. The backend method for the `Dataset.iterate_items` call. diff --git a/src/crawlee/storage_clients/_base/_storage_client.py b/src/crawlee/storage_clients/_base/_storage_client.py index 36f9cb7567..745613d068 100644 --- a/src/crawlee/storage_clients/_base/_storage_client.py +++ b/src/crawlee/storage_clients/_base/_storage_client.py @@ -3,6 +3,8 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING +from crawlee._utils.docs import docs_group + if TYPE_CHECKING: from crawlee.configuration import Configuration @@ -11,8 +13,20 @@ from ._request_queue_client import RequestQueueClient +@docs_group('Abstract classes') class StorageClient(ABC): - """Base class for storage clients.""" + """Base class for storage clients. + + The `StorageClient` serves as an abstract base class that defines the interface for accessing Crawlee's + storage types: datasets, key-value stores, and request queues. It provides methods to open clients for + each of these storage types and handles common functionality. 
+ + Storage clients implementations can be provided for various backends (file system, memory, databases, + various cloud providers, etc.) to support different use cases from development to production environments. + + Each storage client implementation is responsible for ensuring proper initialization, data persistence + (where applicable), and consistent access patterns across all storage types it supports. + """ @abstractmethod async def open_dataset_client( @@ -47,3 +61,21 @@ async def open_request_queue_client( def get_rate_limit_errors(self) -> dict[int, int]: """Return statistics about rate limit errors encountered by the HTTP client in storage client.""" return {} + + async def _purge_if_needed( + self, + client: DatasetClient | KeyValueStoreClient | RequestQueueClient, + configuration: Configuration, + ) -> None: + """Purge the client if needed. + + The purge is only performed if the configuration indicates that it should be done and the client + is not a named storage. Named storages are considered global and will typically outlive the run, + so they are not purged. + + Args: + client: The storage client to potentially purge. + configuration: Configuration that determines whether purging should occur. + """ + if configuration.purge_on_start and client.metadata.name is None: + await client.purge() diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index cff991efbb..80aec8923b 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -6,7 +6,7 @@ from datetime import datetime, timezone from logging import getLogger from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from pydantic import ValidationError from typing_extensions import override @@ -19,7 +19,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from typing import Any from crawlee.configuration import Configuration @@ -83,8 +82,8 @@ def __init__( self._lock = asyncio.Lock() """A lock to ensure that only one operation is performed at a time.""" - @override @property + @override def metadata(self) -> DatasetMetadata: return self._metadata @@ -258,7 +257,7 @@ async def get_data( view: str | None = None, ) -> DatasetItemsListPage: # Check for unsupported arguments and log a warning if found. - unsupported_args = { + unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, @@ -307,7 +306,7 @@ async def get_data( selected_files = selected_files[:limit] # Read and parse each data file. - items = [] + items = list[dict[str, Any]]() for file_path in selected_files: try: file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') @@ -353,9 +352,9 @@ async def iterate_items( unwind: str | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict]: + ) -> AsyncIterator[dict[str, Any]]: # Check for unsupported arguments and log a warning if found. 
- unsupported_args = { + unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index d67b0c2258..9af23e67bc 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -79,8 +79,8 @@ def __init__( self._lock = asyncio.Lock() """A lock to ensure that only one operation is performed at a time.""" - @override @property + @override def metadata(self) -> KeyValueStoreMetadata: return self._metadata diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 4cf419fa03..09bd264d0c 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -99,8 +99,8 @@ def __init__( self._sequence_counter = 0 """A counter to track the order of requests added to the queue.""" - @override @property + @override def metadata(self) -> RequestQueueMetadata: return self._metadata diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py index 346fb4cdc2..c4edd6f83c 100644 --- a/src/crawlee/storage_clients/_file_system/_storage_client.py +++ b/src/crawlee/storage_clients/_file_system/_storage_client.py @@ -2,6 +2,7 @@ from typing_extensions import override +from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient @@ -10,8 +11,20 @@ from ._request_queue_client import FileSystemRequestQueueClient +@docs_group('Classes') class FileSystemStorageClient(StorageClient): - """File system storage client.""" + """File system implementation of the storage client. + + This storage client provides access to datasets, key-value stores, and request queues that persist data + to the local file system. Each storage type is implemented with its own specific file system client + that stores data in a structured directory hierarchy. + + Data is stored in JSON format in predictable file paths, making it easy to inspect and manipulate + the stored data outside of the Crawlee application if needed. + + All data persists between program runs but is limited to access from the local machine + where the files are stored. 
+ """ @override async def open_dataset_client( @@ -23,10 +36,7 @@ async def open_dataset_client( ) -> FileSystemDatasetClient: configuration = configuration or Configuration.get_global_configuration() client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start and client.metadata.name is None: - await client.purge() - + await self._purge_if_needed(client, configuration) return client @override @@ -39,10 +49,7 @@ async def open_key_value_store_client( ) -> FileSystemKeyValueStoreClient: configuration = configuration or Configuration.get_global_configuration() client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start and client.metadata.name is None: - await client.purge() - + await self._purge_if_needed(client, configuration) return client @override @@ -55,8 +62,5 @@ async def open_request_queue_client( ) -> FileSystemRequestQueueClient: configuration = configuration or Configuration.get_global_configuration() client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start and client.metadata.name is None: - await client.purge() - + await self._purge_if_needed(client, configuration) return client diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 9a78ac7f6a..e48da40382 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -57,8 +57,8 @@ def __init__( # List to hold dataset items self._records = list[dict[str, Any]]() - @override @property + @override def metadata(self) -> DatasetMetadata: return self._metadata @@ -141,7 +141,7 @@ async def get_data( view: str | None = None, ) -> DatasetItemsListPage: # Check for unsupported arguments and log a warning if found - unsupported_args = { + unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, @@ -196,9 +196,9 @@ async def iterate_items( unwind: str | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict]: + ) -> AsyncIterator[dict[str, Any]]: # Check for unsupported arguments and log a warning if found - unsupported_args = { + unsupported_args: dict[str, Any] = { 'clean': clean, 'fields': fields, 'omit': omit, diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index b527ebc013..d138047549 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -53,8 +53,8 @@ def __init__( # Dictionary to hold key-value records with metadata self._records = dict[str, KeyValueStoreRecord]() - @override @property + @override def metadata(self) -> KeyValueStoreMetadata: return self._metadata diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 0d60011979..a8a9d30adb 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -68,8 +68,8 @@ def __init__( # Dictionary to track in-progress requests (fetched but not yet handled or reclaimed) self._in_progress = dict[str, Request]() - @override @property + @override def metadata(self) -> RequestQueueMetadata: return self._metadata diff --git 
a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py index ad34d99638..d23458c9f9 100644 --- a/src/crawlee/storage_clients/_memory/_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_storage_client.py @@ -2,6 +2,7 @@ from typing_extensions import override +from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient @@ -10,8 +11,21 @@ from ._request_queue_client import MemoryRequestQueueClient +@docs_group('Classes') class MemoryStorageClient(StorageClient): - """Memory storage client.""" + """Memory implementation of the storage client. + + This storage client provides access to datasets, key-value stores, and request queues that store all data + in memory using Python data structures (lists and dictionaries). No data is persisted between process runs, + meaning all stored data is lost when the program terminates. + + The memory implementation provides fast access to data but is limited by available memory and does not + support data sharing across different processes. All storage operations happen entirely in memory with + no disk operations. + + The memory storage client is useful for testing and development environments, or short-lived crawler + operations where persistence is not required. + """ @override async def open_dataset_client( @@ -23,10 +37,7 @@ async def open_dataset_client( ) -> MemoryDatasetClient: configuration = configuration or Configuration.get_global_configuration() client = await MemoryDatasetClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start and client.metadata.name is None: - await client.purge() - + await self._purge_if_needed(client, configuration) return client @override @@ -39,10 +50,7 @@ async def open_key_value_store_client( ) -> MemoryKeyValueStoreClient: configuration = configuration or Configuration.get_global_configuration() client = await MemoryKeyValueStoreClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start and client.metadata.name is None: - await client.purge() - + await self._purge_if_needed(client, configuration) return client @override @@ -55,8 +63,5 @@ async def open_request_queue_client( ) -> MemoryRequestQueueClient: configuration = configuration or Configuration.get_global_configuration() client = await MemoryRequestQueueClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start and client.metadata.name is None: - await client.purge() - + await self._purge_if_needed(client, configuration) return client diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index a950a78dc9..a5ac8834a9 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -84,18 +84,18 @@ def __init__(self, client: DatasetClient) -> None: """ self._client = client - @override @property + @override def id(self) -> str: return self._client.metadata.id - @override @property + @override def name(self) -> str | None: return self._client.metadata.name - @override @property + @override def metadata(self) -> DatasetMetadata: return self._client.metadata @@ -233,7 +233,7 @@ async def iterate_items( unwind: str | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict]: + ) -> AsyncIterator[dict[str, Any]]: """Iterate over items in the dataset according to specified filters and sorting. 
This method allows for asynchronously iterating through dataset items while applying various filters such as @@ -281,7 +281,7 @@ async def list_items( unwind: str | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> list[dict]: + ) -> list[dict[str, Any]]: """Retrieve a list of all items from the dataset according to specified filters and sorting. This method collects all dataset items into a list while applying various filters such as diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index c24e9a5418..e3413021d9 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -93,18 +93,18 @@ def __init__(self, client: KeyValueStoreClient) -> None: self._autosave_lock = asyncio.Lock() self._persist_state_event_started = False - @override @property + @override def id(self) -> str: return self._client.metadata.id - @override @property + @override def name(self) -> str | None: return self._client.metadata.name - @override @property + @override def metadata(self) -> KeyValueStoreMetadata: return self._client.metadata diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index c9fa3a1bff..66231cc138 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -92,28 +92,28 @@ def __init__(self, client: RequestQueueClient) -> None: self._add_requests_tasks = list[asyncio.Task]() """A list of tasks for adding requests to the queue.""" - @override @property + @override def id(self) -> str: return self._client.metadata.id - @override @property + @override def name(self) -> str | None: return self._client.metadata.name - @override @property + @override def metadata(self) -> RequestQueueMetadata: return self._client.metadata - @override @property + @override async def handled_count(self) -> int: return self._client.metadata.handled_request_count - @override @property + @override async def total_count(self) -> int: return self._client.metadata.total_request_count From 26f46e2504a05ebcf0a4e74364c83f27b90df0f8 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 20 May 2025 15:02:12 +0200 Subject: [PATCH 05/43] Address more feedback --- src/crawlee/_types.py | 2 +- .../_file_system/_key_value_store_client.py | 12 +- .../_file_system/_request_queue_client.py | 472 ++++++++++-------- src/crawlee/storages/_key_value_store.py | 1 - .../_file_system/test_fs_kvs_client.py | 29 +- 5 files changed, 296 insertions(+), 220 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 5aade878eb..6dd758958c 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -610,7 +610,7 @@ class GetDataKwargs(TypedDict): """The maximum number of items to retrieve. Unlimited if None.""" clean: NotRequired[bool] - """Return only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.""" + """Return only non-empty items and excludes hidden fields. 
Shortcut for `skip_hidden` and `skip_empty`.""" desc: NotRequired[bool] """Set to True to sort results in descending order.""" diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 9af23e67bc..fe99dad08d 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -419,7 +419,17 @@ async def iterate_keys( @override async def get_public_url(self, *, key: str) -> str: - raise NotImplementedError('Public URLs are not supported for file system key-value stores.') + """Return a file:// URL for the given key. + + Args: + key: The key to get the public URL for. + + Returns: + A file:// URL pointing to the file on the local filesystem. + """ + record_path = self.path_to_kvs / self._encode_key(key) + absolute_path = record_path.absolute() + return absolute_path.as_uri() async def _update_metadata( self, diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 09bd264d0c..407bf8df57 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -37,9 +37,9 @@ class FileSystemRequestQueueClient(RequestQueueClient): {STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json ``` - The implementation uses file timestamps for FIFO ordering of regular requests and maintains in-memory sets - for tracking in-progress and forefront requests. File system storage provides durability at the cost of - slower I/O operations compared to memory-based storage. + The implementation uses sequence numbers embedded in request files for FIFO ordering of regular requests. + It maintains in-memory data structures for tracking in-progress requests and prioritizing forefront requests. + File system storage provides durability at the cost of slower I/O operations compared to memory-based storage. This implementation is ideal for long-running crawlers where persistence is important and for situations where you need to resume crawling after process termination. @@ -273,40 +273,32 @@ async def add_batch_of_requests( async with self._lock: new_total_request_count = self._metadata.total_request_count new_pending_request_count = self._metadata.pending_request_count - - processed_requests = [] - - # Create the requests directory if it doesn't exist - await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) + processed_requests = list[ProcessedRequest]() for request in requests: - # Check if the request is already in the queue by unique_key + # Go through existing requests to find if the request already exists in the queue. 
+ existing_request_files = await self._get_request_files() existing_request = None - # List all request files and check for matching unique_key - request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) - for request_file in request_files: - # Skip metadata file - if request_file.name == METADATA_FILENAME: + for existing_request_file in existing_request_files: + existing_request = await self._parse_request_file(existing_request_file) + + if existing_request is None: continue - file = await asyncio.to_thread(open, request_file) - try: - file_content = json.load(file) - if file_content.get('unique_key') == request.unique_key: - existing_request = Request(**file_content) - break - except (json.JSONDecodeError, ValidationError): - logger.warning(f'Failed to parse request file: {request_file}') - finally: - await asyncio.to_thread(file.close) + # If the unique key matches, we found an existing request + if existing_request.unique_key == request.unique_key: + break + + existing_request = None + # Set the processed request flags. was_already_present = existing_request is not None was_already_handled = ( was_already_present and existing_request and existing_request.handled_at is not None ) - # If the request is already in the queue and handled, don't add it again + # If the request is already in the queue and handled, do not enqueue it again. if was_already_handled and existing_request: processed_requests.append( ProcessedRequest( @@ -318,7 +310,7 @@ async def add_batch_of_requests( ) continue - # If forefront and existing request is not handled, mark it as forefront + # If forefront and existing request is not handled, mark it as forefront. if forefront and was_already_present and not was_already_handled and existing_request: self._forefront_requests.insert(0, existing_request.id) processed_requests.append( @@ -331,7 +323,7 @@ async def add_batch_of_requests( ) continue - # If the request is already in the queue but not handled, update it + # If the request is already in the queue but not handled, update it. if was_already_present and existing_request: # Update the existing request file request_path = self.path_to_rq / f'{existing_request.id}.json' @@ -359,7 +351,7 @@ async def add_batch_of_requests( # Add sequence number to ensure FIFO ordering sequence_number = self._sequence_counter self._sequence_counter += 1 - request_dict['_sequence'] = sequence_number + request_dict['sequence'] = sequence_number request_data = await json_dumps(request_dict) await atomic_write(request_path, request_data) @@ -404,27 +396,10 @@ async def get_request(self, request_id: str) -> Request | None: The retrieved request, or None, if it did not exist. 
""" request_path = self.path_to_rq / f'{request_id}.json' - - try: - file = await asyncio.to_thread(open, request_path) - except FileNotFoundError: - logger.warning(f'Request file "{request_path}" not found.') - return None - - try: - file_content = json.load(file) - except json.JSONDecodeError as exc: - logger.warning(f'Failed to parse request file {request_path}: {exc!s}') - return None - finally: - await asyncio.to_thread(file.close) - - try: - return Request(**file_content) - except ValidationError as exc: - logger.warning(f'Failed to validate request file {request_path}: {exc!s}') - - return None + request = await self._parse_request_file(request_path) + if request is None: + logger.warning(f'Request with ID "{request_id}" not found in the queue.') + return request @override async def fetch_next_request(self) -> Request | None: @@ -442,152 +417,21 @@ async def fetch_next_request(self) -> Request | None: # Create the requests directory if it doesn't exist await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) - # First check forefront requests in the exact order they were added - for request_id in list(self._forefront_requests): - # Skip if already in progress - if request_id in self._in_progress: - continue - - request_path = self.path_to_rq / f'{request_id}.json' - - # Skip if file doesn't exist - if not await asyncio.to_thread(request_path.exists): - self._forefront_requests.remove(request_id) - continue - - file = await asyncio.to_thread(open, request_path) - try: - file_content = json.load(file) - # Skip if already handled - if file_content.get('handled_at') is not None: - self._forefront_requests.remove(request_id) - continue - - # Create request object - request = Request(**file_content) - - # Mark as in-progress in memory - self._in_progress.add(request.id) - - # Remove from forefront list - self._forefront_requests.remove(request.id) - - # Update accessed timestamp - await self._update_metadata(update_accessed_at=True) - except (json.JSONDecodeError, ValidationError) as exc: - logger.warning(f'Failed to parse request file {request_path}: {exc!s}') - self._forefront_requests.remove(request_id) - else: - return request - finally: - await asyncio.to_thread(file.close) - - # List all request files for regular (non-forefront) requests - request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) - - # Dictionary to store request files by their sequence number - request_sequences = {} - requests_without_sequence = [] + # First try forefront requests (highest priority) + forefront_request = await self._try_get_forefront_request() + if forefront_request is not None: + return forefront_request - # Filter out metadata files and in-progress requests - for request_file in request_files: - # Skip metadata file - if request_file.name == METADATA_FILENAME: - continue - - # Extract request ID from filename - request_id = request_file.stem - - # Skip if already in progress or in forefront - if request_id in self._in_progress or request_id in self._forefront_requests: - continue - - # Read the file to get the sequence number - try: - file = await asyncio.to_thread(open, request_file) - try: - file_content = json.load(file) - # Skip if already handled - if file_content.get('handled_at') is not None: - continue - - # Use sequence number for ordering if available - sequence_number = file_content.get('_sequence') - if sequence_number is not None: - request_sequences[sequence_number] = request_file - else: - # For backward compatibility with existing files - 
requests_without_sequence.append(request_file) - finally: - await asyncio.to_thread(file.close) - except (json.JSONDecodeError, ValidationError) as exc: - logger.warning(f'Failed to parse request file {request_file}: {exc!s}') + # Collect and categorize regular requests + request_sequences, requests_without_sequence = await self._categorize_regular_requests() - # Process requests with sequence numbers first, in FIFO order - for sequence in sorted(request_sequences.keys()): - request_file = request_sequences[sequence] - file = await asyncio.to_thread(open, request_file) - try: - file_content = json.load(file) - # Skip if already handled (double-check) - if file_content.get('handled_at') is not None: - continue + # Try to get a request with a sequence number (FIFO order) + sequenced_request = await self._try_get_sequenced_request(request_sequences) + if sequenced_request is not None: + return sequenced_request - # Create request object - request = Request(**file_content) - - # Mark as in-progress in memory - self._in_progress.add(request.id) - - # Update accessed timestamp - await self._update_metadata(update_accessed_at=True) - except (json.JSONDecodeError, ValidationError) as exc: - logger.warning(f'Failed to parse request file {request_file}: {exc!s}') - else: - return request - finally: - await asyncio.to_thread(file.close) - - # Process requests without sequence numbers using file timestamps (backward compatibility) - if requests_without_sequence: - # Get file creation times for sorting - request_file_times = {} - for request_file in requests_without_sequence: - try: - file_stat = await asyncio.to_thread(request_file.stat) - request_file_times[request_file] = file_stat.st_mtime - except Exception: # noqa: PERF203 - # If we can't get the time, use 0 (oldest) - request_file_times[request_file] = 0 - - # Sort by creation time - requests_without_sequence.sort(key=lambda f: request_file_times[f]) - - # Process requests without sequence in file timestamp order - for request_file in requests_without_sequence: - file = await asyncio.to_thread(open, request_file) - try: - file_content = json.load(file) - # Skip if already handled - if file_content.get('handled_at') is not None: - continue - - # Create request object - request = Request(**file_content) - - # Mark as in-progress in memory - self._in_progress.add(request.id) - - # Update accessed timestamp - await self._update_metadata(update_accessed_at=True) - except (json.JSONDecodeError, ValidationError) as exc: - logger.warning(f'Failed to parse request file {request_file}: {exc!s}') - else: - return request - finally: - await asyncio.to_thread(file.close) - - return None + # Fall back to requests without sequence numbers (using file timestamps) + return await self._try_get_non_sequenced_request(requests_without_sequence) @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: @@ -622,7 +466,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | request_data = await json_dumps(request.model_dump()) await atomic_write(request_path, request_data) - # Update metadata timestamps + # Update RQ metadata await self._update_metadata( update_modified_at=True, update_accessed_at=True, @@ -701,33 +545,17 @@ async def is_empty(self) -> bool: # Update accessed timestamp when checking if queue is empty await self._update_metadata(update_accessed_at=True) - # Create the requests directory if it doesn't exist - await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) - - 
# List all request files - request_files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) + request_files = await self._get_request_files() # Check each file to see if there are any unhandled requests for request_file in request_files: - # Skip metadata file - if request_file.name == METADATA_FILENAME: - continue + request = await self._parse_request_file(request_file) - try: - file = await asyncio.to_thread(open, request_file) - except FileNotFoundError: - logger.warning(f'Request file "{request_file}" not found.') + if request is None: continue - try: - file_content = json.load(file) - except json.JSONDecodeError: - logger.warning(f'Failed to parse request file: {request_file}') - finally: - await asyncio.to_thread(file.close) - # If any request is not handled, the queue is not empty - if file_content.get('handled_at') is None: + if request.handled_at is None: return False # If we got here, all requests are handled or there are no requests @@ -782,3 +610,221 @@ async def _update_metadata( # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) await atomic_write(self.path_to_metadata, data) + + async def _try_get_forefront_request(self) -> Request | None: + """Try to get the next available forefront request. + + Returns: + The next forefront request or None if no forefront requests are available. + """ + for request_id in list(self._forefront_requests): + # Skip if already in progress + if request_id in self._in_progress: + continue + + request_path = self.path_to_rq / f'{request_id}.json' + + # Skip if file doesn't exist + if not await asyncio.to_thread(request_path.exists): + self._forefront_requests.remove(request_id) + continue + + # Parse the request file + request = await self._parse_request_file(request_path) + + # Skip if parsing failed + if request is None: + self._forefront_requests.remove(request_id) + continue + + # Skip if already handled + if request.handled_at is not None: + self._forefront_requests.remove(request_id) + continue + + # Mark as in-progress in memory + self._in_progress.add(request.id) + + # Remove from forefront list + self._forefront_requests.remove(request.id) + + # Update accessed timestamp + await self._update_metadata(update_accessed_at=True) + + return request + + return None + + async def _categorize_regular_requests(self) -> tuple[dict[int, Path], list[Path]]: + """Categorize regular (non-forefront) requests by sequence number. 
+ + Returns: + A tuple containing: + - Dictionary mapping sequence numbers to request file paths + - List of request file paths without sequence numbers + """ + # List all request files for regular (non-forefront) requests + request_files = await self._get_request_files() + + # Dictionary to store request files by their sequence number + request_sequences = {} + requests_without_sequence = [] + + # Filter out metadata files and in-progress requests + for request_file in request_files: + # Extract request ID from filename + request_id = request_file.stem + + # Skip if already in progress or in forefront + if request_id in self._in_progress or request_id in self._forefront_requests: + continue + + request = await self._parse_request_file(request_file) + + if request is None: + continue + + # Skip if already handled + if request.handled_at is not None: + continue + + sequence_number = None if request.model_extra is None else request.model_extra.get('sequence') + + # If the request has a sequence number, add it to the dictionary + if sequence_number: + request_sequences[sequence_number] = request_file + else: + # If no sequence number, add to the list for ordering by file timestamp + requests_without_sequence.append(request_file) + + return request_sequences, requests_without_sequence + + async def _try_get_sequenced_request(self, request_sequences: dict[int, Path]) -> Request | None: + """Try to get the next request with a sequence number in FIFO order. + + Args: + request_sequences: Dictionary mapping sequence numbers to request file paths + + Returns: + The next sequenced request or None if no valid sequenced requests are available + """ + # Process requests with sequence numbers first, in FIFO order + for sequence in sorted(request_sequences.keys()): + request_file = request_sequences[sequence] + + # Parse the request file + request = await self._parse_request_file(request_file) + + # Skip if parsing failed + if request is None: + continue + + # Skip if already handled (double-check) + if request.handled_at is not None: + continue + + # Mark as in-progress in memory + self._in_progress.add(request.id) + + # Update accessed timestamp + await self._update_metadata(update_accessed_at=True) + + return request + + return None + + async def _try_get_non_sequenced_request(self, requests_without_sequence: list[Path]) -> Request | None: + """Try to get the next request without a sequence number, using file timestamps for ordering. 
+ + Args: + requests_without_sequence: List of request file paths without sequence numbers + + Returns: + The next non-sequenced request or None if no valid non-sequenced requests are available + """ + if not requests_without_sequence: + return None + + # Get file creation times for sorting + request_file_times = {} + for request_file in requests_without_sequence: + try: + file_stat = await asyncio.to_thread(request_file.stat) + request_file_times[request_file] = file_stat.st_mtime + except Exception: # noqa: PERF203 + # If we can't get the time, use 0 (oldest) + request_file_times[request_file] = 0 + + # Sort by creation time + requests_without_sequence.sort(key=lambda f: request_file_times[f]) + + # Process requests without sequence in file timestamp order + for request_file in requests_without_sequence: + request = await self._parse_request_file(request_file) + + if request is None: + continue + + if request.handled_at is not None: + continue + + # Mark as in-progress in memory + self._in_progress.add(request.id) + + # Update accessed timestamp + await self._update_metadata(update_accessed_at=True) + return request + + return None + + async def _get_request_files(self) -> list[Path]: + """Get all request files in the queue. + + Returns: + A list of paths to all request files in the queue. + """ + # Create the requests directory if it doesn't exist. + await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) + + # List all the json files. + files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) + + # Filter out metadata file and non-file entries. + filtered = filter( + lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, + files, + ) + + return list(filtered) + + async def _parse_request_file(self, file_path: Path) -> Request | None: + """Parse a request file and return the `Request` object. + + Args: + file_path: The path to the request file. + + Returns: + The parsed `Request` object or `None` if the file could not be read or parsed. + """ + # Open the request file. + try: + file = await asyncio.to_thread(open, file_path) + except FileNotFoundError: + logger.warning(f'Request file "{file_path}" not found.') + return None + + # Read the file content and parse it as JSON. + try: + file_content = json.load(file) + except json.JSONDecodeError as exc: + logger.warning(f'Failed to parse request file {file_path}: {exc!s}') + return None + finally: + await asyncio.to_thread(file.close) + + # Validate the content against the Request model. 
+ try: + return Request.model_validate(file_content) + except ValidationError as exc: + logger.warning(f'Failed to validate request file {file_path}: {exc!s}') + return None diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index e3413021d9..6fc370dcb2 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -91,7 +91,6 @@ def __init__(self, client: KeyValueStoreClient) -> None: """ self._client = client self._autosave_lock = asyncio.Lock() - self._persist_state_event_started = False @property @override diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index e2df900d0f..0f0a31e9d9 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -2,6 +2,7 @@ import asyncio import json +import urllib.parse from datetime import datetime from typing import TYPE_CHECKING @@ -369,10 +370,30 @@ async def test_metadata_updates(kvs_client: FileSystemKeyValueStoreClient) -> No assert kvs_client.metadata.accessed_at > accessed_after_get -async def test_get_public_url_not_supported(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that get_public_url raises NotImplementedError for the file system implementation.""" - with pytest.raises(NotImplementedError, match='Public URLs are not supported'): - await kvs_client.get_public_url(key='any-key') +async def test_get_public_url(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that get_public_url returns a valid file:// URL for the given key.""" + # Set a value first to ensure the file exists + test_key = 'test-url-key' + test_value = 'Test URL value' + await kvs_client.set_value(key=test_key, value=test_value) + + # Get the URL + url = await kvs_client.get_public_url(key=test_key) + + # Verify it's a valid file:// URL + assert url.startswith('file:///') + + # The encoded key name should be in the URL + encoded_key = urllib.parse.quote(test_key, safe='') + assert encoded_key in url + + # Verify the path in the URL points to the actual file + file_path = kvs_client.path_to_kvs / encoded_key + assert file_path.exists() + + # Verify file content without using urlopen (avoiding blocking IO) + content = file_path.read_text(encoding='utf-8') + assert content == test_value async def test_concurrent_operations(kvs_client: FileSystemKeyValueStoreClient) -> None: From c83a36a76135a15a1fcfccda984a3239ccad86d9 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 4 Jun 2025 14:25:16 +0200 Subject: [PATCH 06/43] RQ FS client improvements --- Makefile | 6 +- src/crawlee/_request.py | 39 +- .../_file_system/_request_queue_client.py | 566 ++++++++---------- .../_memory/_request_queue_client.py | 17 +- .../_file_system/test_fs_rq_client.py | 5 +- 5 files changed, 293 insertions(+), 340 deletions(-) diff --git a/Makefile b/Makefile index 4954cb7afd..4dba98a4e9 100644 --- a/Makefile +++ b/Makefile @@ -30,13 +30,13 @@ type-check: uv run mypy unit-tests: - uv run pytest --numprocesses=auto --verbose --cov=src/crawlee tests/unit + uv run pytest --numprocesses=auto -vv --cov=src/crawlee tests/unit unit-tests-cov: - uv run pytest --numprocesses=auto --verbose --cov=src/crawlee --cov-report=html tests/unit + uv run pytest --numprocesses=auto -vv --cov=src/crawlee --cov-report=html tests/unit e2e-templates-tests $(args): - uv run pytest --numprocesses=$(E2E_TESTS_CONCURRENCY) --verbose 
tests/e2e/project_template "$(args)" + uv run pytest --numprocesses=$(E2E_TESTS_CONCURRENCY) -vv tests/e2e/project_template "$(args)" format: uv run ruff check --fix diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index adb43949ea..3637ca70c2 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -158,7 +158,23 @@ class Request(BaseModel): ``` """ - model_config = ConfigDict(populate_by_name=True) + model_config = ConfigDict(populate_by_name=True, extra='allow') + + id: str + """A unique identifier for the request. Note that this is not used for deduplication, and should not be confused + with `unique_key`.""" + + unique_key: Annotated[str, Field(alias='uniqueKey')] + """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing + to the same URL. + + If `unique_key` is not provided, then it is automatically generated by normalizing the URL. + For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key` + of `http://www.example.com/something`. + + Pass an arbitrary non-empty text value to the `unique_key` property to override the default behavior + and specify which URLs shall be considered equal. + """ url: Annotated[str, BeforeValidator(validate_http_url), Field()] """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters @@ -207,22 +223,6 @@ class Request(BaseModel): handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None """Timestamp when the request was handled.""" - unique_key: Annotated[str, Field(alias='uniqueKey')] - """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing - to the same URL. - - If `unique_key` is not provided, then it is automatically generated by normalizing the URL. - For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key` - of `http://www.example.com/something`. - - Pass an arbitrary non-empty text value to the `unique_key` property - to override the default behavior and specify which URLs shall be considered equal. - """ - - id: str - """A unique identifier for the request. 
Note that this is not used for deduplication, and should not be confused - with `unique_key`.""" - @classmethod def from_url( cls, @@ -398,6 +398,11 @@ def forefront(self) -> bool: def forefront(self, new_value: bool) -> None: self.crawlee_data.forefront = new_value + @property + def was_already_handled(self) -> bool: + """Indicates whether the request was handled.""" + return self.handled_at is not None + class RequestWithLock(Request): """A crawling request with information about locks.""" diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 407bf8df57..5c01c14d5e 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -16,7 +16,12 @@ from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.file import atomic_write, json_dumps from crawlee.storage_clients._base import RequestQueueClient -from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata +from crawlee.storage_clients.models import ( + AddRequestsResponse, + ProcessedRequest, + RequestQueueMetadata, + UnprocessedRequest, +) if TYPE_CHECKING: from collections.abc import Sequence @@ -65,6 +70,8 @@ def __init__( stats: dict, total_request_count: int, storage_dir: Path, + sequence_counter: int, + forefront_sequence_counter: int, ) -> None: """Initialize a new instance. @@ -84,21 +91,20 @@ def __init__( ) self._storage_dir = storage_dir + """The base directory where the request queue is stored.""" + + self._sequence_counter = sequence_counter + """A counter to track the order of (normal) requests added to the queue.""" + + self._forefront_sequence_counter = forefront_sequence_counter + """A counter to track the order of forefront requests added to the queue.""" - # Internal attributes self._lock = asyncio.Lock() """A lock to ensure that only one operation is performed at a time.""" self._in_progress = set[str]() """A set of request IDs that are currently being processed.""" - self._forefront_requests = list[str]() - """A list of request IDs that should be prioritized (added with forefront=True). - Most recent forefront requests are added at the beginning of the list.""" - - self._sequence_counter = 0 - """A counter to track the order of requests added to the queue.""" - @property @override def metadata(self) -> RequestQueueMetadata: @@ -132,7 +138,7 @@ async def open( if not rq_base_path.exists(): await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True) - # Get a new instance by ID. + # Open an existing RQ by its ID, raise an error if not found. 
if id: found = False for rq_dir in rq_base_path.iterdir(): @@ -148,19 +154,20 @@ async def open( try: file_content = json.load(file) metadata = RequestQueueMetadata(**file_content) + + rq_path = ( + rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT + if metadata.name is None + else rq_base_path / metadata.name + ) + sequence_counter, forefront_sequence_counter = await cls._get_sequence_counters(rq_path) + if metadata.id == id: client = cls( - id=metadata.id, - name=metadata.name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, - had_multiple_clients=metadata.had_multiple_clients, - handled_request_count=metadata.handled_request_count, - pending_request_count=metadata.pending_request_count, - stats=metadata.stats, - total_request_count=metadata.total_request_count, + **metadata.model_dump(), storage_dir=storage_dir, + sequence_counter=sequence_counter, + forefront_sequence_counter=forefront_sequence_counter, ) await client._update_metadata(update_accessed_at=True) found = True @@ -173,7 +180,7 @@ async def open( if not found: raise ValueError(f'Request queue with ID "{id}" not found') - # Get a new instance by name. + # Open an existing RQ by its name, or create a new one if not found. else: rq_path = rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else rq_base_path / name metadata_path = rq_path / METADATA_FILENAME @@ -194,18 +201,14 @@ async def open( except ValidationError as exc: raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc + metadata.name = name + sequence_counter, forefront_sequence_counter = await cls._get_sequence_counters(rq_path) + client = cls( - id=metadata.id, - name=name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, - had_multiple_clients=metadata.had_multiple_clients, - handled_request_count=metadata.handled_request_count, - pending_request_count=metadata.pending_request_count, - stats=metadata.stats, - total_request_count=metadata.total_request_count, + **metadata.model_dump(), storage_dir=storage_dir, + sequence_counter=sequence_counter, + forefront_sequence_counter=forefront_sequence_counter, ) await client._update_metadata(update_accessed_at=True) @@ -225,6 +228,8 @@ async def open( stats={}, total_request_count=0, storage_dir=storage_dir, + sequence_counter=0, + forefront_sequence_counter=0, ) await client._update_metadata() @@ -240,9 +245,9 @@ async def drop(self) -> None: @override async def purge(self) -> None: async with self._lock: - for file_path in self.path_to_rq.glob('*'): - if file_path.name == METADATA_FILENAME: - continue + request_files = await self._get_request_files(self.path_to_rq) + + for file_path in request_files: await asyncio.to_thread(file_path.unlink) # Update metadata counts @@ -274,12 +279,13 @@ async def add_batch_of_requests( new_total_request_count = self._metadata.total_request_count new_pending_request_count = self._metadata.pending_request_count processed_requests = list[ProcessedRequest]() + unprocessed_requests = list[UnprocessedRequest]() for request in requests: - # Go through existing requests to find if the request already exists in the queue. - existing_request_files = await self._get_request_files() + existing_request_files = await self._get_request_files(self.path_to_rq) existing_request = None + # Go through existing requests to find if the request already exists in the queue. 
for existing_request_file in existing_request_files: existing_request = await self._parse_request_file(existing_request_file) @@ -292,86 +298,81 @@ async def add_batch_of_requests( existing_request = None - # Set the processed request flags. - was_already_present = existing_request is not None - was_already_handled = ( - was_already_present and existing_request and existing_request.handled_at is not None - ) + # If there is no existing request with the same unique key, add the new request. + if existing_request is None: + request_path = self._get_request_path(request.id) + + # Add sequence number to ensure FIFO ordering. + if forefront: + sequence_number = self._forefront_sequence_counter + self._forefront_sequence_counter += 1 + else: + sequence_number = self._sequence_counter + self._sequence_counter += 1 + + # Update the request data and dump it to the file. + request_dict = request.model_dump() + request_dict['__sequence'] = sequence_number + request_dict['__forefront'] = forefront + request_data = await json_dumps(request_dict) + await atomic_write(request_path, request_data) - # If the request is already in the queue and handled, do not enqueue it again. - if was_already_handled and existing_request: - processed_requests.append( - ProcessedRequest( - id=existing_request.id, - unique_key=request.unique_key, - was_already_present=True, - was_already_handled=True, - ) - ) - continue + # Update the metadata counts. + new_total_request_count += 1 + new_pending_request_count += 1 - # If forefront and existing request is not handled, mark it as forefront. - if forefront and was_already_present and not was_already_handled and existing_request: - self._forefront_requests.insert(0, existing_request.id) processed_requests.append( ProcessedRequest( - id=existing_request.id, + id=request.id, unique_key=request.unique_key, - was_already_present=True, + was_already_present=False, was_already_handled=False, ) ) - continue - - # If the request is already in the queue but not handled, update it. - if was_already_present and existing_request: - # Update the existing request file - request_path = self.path_to_rq / f'{existing_request.id}.json' - request_data = await json_dumps(existing_request.model_dump()) - await atomic_write(request_path, request_data) - processed_requests.append( - ProcessedRequest( - id=existing_request.id, - unique_key=request.unique_key, - was_already_present=True, - was_already_handled=False, + # If the request already exists, we need to update it. + else: + # Set the processed request flags. + was_already_present = existing_request is not None + was_already_handled = existing_request.was_already_handled if existing_request else False + + # If the request is already in the RQ and handled, just continue with the next one. 
+ if was_already_present and was_already_handled: + processed_requests.append( + ProcessedRequest( + id=existing_request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=True, + ) ) - ) - continue - - # Add the new request to the queue - request_path = self.path_to_rq / f'{request.id}.json' - # Create a data dictionary from the request and remove handled_at if it's None - request_dict = request.model_dump() - if request_dict.get('handled_at') is None: - request_dict.pop('handled_at', None) - - # Add sequence number to ensure FIFO ordering - sequence_number = self._sequence_counter - self._sequence_counter += 1 - request_dict['sequence'] = sequence_number - - request_data = await json_dumps(request_dict) - await atomic_write(request_path, request_data) - - # Update metadata counts - new_total_request_count += 1 - new_pending_request_count += 1 - - # If forefront, add to the forefront list - if forefront: - self._forefront_requests.insert(0, request.id) + # If the request is already in the RQ but not handled yet, update it. + elif was_already_present and not was_already_handled: + request_path = self._get_request_path(request.id) + request_dict = existing_request.model_dump() + request_dict['__forefront'] = forefront + request_data = await json_dumps(existing_request.model_dump()) + await atomic_write(request_path, request_data) + + processed_requests.append( + ProcessedRequest( + id=existing_request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ) + ) - processed_requests.append( - ProcessedRequest( - id=request.id, - unique_key=request.unique_key, - was_already_present=False, - was_already_handled=False, - ) - ) + else: + logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.') + unprocessed_requests.append( + UnprocessedRequest( + unique_key=request.unique_key, + url=request.url, + method=request.method, + ) + ) await self._update_metadata( update_modified_at=True, @@ -382,7 +383,7 @@ async def add_batch_of_requests( return AddRequestsResponse( processed_requests=processed_requests, - unprocessed_requests=[], + unprocessed_requests=unprocessed_requests, ) @override @@ -395,10 +396,12 @@ async def get_request(self, request_id: str) -> Request | None: Returns: The retrieved request, or None, if it did not exist. """ - request_path = self.path_to_rq / f'{request_id}.json' + request_path = self._get_request_path(request_id) request = await self._parse_request_file(request_path) + if request is None: logger.warning(f'Request with ID "{request_id}" not found in the queue.') + return request @override @@ -414,24 +417,74 @@ async def fetch_next_request(self) -> Request | None: The request or `None` if there are no more pending requests. 
""" async with self._lock: - # Create the requests directory if it doesn't exist - await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) + request_files = await self._get_request_files(self.path_to_rq) - # First try forefront requests (highest priority) - forefront_request = await self._try_get_forefront_request() - if forefront_request is not None: - return forefront_request + requests = list[Request]() + forefront_requests = list[Request]() - # Collect and categorize regular requests - request_sequences, requests_without_sequence = await self._categorize_regular_requests() + for request_file in request_files: + request = await self._parse_request_file(request_file) + + if request is None: + continue + + if request.was_already_handled: + # If the request was already handled, skip it. + continue + + if request.model_extra is None: + raise ValueError(f'Request file "{request_file}" does not contain "model_extra" data.') + + forefront = request.model_extra.get('__forefront') + + if forefront is None: + raise ValueError(f'Request file "{request_file}" does not contain the __forefront flag.') + + if forefront is True: + forefront_requests.append(request) + else: + requests.append(request) + + # Sort requests by their sequence numbers. + forefront_requests.sort( + key=lambda r: r.model_extra.get('__sequence', 0) if r.model_extra else 0, + reverse=True, + ) + requests.sort( + key=lambda r: r.model_extra.get('__sequence', 0) if r.model_extra else 0, + reverse=False, + ) + + next_request: Request | None = None + + while next_request is None: + if forefront_requests: + next_request = forefront_requests.pop(0) + + if next_request.id in self._in_progress: + # If the request is already in progress, skip it. + next_request = None + continue + + # Mark the request as in progress + self._in_progress.add(next_request.id) + + elif requests: + next_request = requests.pop(0) + + if next_request.id in self._in_progress: + # If the request is already in progress, skip it + next_request = None + continue + + # Mark the request as in progress + self._in_progress.add(next_request.id) - # Try to get a request with a sequence number (FIFO order) - sequenced_request = await self._try_get_sequenced_request(request_sequences) - if sequenced_request is not None: - return sequenced_request + else: + # No more requests available, break out of the loop + break - # Fall back to requests without sequence numbers (using file timestamps) - return await self._try_get_non_sequenced_request(requests_without_sequence) + return next_request @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: @@ -458,7 +511,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | request.handled_at = datetime.now(timezone.utc) # Write the updated request back to the requests directory - request_path = self.path_to_rq / f'{request.id}.json' + request_path = self._get_request_path(request.id) if not await asyncio.to_thread(request_path.exists): return None @@ -500,28 +553,32 @@ async def reclaim_request( Information about the queue operation. `None` if the given request was not in progress. """ async with self._lock: - # Check if the request is in progress + # Check if the request is in progress. 
if request.id not in self._in_progress: + logger.info(f'Reclaiming request {request.id} that is not in progress.') return None - # Remove from in-progress set self._in_progress.discard(request.id) - # If forefront is true, mark this request as priority - if forefront: - self._forefront_requests.insert(0, request.id) - # Make sure it's not in the forefront list if it was previously added there - elif request.id in self._forefront_requests: - self._forefront_requests.remove(request.id) - - # To simulate changing the file timestamp for FIFO ordering, - # we'll update the file with current timestamp - request_path = self.path_to_rq / f'{request.id}.json' + request_path = self._get_request_path(request.id) if not await asyncio.to_thread(request_path.exists): return None - request_data = await json_dumps(request.model_dump()) + # Update the request file with the new forefront status and sequence number + request_dict = request.model_dump() + request_dict['__forefront'] = forefront + + # Update sequence number to ensure proper ordering. + if forefront: + sequence_number = self._forefront_sequence_counter + self._forefront_sequence_counter += 1 + else: + sequence_number = self._sequence_counter + self._sequence_counter += 1 + + request_dict['__sequence'] = sequence_number + request_data = await json_dumps(request_dict) await atomic_write(request_path, request_data) # Update metadata timestamps @@ -542,25 +599,34 @@ async def is_empty(self) -> bool: True if the queue is empty, False otherwise. """ async with self._lock: - # Update accessed timestamp when checking if queue is empty await self._update_metadata(update_accessed_at=True) + request_files = await self._get_request_files(self.path_to_rq) - request_files = await self._get_request_files() - - # Check each file to see if there are any unhandled requests + # Check each file to see if there are any unhandled requests. for request_file in request_files: request = await self._parse_request_file(request_file) if request is None: continue - # If any request is not handled, the queue is not empty + # If any request is not handled, the queue is not empty. if request.handled_at is None: return False - # If we got here, all requests are handled or there are no requests + # If we got here, all requests are handled or there are no requests. return True + def _get_request_path(self, request_id: str) -> Path: + """Get the path to a specific request file. + + Args: + request_id: The ID of the request. + + Returns: + The path to the request file. + """ + return self.path_to_rq / f'{request_id}.json' + async def _update_metadata( self, *, @@ -611,183 +677,21 @@ async def _update_metadata( data = await json_dumps(self._metadata.model_dump()) await atomic_write(self.path_to_metadata, data) - async def _try_get_forefront_request(self) -> Request | None: - """Try to get the next available forefront request. - - Returns: - The next forefront request or None if no forefront requests are available. 
- """ - for request_id in list(self._forefront_requests): - # Skip if already in progress - if request_id in self._in_progress: - continue - - request_path = self.path_to_rq / f'{request_id}.json' - - # Skip if file doesn't exist - if not await asyncio.to_thread(request_path.exists): - self._forefront_requests.remove(request_id) - continue - - # Parse the request file - request = await self._parse_request_file(request_path) - - # Skip if parsing failed - if request is None: - self._forefront_requests.remove(request_id) - continue - - # Skip if already handled - if request.handled_at is not None: - self._forefront_requests.remove(request_id) - continue - - # Mark as in-progress in memory - self._in_progress.add(request.id) - - # Remove from forefront list - self._forefront_requests.remove(request.id) - - # Update accessed timestamp - await self._update_metadata(update_accessed_at=True) - - return request - - return None - - async def _categorize_regular_requests(self) -> tuple[dict[int, Path], list[Path]]: - """Categorize regular (non-forefront) requests by sequence number. - - Returns: - A tuple containing: - - Dictionary mapping sequence numbers to request file paths - - List of request file paths without sequence numbers - """ - # List all request files for regular (non-forefront) requests - request_files = await self._get_request_files() - - # Dictionary to store request files by their sequence number - request_sequences = {} - requests_without_sequence = [] - - # Filter out metadata files and in-progress requests - for request_file in request_files: - # Extract request ID from filename - request_id = request_file.stem - - # Skip if already in progress or in forefront - if request_id in self._in_progress or request_id in self._forefront_requests: - continue - - request = await self._parse_request_file(request_file) - - if request is None: - continue - - # Skip if already handled - if request.handled_at is not None: - continue - - sequence_number = None if request.model_extra is None else request.model_extra.get('sequence') - - # If the request has a sequence number, add it to the dictionary - if sequence_number: - request_sequences[sequence_number] = request_file - else: - # If no sequence number, add to the list for ordering by file timestamp - requests_without_sequence.append(request_file) - - return request_sequences, requests_without_sequence - - async def _try_get_sequenced_request(self, request_sequences: dict[int, Path]) -> Request | None: - """Try to get the next request with a sequence number in FIFO order. 
- - Args: - request_sequences: Dictionary mapping sequence numbers to request file paths - - Returns: - The next sequenced request or None if no valid sequenced requests are available - """ - # Process requests with sequence numbers first, in FIFO order - for sequence in sorted(request_sequences.keys()): - request_file = request_sequences[sequence] - - # Parse the request file - request = await self._parse_request_file(request_file) - - # Skip if parsing failed - if request is None: - continue - - # Skip if already handled (double-check) - if request.handled_at is not None: - continue - - # Mark as in-progress in memory - self._in_progress.add(request.id) - - # Update accessed timestamp - await self._update_metadata(update_accessed_at=True) - - return request - - return None - - async def _try_get_non_sequenced_request(self, requests_without_sequence: list[Path]) -> Request | None: - """Try to get the next request without a sequence number, using file timestamps for ordering. + @classmethod + async def _get_request_files(cls, path_to_rq: Path) -> list[Path]: + """Get all request files from the RQ. Args: - requests_without_sequence: List of request file paths without sequence numbers + path_to_rq: The path to the request queue directory. Returns: - The next non-sequenced request or None if no valid non-sequenced requests are available - """ - if not requests_without_sequence: - return None - - # Get file creation times for sorting - request_file_times = {} - for request_file in requests_without_sequence: - try: - file_stat = await asyncio.to_thread(request_file.stat) - request_file_times[request_file] = file_stat.st_mtime - except Exception: # noqa: PERF203 - # If we can't get the time, use 0 (oldest) - request_file_times[request_file] = 0 - - # Sort by creation time - requests_without_sequence.sort(key=lambda f: request_file_times[f]) - - # Process requests without sequence in file timestamp order - for request_file in requests_without_sequence: - request = await self._parse_request_file(request_file) - - if request is None: - continue - - if request.handled_at is not None: - continue - - # Mark as in-progress in memory - self._in_progress.add(request.id) - - # Update accessed timestamp - await self._update_metadata(update_accessed_at=True) - return request - - return None - - async def _get_request_files(self) -> list[Path]: - """Get all request files in the queue. - - Returns: - A list of paths to all request files in the queue. + A list of paths to all request files. """ # Create the requests directory if it doesn't exist. - await asyncio.to_thread(self.path_to_rq.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True) # List all the json files. - files = await asyncio.to_thread(list, self.path_to_rq.glob('*.json')) + files = await asyncio.to_thread(list, path_to_rq.glob('*.json')) # Filter out metadata file and non-file entries. filtered = filter( @@ -797,7 +701,8 @@ async def _get_request_files(self) -> list[Path]: return list(filtered) - async def _parse_request_file(self, file_path: Path) -> Request | None: + @classmethod + async def _parse_request_file(cls, file_path: Path) -> Request | None: """Parse a request file and return the `Request` object. 
Args: @@ -828,3 +733,44 @@ async def _parse_request_file(self, file_path: Path) -> Request | None: except ValidationError as exc: logger.warning(f'Failed to validate request file {file_path}: {exc!s}') return None + + @classmethod + async def _get_sequence_counters(cls, path_to_rq: Path) -> tuple[int, int]: + """Get the current sequence counters for the request queue. + + Args: + path_to_rq: The path to the request queue directory. + + Returns: + A tuple containing the current sequence counter for regular requests and for forefront requests. + """ + max_sequence = -1 + max_forefront_sequence = -1 + + # Get all request files + request_files = await cls._get_request_files(path_to_rq) + + for request_file in request_files: + request = await cls._parse_request_file(request_file) + if request is None: + continue + + # Extract sequence number and forefront flag from model_extra + if request.model_extra: + sequence = request.model_extra.get('__sequence') + is_forefront = request.model_extra.get('__forefront') + + if sequence is None: + logger.warning(f'Request file "{request_file}" does not contain "__sequence" field.') + continue + + if is_forefront is None: + logger.warning(f'Request file "{request_file}" does not contain "__forefront" field.') + continue + + if is_forefront: + max_forefront_sequence = max(max_forefront_sequence, sequence) + else: + max_sequence = max(max_sequence, sequence) + + return max_sequence, max_forefront_sequence diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index a8a9d30adb..5cf59db820 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -214,10 +214,15 @@ async def fetch_next_request(self) -> Request | None: """ # Find the first request that's not handled or in progress for request in self._records: - if request.handled_at is None and request.id not in self._in_progress: - # Mark as in progress - self._in_progress[request.id] = request - return request + if request.was_already_handled: + continue + + if request.id in self._in_progress: + continue + + # Mark as in progress + self._in_progress[request.id] = request + return request return None @@ -259,7 +264,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | return None # Set handled_at timestamp if not already set - if request.handled_at is None: + if not request.was_already_handled: request.handled_at = datetime.now(timezone.utc) # Update the request in records @@ -341,7 +346,7 @@ async def is_empty(self) -> bool: await self._update_metadata(update_accessed_at=True) # Queue is empty if there are no pending requests - pending_requests = [r for r in self._records if r.handled_at is None] + pending_requests = [request for request in self._records if not request.was_already_handled] return len(pending_requests) == 0 async def _update_metadata( diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 200e12e3e6..ef7c91329f 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -179,7 +179,7 @@ async def test_add_requests(rq_client: FileSystemRequestQueueClient) -> None: assert 'url' in content assert content['url'].startswith('https://example.com/') assert 'id' in content - assert 'handled_at' not in content # Not yet handled + assert 
content['handled_at'] is None async def test_add_duplicate_request(rq_client: FileSystemRequestQueueClient) -> None: @@ -336,9 +336,6 @@ async def test_reclaim_request_with_forefront(rq_client: FileSystemRequestQueueC # Reclaim it with forefront priority await rq_client.reclaim_request(first_request, forefront=True) - # Verify it's in the forefront set - assert first_request.id in rq_client._forefront_requests - # It should be returned before the second request reclaimed_request = await rq_client.fetch_next_request() assert reclaimed_request is not None From c967fe5e75d191bf79411bd39ce50d03a262a4d2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 5 Jun 2025 10:37:01 +0200 Subject: [PATCH 07/43] Add caching to RQ FS client --- .../_file_system/_request_queue_client.py | 225 +++++++++++------- .../_memory/_request_queue_client.py | 2 - .../_file_system/test_fs_rq_client.py | 6 +- tests/unit/storages/test_request_queue.py | 3 +- 4 files changed, 148 insertions(+), 88 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 5c01c14d5e..1718cd8db8 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -3,6 +3,7 @@ import asyncio import json import shutil +from collections import deque from datetime import datetime, timezone from logging import getLogger from pathlib import Path @@ -94,10 +95,16 @@ def __init__( """The base directory where the request queue is stored.""" self._sequence_counter = sequence_counter - """A counter to track the order of (normal) requests added to the queue.""" + """A counter to track the order of (normal) requests added to the queue. + + This number is going to be used as a sequence number for next request. + """ self._forefront_sequence_counter = forefront_sequence_counter - """A counter to track the order of forefront requests added to the queue.""" + """A counter to track the order of forefront requests added to the queue. + + This number is going to be used as a sequence number for next forefront request. + """ self._lock = asyncio.Lock() """A lock to ensure that only one operation is performed at a time.""" @@ -105,6 +112,15 @@ def __init__( self._in_progress = set[str]() """A set of request IDs that are currently being processed.""" + self._cache_size = 50 + """Maximum number of requests to keep in cache.""" + + self._request_cache = deque[Request]() + """Cache for requests: forefront requests at the beginning, regular requests at the end.""" + + self._cache_needs_refresh = True + """Flag indicating whether the cache needs to be refreshed from filesystem.""" + @property @override def metadata(self) -> RequestQueueMetadata: @@ -237,11 +253,15 @@ async def open( @override async def drop(self) -> None: - # If the client directory exists, remove it recursively. - if self.path_to_rq.exists(): - async with self._lock: + async with self._lock: + # Remove the RQ dir recursively if it exists. 
+ if self.path_to_rq.exists(): await asyncio.to_thread(shutil.rmtree, self.path_to_rq) + self._in_progress.clear() + self._request_cache.clear() + self._cache_needs_refresh = True + @override async def purge(self) -> None: async with self._lock: @@ -250,13 +270,15 @@ async def purge(self) -> None: for file_path in request_files: await asyncio.to_thread(file_path.unlink) + self._in_progress.clear() + self._request_cache.clear() + self._cache_needs_refresh = True + # Update metadata counts await self._update_metadata( update_modified_at=True, update_accessed_at=True, - new_handled_request_count=0, new_pending_request_count=0, - new_total_request_count=0, ) @override @@ -381,6 +403,10 @@ async def add_batch_of_requests( new_pending_request_count=new_pending_request_count, ) + # Invalidate the cache if we added forefront requests. + if forefront: + self._cache_needs_refresh = True + return AddRequestsResponse( processed_requests=processed_requests, unprocessed_requests=unprocessed_requests, @@ -401,7 +427,9 @@ async def get_request(self, request_id: str) -> Request | None: if request is None: logger.warning(f'Request with ID "{request_id}" not found in the queue.') + return None + self._in_progress.add(request.id) return request @override @@ -417,72 +445,26 @@ async def fetch_next_request(self) -> Request | None: The request or `None` if there are no more pending requests. """ async with self._lock: - request_files = await self._get_request_files(self.path_to_rq) - - requests = list[Request]() - forefront_requests = list[Request]() - - for request_file in request_files: - request = await self._parse_request_file(request_file) - - if request is None: - continue - - if request.was_already_handled: - # If the request was already handled, skip it. - continue - - if request.model_extra is None: - raise ValueError(f'Request file "{request_file}" does not contain "model_extra" data.') - - forefront = request.model_extra.get('__forefront') - - if forefront is None: - raise ValueError(f'Request file "{request_file}" does not contain the __forefront flag.') - - if forefront is True: - forefront_requests.append(request) - else: - requests.append(request) - - # Sort requests by their sequence numbers. - forefront_requests.sort( - key=lambda r: r.model_extra.get('__sequence', 0) if r.model_extra else 0, - reverse=True, - ) - requests.sort( - key=lambda r: r.model_extra.get('__sequence', 0) if r.model_extra else 0, - reverse=False, - ) + # Refresh cache if needed or if it's empty. + if self._cache_needs_refresh or not self._request_cache: + await self._refresh_cache() next_request: Request | None = None - while next_request is None: - if forefront_requests: - next_request = forefront_requests.pop(0) + # Fetch from the front of the deque (forefront requests are at the beginning). + while self._request_cache and next_request is None: + candidate = self._request_cache.popleft() - if next_request.id in self._in_progress: - # If the request is already in progress, skip it. - next_request = None - continue + # Skip requests that are already in progress, however this should not happen. + if candidate.id not in self._in_progress: + next_request = candidate - # Mark the request as in progress - self._in_progress.add(next_request.id) + # If cache is getting low, mark for refresh on next call. 
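# NOTE (explanatory comment, not part of the original patch): refreshing once the cache drops
# below a quarter of its capacity trades occasional extra directory scans for never serving
# fetches from an empty cache; a later commit in this series (PATCH 09) removes this early
# refresh and instead refreshes only when the cache has been fully drained or explicitly
# invalidated.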
+ if len(self._request_cache) < self._cache_size // 4: + self._cache_needs_refresh = True - elif requests: - next_request = requests.pop(0) - - if next_request.id in self._in_progress: - # If the request is already in progress, skip it - next_request = None - continue - - # Mark the request as in progress - self._in_progress.add(next_request.id) - - else: - # No more requests available, break out of the loop - break + if next_request is not None: + self._in_progress.add(next_request.id) return next_request @@ -499,27 +481,29 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | Information about the queue operation. `None` if the given request was not in progress. """ async with self._lock: - # Check if the request is in progress + # Check if the request is in progress. if request.id not in self._in_progress: + logger.warning(f'Marking request {request.id} as handled that is not in progress.') return None - # Remove from in-progress set - self._in_progress.discard(request.id) - - # Update the request object - set handled_at timestamp + # Update the request's handled_at timestamp. if request.handled_at is None: request.handled_at = datetime.now(timezone.utc) - # Write the updated request back to the requests directory + # Dump the updated request to the file. request_path = self._get_request_path(request.id) if not await asyncio.to_thread(request_path.exists): + logger.warning(f'Request file for {request.id} does not exist, cannot mark as handled.') return None request_data = await json_dumps(request.model_dump()) await atomic_write(request_path, request_data) - # Update RQ metadata + # Remove from in-progress. + self._in_progress.discard(request.id) + + # Update RQ metadata. await self._update_metadata( update_modified_at=True, update_accessed_at=True, @@ -558,17 +542,12 @@ async def reclaim_request( logger.info(f'Reclaiming request {request.id} that is not in progress.') return None - self._in_progress.discard(request.id) - request_path = self._get_request_path(request.id) if not await asyncio.to_thread(request_path.exists): + logger.warning(f'Request file for {request.id} does not exist, cannot reclaim.') return None - # Update the request file with the new forefront status and sequence number - request_dict = request.model_dump() - request_dict['__forefront'] = forefront - # Update sequence number to ensure proper ordering. if forefront: sequence_number = self._forefront_sequence_counter @@ -577,12 +556,27 @@ async def reclaim_request( sequence_number = self._sequence_counter self._sequence_counter += 1 + # Dump the updated request to the file. + request_dict = request.model_dump() + request_dict['__forefront'] = forefront request_dict['__sequence'] = sequence_number request_data = await json_dumps(request_dict) await atomic_write(request_path, request_data) - # Update metadata timestamps - await self._update_metadata(update_modified_at=True, update_accessed_at=True) + # Remove from in-progress. + self._in_progress.discard(request.id) + + # Update RQ metadata. + await self._update_metadata( + update_modified_at=True, + update_accessed_at=True, + ) + + # Add the request back to the cache. 
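# NOTE (explanatory comment, not part of the original patch): a reclaimed request goes back to
# the end of the deque that matches its priority, so a forefront reclaim is retried before
# anything else in the cache, while a regular reclaim rejoins the back of the FIFO order.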
+ if forefront: + self._request_cache.appendleft(request) + else: + self._request_cache.append(request) return ProcessedRequest( id=request.id, @@ -677,6 +671,69 @@ async def _update_metadata( data = await json_dumps(self._metadata.model_dump()) await atomic_write(self.path_to_metadata, data) + async def _refresh_cache(self) -> None: + """Refresh the request cache from filesystem. + + This method loads up to _cache_size requests from the filesystem, + prioritizing forefront requests and maintaining proper ordering. + """ + self._request_cache.clear() + + request_files = await self._get_request_files(self.path_to_rq) + + forefront_requests = [] + regular_requests = [] + + for request_file in request_files: + request = await self._parse_request_file(request_file) + + if request is None or request.was_already_handled: + continue + + if request.id in self._in_progress: + continue + + if request.model_extra is None: + logger.warning(f'Request file "{request_file}" does not contain model_extra field.') + continue + + forefront = request.model_extra.get('__forefront') + if forefront is None: + logger.warning(f'Request file "{request_file}" does not contain "__forefront" field.') + continue + + if forefront: + forefront_requests.append(request) + else: + regular_requests.append(request) + + # Sort forefront requests by sequence (newest first for LIFO behavior). + forefront_requests.sort( + key=lambda request: request.model_extra.get('__sequence', 0) if request.model_extra else 0, + reverse=True, + ) + + # Sort regular requests by sequence (oldest first for FIFO behavior). + regular_requests.sort( + key=lambda request: request.model_extra.get('__sequence', 0) if request.model_extra else 0, + reverse=False, + ) + + # Add forefront requests to the beginning of the cache (left side). Since forefront_requests are sorted + # by sequence (newest first), we need to add them in reverse order to maintain correct priority. + for request in reversed(forefront_requests): + if len(self._request_cache) >= self._cache_size: + break + self._request_cache.appendleft(request) + + # Add regular requests to the end of the cache (right side). + for request in regular_requests: + if len(self._request_cache) >= self._cache_size: + break + self._request_cache.append(request) + + self._cache_needs_refresh = False + @classmethod async def _get_request_files(cls, path_to_rq: Path) -> list[Path]: """Get all request files from the RQ. @@ -747,7 +804,7 @@ async def _get_sequence_counters(cls, path_to_rq: Path) -> tuple[int, int]: max_sequence = -1 max_forefront_sequence = -1 - # Get all request files + # Get all request files. request_files = await cls._get_request_files(path_to_rq) for request_file in request_files: @@ -755,7 +812,7 @@ async def _get_sequence_counters(cls, path_to_rq: Path) -> tuple[int, int]: if request is None: continue - # Extract sequence number and forefront flag from model_extra + # Extract sequence number and forefront flag from model_extra. 
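# NOTE (illustrative sketch, not part of the original patch; field names and values are
# indicative only): each request is persisted as its own JSON file, with the ordering metadata
# stored as Pydantic "extra" keys, conceptually:
#
#     {
#         "url": "https://example.com/page",
#         "unique_key": "https://example.com/page",
#         "handled_at": null,
#         "__sequence": 7,
#         "__forefront": false
#     }
#
# Recovering the counters below therefore amounts to taking the maximum "__sequence" seen in
# each group, so numbering continues correctly after the queue is reopened.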
if request.model_extra: sequence = request.model_extra.get('__sequence') is_forefront = request.model_extra.get('__forefront') diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 5cf59db820..dac8e8dcac 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -126,9 +126,7 @@ async def purge(self) -> None: await self._update_metadata( update_modified_at=True, update_accessed_at=True, - new_handled_request_count=0, new_pending_request_count=0, - new_total_request_count=0, ) @override diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index ef7c91329f..5147beadfa 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -109,13 +109,17 @@ async def test_rq_client_purge_on_start(configuration: Configuration) -> None: await rq_client1.add_batch_of_requests([Request.from_url('https://example.com')]) # Verify request was added + assert rq_client1.metadata.pending_request_count == 1 assert rq_client1.metadata.total_request_count == 1 + assert rq_client1.metadata.handled_request_count == 0 # Reopen rq_client2 = await FileSystemStorageClient().open_request_queue_client(configuration=configuration) # Verify data was purged - assert rq_client2.metadata.total_request_count == 0 + assert rq_client2.metadata.pending_request_count == 0 + assert rq_client2.metadata.total_request_count == 1 + assert rq_client2.metadata.handled_request_count == 0 async def test_rq_client_no_purge_on_start(configuration: Configuration) -> None: diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 8c8e227af7..98236c3a49 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -603,6 +603,7 @@ async def test_purge( # Verify requests were added assert rq.metadata.total_request_count == 3 assert rq.metadata.pending_request_count == 3 + assert rq.metadata.handled_request_count == 0 # Record the queue ID queue_id = rq.id @@ -615,7 +616,7 @@ async def test_purge( assert rq.name == 'purge_test_queue' # Same name preserved # Queue should be empty now - assert rq.metadata.total_request_count == 0 + assert rq.metadata.total_request_count == 3 assert rq.metadata.pending_request_count == 0 assert rq.metadata.handled_request_count == 0 assert await rq.is_empty() is True From 7df046f7b530bb01cb9561ef1a81ff63fd6b6350 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 5 Jun 2025 17:51:34 +0200 Subject: [PATCH 08/43] RQ FS performance optimization in add_requests --- .../_file_system/_request_queue_client.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 1718cd8db8..5ac8abfff9 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -303,23 +303,24 @@ async def add_batch_of_requests( processed_requests = list[ProcessedRequest]() unprocessed_requests = list[UnprocessedRequest]() + # Prepare a dictionary to track existing requests by their unique keys. 
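# NOTE (explanatory comment, not part of the original patch): building this unique-key to
# file-path index once per batch replaces the previous per-request rescan of the queue
# directory, so adding N requests to a queue holding M files costs roughly O(N + M) parses
# instead of O(N * M); a stored request is re-read only when its unique key collides with one
# from the incoming batch.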
+ existing_unique_keys: dict[str, Path] = {} + existing_request_files = await self._get_request_files(self.path_to_rq) + + for request_file in existing_request_files: + existing_request = await self._parse_request_file(request_file) + if existing_request is not None: + existing_unique_keys[existing_request.unique_key] = request_file + + # Process each request in the batch. for request in requests: - existing_request_files = await self._get_request_files(self.path_to_rq) + existing_request_file = existing_unique_keys.get(request.unique_key) existing_request = None - # Go through existing requests to find if the request already exists in the queue. - for existing_request_file in existing_request_files: + # Only load the full request from disk if we found a duplicate + if existing_request_file is not None: existing_request = await self._parse_request_file(existing_request_file) - if existing_request is None: - continue - - # If the unique key matches, we found an existing request - if existing_request.unique_key == request.unique_key: - break - - existing_request = None - # If there is no existing request with the same unique key, add the new request. if existing_request is None: request_path = self._get_request_path(request.id) @@ -343,6 +344,9 @@ async def add_batch_of_requests( new_total_request_count += 1 new_pending_request_count += 1 + # Add to our index for subsequent requests in this batch + existing_unique_keys[request.unique_key] = self._get_request_path(request.id) + processed_requests.append( ProcessedRequest( id=request.id, @@ -352,7 +356,7 @@ async def add_batch_of_requests( ) ) - # If the request already exists, we need to update it. + # If the request already exists in the RQ, just update it if needed. else: # Set the processed request flags. was_already_present = existing_request is not None @@ -371,10 +375,10 @@ async def add_batch_of_requests( # If the request is already in the RQ but not handled yet, update it. 
elif was_already_present and not was_already_handled: - request_path = self._get_request_path(request.id) + request_path = self._get_request_path(existing_request.id) request_dict = existing_request.model_dump() request_dict['__forefront'] = forefront - request_data = await json_dumps(existing_request.model_dump()) + request_data = await json_dumps(request_dict) await atomic_write(request_path, request_data) processed_requests.append( From 35555651fe7cb0fa5201fb69447515f43afa5595 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 6 Jun 2025 14:26:13 +0200 Subject: [PATCH 09/43] RQ FS performance issues in fetch_next_request --- .../_file_system/_request_queue_client.py | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 5ac8abfff9..f472195165 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -57,6 +57,9 @@ class FileSystemRequestQueueClient(RequestQueueClient): _STORAGE_SUBSUBDIR_DEFAULT = 'default' """The name of the subdirectory for the default request queue.""" + _MAX_REQUESTS_IN_CACHE = 100_000 + """Maximum number of requests to keep in cache for faster access.""" + def __init__( self, *, @@ -112,9 +115,6 @@ def __init__( self._in_progress = set[str]() """A set of request IDs that are currently being processed.""" - self._cache_size = 50 - """Maximum number of requests to keep in cache.""" - self._request_cache = deque[Request]() """Cache for requests: forefront requests at the beginning, regular requests at the end.""" @@ -463,10 +463,6 @@ async def fetch_next_request(self) -> Request | None: if candidate.id not in self._in_progress: next_request = candidate - # If cache is getting low, mark for refresh on next call. - if len(self._request_cache) < self._cache_size // 4: - self._cache_needs_refresh = True - if next_request is not None: self._in_progress.add(next_request.id) @@ -678,15 +674,15 @@ async def _update_metadata( async def _refresh_cache(self) -> None: """Refresh the request cache from filesystem. - This method loads up to _cache_size requests from the filesystem, + This method loads up to _MAX_REQUESTS_IN_CACHE requests from the filesystem, prioritizing forefront requests and maintaining proper ordering. """ self._request_cache.clear() - request_files = await self._get_request_files(self.path_to_rq) + forefront_requests = list[Request]() + regular_requests = list[Request]() - forefront_requests = [] - regular_requests = [] + request_files = await self._get_request_files(self.path_to_rq) for request_file in request_files: request = await self._parse_request_file(request_file) @@ -726,13 +722,13 @@ async def _refresh_cache(self) -> None: # Add forefront requests to the beginning of the cache (left side). Since forefront_requests are sorted # by sequence (newest first), we need to add them in reverse order to maintain correct priority. for request in reversed(forefront_requests): - if len(self._request_cache) >= self._cache_size: + if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE: break self._request_cache.appendleft(request) # Add regular requests to the end of the cache (right side). 
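# NOTE (explanatory comment, not part of the original patch): fetch_next_request() consumes
# this deque from the left, so forefront requests sit at the front with the most recently
# added one first (a LIFO priority lane), while regular requests follow in ascending sequence
# order, preserving plain FIFO behaviour.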
for request in regular_requests: - if len(self._request_cache) >= self._cache_size: + if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE: break self._request_cache.append(request) From 946d1e2cd6a73848ae14d3e8fe3152dac5d958ec Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 6 Jun 2025 15:15:44 +0200 Subject: [PATCH 10/43] RQ FS fetch performance for is_empty --- .../_file_system/_dataset_client.py | 6 +- .../_file_system/_key_value_store_client.py | 6 +- .../_file_system/_request_queue_client.py | 65 ++++++++++++------- 3 files changed, 44 insertions(+), 33 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 80aec8923b..6650212628 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -160,11 +160,7 @@ async def open( metadata_path = dataset_path / METADATA_FILENAME # If the dataset directory exists, reconstruct the client from the metadata file. - if dataset_path.exists(): - # If metadata file is missing, raise an error. - if not metadata_path.exists(): - raise ValueError(f'Metadata file not found for dataset "{name}"') - + if dataset_path.exists() and metadata_path.exists(): file = await asyncio.to_thread(open, metadata_path) try: file_content = json.load(file) diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index fe99dad08d..58d4e2cd5c 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -154,11 +154,7 @@ async def open( metadata_path = kvs_path / METADATA_FILENAME # If the key-value store directory exists, reconstruct the client from the metadata file. - if kvs_path.exists(): - # If metadata file is missing, raise an error. - if not metadata_path.exists(): - raise ValueError(f'Metadata file not found for key-value store "{name}"') - + if kvs_path.exists() and metadata_path.exists(): file = await asyncio.to_thread(open, metadata_path) try: file_content = json.load(file) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index f472195165..aa246d67b4 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -118,9 +118,12 @@ def __init__( self._request_cache = deque[Request]() """Cache for requests: forefront requests at the beginning, regular requests at the end.""" - self._cache_needs_refresh = True + self._request_cache_needs_refresh = True """Flag indicating whether the cache needs to be refreshed from filesystem.""" + self._is_empty_cache: bool | None = None + """Cache for is_empty result: None means unknown, True/False is cached state.""" + @property @override def metadata(self) -> RequestQueueMetadata: @@ -202,11 +205,7 @@ async def open( metadata_path = rq_path / METADATA_FILENAME # If the RQ directory exists, reconstruct the client from the metadata file. - if rq_path.exists(): - # If metadata file is missing, raise an error. 
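# NOTE (explanatory comment, not part of the original patch): with this change a storage
# directory whose metadata file is missing is no longer an error; it is treated as absent and
# a fresh storage with new metadata is created in its place.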
- if not metadata_path.exists(): - raise ValueError(f'Metadata file not found for request queue "{name}"') - + if rq_path.exists() and metadata_path.exists(): file = await asyncio.to_thread(open, metadata_path) try: file_content = json.load(file) @@ -260,7 +259,10 @@ async def drop(self) -> None: self._in_progress.clear() self._request_cache.clear() - self._cache_needs_refresh = True + self._request_cache_needs_refresh = True + + # Invalidate is_empty cache. + self._is_empty_cache = None @override async def purge(self) -> None: @@ -272,15 +274,17 @@ async def purge(self) -> None: self._in_progress.clear() self._request_cache.clear() - self._cache_needs_refresh = True + self._request_cache_needs_refresh = True - # Update metadata counts await self._update_metadata( update_modified_at=True, update_accessed_at=True, new_pending_request_count=0, ) + # Invalidate is_empty cache. + self._is_empty_cache = None + @override async def add_batch_of_requests( self, @@ -298,6 +302,7 @@ async def add_batch_of_requests( Response containing information about the added requests. """ async with self._lock: + self._is_empty_cache = None new_total_request_count = self._metadata.total_request_count new_pending_request_count = self._metadata.pending_request_count processed_requests = list[ProcessedRequest]() @@ -409,7 +414,10 @@ async def add_batch_of_requests( # Invalidate the cache if we added forefront requests. if forefront: - self._cache_needs_refresh = True + self._request_cache_needs_refresh = True + + # Invalidate is_empty cache. + self._is_empty_cache = None return AddRequestsResponse( processed_requests=processed_requests, @@ -450,7 +458,7 @@ async def fetch_next_request(self) -> Request | None: """ async with self._lock: # Refresh cache if needed or if it's empty. - if self._cache_needs_refresh or not self._request_cache: + if self._request_cache_needs_refresh or not self._request_cache: await self._refresh_cache() next_request: Request | None = None @@ -481,6 +489,8 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | Information about the queue operation. `None` if the given request was not in progress. """ async with self._lock: + self._is_empty_cache = None + # Check if the request is in progress. if request.id not in self._in_progress: logger.warning(f'Marking request {request.id} as handled that is not in progress.') @@ -537,6 +547,8 @@ async def reclaim_request( Information about the queue operation. `None` if the given request was not in progress. """ async with self._lock: + self._is_empty_cache = None + # Check if the request is in progress. if request.id not in self._in_progress: logger.info(f'Reclaiming request {request.id} that is not in progress.') @@ -587,28 +599,35 @@ async def reclaim_request( @override async def is_empty(self) -> bool: - """Check if the queue is empty. - - Returns: - True if the queue is empty, False otherwise. - """ + """Check if the queue is empty, using a cached value if available and valid.""" async with self._lock: + # If we have a cached value, return it immediately. + if self._is_empty_cache is not None: + return self._is_empty_cache + + # If we have a cached requests, check them first (fast path). + if self._request_cache: + for req in self._request_cache: + if req.handled_at is None: + self._is_empty_cache = False + return False + self._is_empty_cache = True + return True + + # Fallback: check files on disk (slow path). 
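# NOTE (explanatory comment, not part of the original patch): is_empty() now resolves in three
# tiers: a memoized boolean that every mutating operation invalidates, then the in-memory
# request cache, and only as a last resort a full scan of the request files on disk, which is
# also the only tier that touches the metadata's accessed_at timestamp.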
await self._update_metadata(update_accessed_at=True) request_files = await self._get_request_files(self.path_to_rq) - # Check each file to see if there are any unhandled requests. for request_file in request_files: request = await self._parse_request_file(request_file) - if request is None: continue - - # If any request is not handled, the queue is not empty. if request.handled_at is None: + self._is_empty_cache = False return False - # If we got here, all requests are handled or there are no requests. - return True + self._is_empty_cache = True + return True def _get_request_path(self, request_id: str) -> Path: """Get the path to a specific request file. @@ -732,7 +751,7 @@ async def _refresh_cache(self) -> None: break self._request_cache.append(request) - self._cache_needs_refresh = False + self._request_cache_needs_refresh = False @classmethod async def _get_request_files(cls, path_to_rq: Path) -> list[Path]: From 9f10b955c6d8c0d82940ad1c6bec0be6d5274565 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 6 Jun 2025 15:37:46 +0200 Subject: [PATCH 11/43] rm code duplication for open methods --- src/crawlee/storages/_dataset.py | 34 ++++-------------- src/crawlee/storages/_key_value_store.py | 35 ++++--------------- src/crawlee/storages/_request_queue.py | 35 ++++--------------- src/crawlee/storages/_utils.py | 44 ++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 83 deletions(-) create mode 100644 src/crawlee/storages/_utils.py diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index a5ac8834a9..dcd57a5c95 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -12,6 +12,7 @@ from ._base import Storage from ._key_value_store import KeyValueStore +from ._utils import open_storage_instance if TYPE_CHECKING: from collections.abc import AsyncIterator @@ -109,39 +110,18 @@ async def open( configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> Dataset: - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - - # Check for default instance if no id or name provided - if id is None and name is None and cls._default_instance is not None: - return cls._default_instance - - # Check if the dataset is already cached - if id is not None and id in cls._cache_by_id: - return cls._cache_by_id[id] - if name is not None and name in cls._cache_by_name: - return cls._cache_by_name[name] - configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client - - client = await storage_client.open_dataset_client( + return await open_storage_instance( + cls, id=id, name=name, configuration=configuration, + cache_by_id=cls._cache_by_id, + cache_by_name=cls._cache_by_name, + default_instance_attr='_default_instance', + client_opener=storage_client.open_dataset_client, ) - dataset = cls(client) - - # Cache the dataset instance by ID and name - cls._cache_by_id[dataset.id] = dataset - if dataset.name is not None: - cls._cache_by_name[dataset.name] = dataset - - # Store as default instance if neither id nor name was provided - if id is None and name is None: - cls._default_instance = dataset - - return dataset @override async def drop(self) -> None: diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 6fc370dcb2..659cd67384 100644 --- a/src/crawlee/storages/_key_value_store.py +++ 
b/src/crawlee/storages/_key_value_store.py @@ -14,6 +14,7 @@ from crawlee.storage_clients.models import KeyValueStoreMetadata from ._base import Storage +from ._utils import open_storage_instance if TYPE_CHECKING: from collections.abc import AsyncIterator @@ -117,41 +118,19 @@ async def open( configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> KeyValueStore: - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - - # Check for default instance if no id or name provided - if id is None and name is None and cls._default_instance is not None: - return cls._default_instance - - # Check if the key-value store is already cached - if id is not None and id in cls._cache_by_id: - return cls._cache_by_id[id] - if name is not None and name in cls._cache_by_name: - return cls._cache_by_name[name] - configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client - - client = await storage_client.open_key_value_store_client( + return await open_storage_instance( + cls, id=id, name=name, configuration=configuration, + cache_by_id=cls._cache_by_id, + cache_by_name=cls._cache_by_name, + default_instance_attr='_default_instance', + client_opener=storage_client.open_key_value_store_client, ) - kvs = cls(client) - - # Cache the key-value store instance by ID and name - cls._cache_by_id[kvs.id] = kvs - if kvs.name is not None: - cls._cache_by_name[kvs.name] = kvs - - # Store as default instance if neither id nor name was provided - if id is None and name is None: - cls._default_instance = kvs - - return kvs - @override async def drop(self) -> None: if self.id in self._cache_by_id: diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 66231cc138..83da36654e 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -13,6 +13,7 @@ from crawlee.request_loaders import RequestManager from ._base import Storage +from ._utils import open_storage_instance if TYPE_CHECKING: from collections.abc import Sequence @@ -127,41 +128,19 @@ async def open( configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> RequestQueue: - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - - # Check for default instance if no id or name provided - if id is None and name is None and cls._default_instance is not None: - return cls._default_instance - - # Check if the request queue is already cached - if id is not None and id in cls._cache_by_id: - return cls._cache_by_id[id] - if name is not None and name in cls._cache_by_name: - return cls._cache_by_name[name] - configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client - - client = await storage_client.open_request_queue_client( + return await open_storage_instance( + cls, id=id, name=name, configuration=configuration, + cache_by_id=cls._cache_by_id, + cache_by_name=cls._cache_by_name, + default_instance_attr='_default_instance', + client_opener=storage_client.open_request_queue_client, ) - request_queue = cls(client) - - # Cache the request queue instance by ID and name - cls._cache_by_id[request_queue.id] = request_queue - if request_queue.name is not None: - 
cls._cache_by_name[request_queue.name] = request_queue - - # Store as default instance if neither id nor name was provided - if id is None and name is None: - cls._default_instance = request_queue - - return request_queue - @override async def drop(self) -> None: # Remove from cache before dropping diff --git a/src/crawlee/storages/_utils.py b/src/crawlee/storages/_utils.py new file mode 100644 index 0000000000..e8c190dfa2 --- /dev/null +++ b/src/crawlee/storages/_utils.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from typing import Any, Callable, TypeVar, cast + +from ._base import Storage + +T = TypeVar('T', bound=Storage) + + +async def open_storage_instance( + cls: type[T], + *, + id: str | None, + name: str | None, + configuration: Any, + cache_by_id: dict[str, T], + cache_by_name: dict[str, T], + default_instance_attr: str, + client_opener: Callable[..., Any], +) -> T: + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + default_instance = getattr(cls, default_instance_attr) + if id is None and name is None and default_instance is not None: + return cast('T', default_instance) + + if id is not None and id in cache_by_id: + return cache_by_id[id] + if name is not None and name in cache_by_name: + return cache_by_name[name] + + client = await client_opener(id=id, name=name, configuration=configuration) + instance = cls(client) # type: ignore[call-arg] + instance_name = getattr(instance, 'name', None) + + cache_by_id[instance.id] = instance + if instance_name is not None: + cache_by_name[instance_name] = instance + + if id is None and name is None: + setattr(cls, default_instance_attr, instance) + + return instance From 0864ff8dd084f291f553d8272307898718932bf2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 9 Jun 2025 17:10:19 +0200 Subject: [PATCH 12/43] Request loaders use async getters for handled/total req cnt --- src/crawlee/request_loaders/_request_list.py | 6 ++---- src/crawlee/request_loaders/_request_loader.py | 10 ++++------ src/crawlee/request_loaders/_request_manager_tandem.py | 10 ++++------ src/crawlee/storages/_request_queue.py | 6 ++---- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/src/crawlee/request_loaders/_request_list.py b/src/crawlee/request_loaders/_request_list.py index 2f88327f65..aaba12f5c4 100644 --- a/src/crawlee/request_loaders/_request_list.py +++ b/src/crawlee/request_loaders/_request_list.py @@ -54,14 +54,12 @@ def __init__( def name(self) -> str | None: return self._name - @property @override - async def handled_count(self) -> int: + async def get_handled_count(self) -> int: return self._handled_count - @property @override - async def total_count(self) -> int: + async def get_total_count(self) -> int: return self._assumed_total_count @override diff --git a/src/crawlee/request_loaders/_request_loader.py b/src/crawlee/request_loaders/_request_loader.py index 0a2e96e02f..e33707b7ae 100644 --- a/src/crawlee/request_loaders/_request_loader.py +++ b/src/crawlee/request_loaders/_request_loader.py @@ -25,15 +25,13 @@ class RequestLoader(ABC): - Managing state information such as the total and handled request counts. 
""" - @property @abstractmethod - async def handled_count(self) -> int: - """The number of requests that have been handled.""" + async def get_handled_count(self) -> int: + """Get the number of requests in the loader that have been handled.""" - @property @abstractmethod - async def total_count(self) -> int: - """The total number of requests in the loader.""" + async def get_total_count(self) -> int: + """Get the total number of requests in the loader (i.e. pending + handled).""" @abstractmethod async def is_empty(self) -> bool: diff --git a/src/crawlee/request_loaders/_request_manager_tandem.py b/src/crawlee/request_loaders/_request_manager_tandem.py index 43f07709ec..6a5fe8aa65 100644 --- a/src/crawlee/request_loaders/_request_manager_tandem.py +++ b/src/crawlee/request_loaders/_request_manager_tandem.py @@ -32,15 +32,13 @@ def __init__(self, request_loader: RequestLoader, request_manager: RequestManage self._read_only_loader = request_loader self._read_write_manager = request_manager - @property @override - async def handled_count(self) -> int: - return await self._read_write_manager.handled_count + async def get_handled_count(self) -> int: + return await self._read_write_manager.get_handled_count() - @property @override - async def total_count(self) -> int: - return (await self._read_only_loader.total_count) + (await self._read_write_manager.total_count) + async def get_total_count(self) -> int: + return (await self._read_only_loader.get_total_count()) + (await self._read_write_manager.get_total_count()) @override async def is_empty(self) -> bool: diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 83da36654e..d08243b650 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -108,14 +108,12 @@ def name(self) -> str | None: def metadata(self) -> RequestQueueMetadata: return self._client.metadata - @property @override - async def handled_count(self) -> int: + async def get_handled_count(self) -> int: return self._client.metadata.handled_request_count - @property @override - async def total_count(self) -> int: + async def get_total_count(self) -> int: return self._client.metadata.total_request_count @override From af0d129aafaaff10934410520950f353784e5cd0 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 9 Jun 2025 17:36:33 +0200 Subject: [PATCH 13/43] Add missing_ok when removing files --- .../storage_clients/_file_system/_key_value_store_client.py | 6 +++--- .../storage_clients/_file_system/_request_queue_client.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 58d4e2cd5c..1e417e36df 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -204,7 +204,7 @@ async def purge(self) -> None: for file_path in self.path_to_kvs.glob('*'): if file_path.name == METADATA_FILENAME: continue - await asyncio.to_thread(file_path.unlink) + await asyncio.to_thread(file_path.unlink, missing_ok=True) await self._update_metadata( update_accessed_at=True, @@ -338,11 +338,11 @@ async def delete_value(self, *, key: str) -> None: async with self._lock: # Delete the value file and its metadata if found if record_path.exists(): - await asyncio.to_thread(record_path.unlink) + await asyncio.to_thread(record_path.unlink, missing_ok=True) # Delete the metadata 
file if it exists if metadata_path.exists(): - await asyncio.to_thread(metadata_path.unlink) + await asyncio.to_thread(metadata_path.unlink, missing_ok=True) else: logger.warning(f'Found value file for key "{key}" but no metadata file when trying to delete it.') diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index aa246d67b4..e69e15a981 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -270,7 +270,7 @@ async def purge(self) -> None: request_files = await self._get_request_files(self.path_to_rq) for file_path in request_files: - await asyncio.to_thread(file_path.unlink) + await asyncio.to_thread(file_path.unlink, missing_ok=True) self._in_progress.clear() self._request_cache.clear() From 9998a58390748564d8795f5fabbb45a002c6994f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 10 Jun 2025 14:08:21 +0200 Subject: [PATCH 14/43] Improve is_empty --- .../storage_clients/_file_system/_request_queue_client.py | 7 ++++++- .../storage_clients/_memory/_request_queue_client.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index e69e15a981..9d9072458f 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -605,6 +605,11 @@ async def is_empty(self) -> bool: if self._is_empty_cache is not None: return self._is_empty_cache + # If there are in-progress requests, return False immediately. + if len(self._in_progress) > 0: + self._is_empty_cache = False + return False + # If we have a cached requests, check them first (fast path). if self._request_cache: for req in self._request_cache: @@ -612,7 +617,7 @@ async def is_empty(self) -> bool: self._is_empty_cache = False return False self._is_empty_cache = True - return True + return len(self._in_progress) == 0 # Fallback: check files on disk (slow path). 
await self._update_metadata(update_accessed_at=True) diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index dac8e8dcac..3302a7aaed 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -343,9 +343,9 @@ async def is_empty(self) -> bool: """ await self._update_metadata(update_accessed_at=True) - # Queue is empty if there are no pending requests + # Queue is empty if there are no pending requests and no requests in progress pending_requests = [request for request in self._records if not request.was_already_handled] - return len(pending_requests) == 0 + return len(pending_requests) == 0 and len(self._in_progress) == 0 async def _update_metadata( self, From fdee1117620200aba73d60ff6b8bab5d0cba065f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 10 Jun 2025 14:49:49 +0200 Subject: [PATCH 15/43] Optimize RQ memory storage client --- .../_file_system/_request_queue_client.py | 65 +---- .../_memory/_dataset_client.py | 4 - .../_memory/_key_value_store_client.py | 4 - .../_memory/_request_queue_client.py | 224 ++++++++---------- uv.lock | 4 +- 5 files changed, 107 insertions(+), 194 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 9d9072458f..7c4e1ca50f 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -292,15 +292,6 @@ async def add_batch_of_requests( *, forefront: bool = False, ) -> AddRequestsResponse: - """Add a batch of requests to the queue. - - Args: - requests: The requests to add. - forefront: Whether to add the requests to the beginning of the queue. - - Returns: - Response containing information about the added requests. - """ async with self._lock: self._is_empty_cache = None new_total_request_count = self._metadata.total_request_count @@ -426,36 +417,20 @@ async def add_batch_of_requests( @override async def get_request(self, request_id: str) -> Request | None: - """Retrieve a request from the queue. - - Args: - request_id: ID of the request to retrieve. - - Returns: - The retrieved request, or None, if it did not exist. - """ - request_path = self._get_request_path(request_id) - request = await self._parse_request_file(request_path) + async with self._lock: + request_path = self._get_request_path(request_id) + request = await self._parse_request_file(request_path) - if request is None: - logger.warning(f'Request with ID "{request_id}" not found in the queue.') - return None + if request is None: + logger.warning(f'Request with ID "{request_id}" not found in the queue.') + return None - self._in_progress.add(request.id) - return request + self._in_progress.add(request.id) + await self._update_metadata(update_accessed_at=True) + return request @override async def fetch_next_request(self) -> Request | None: - """Return the next request in the queue to be processed. - - Once you successfully finish processing of the request, you need to call `RequestQueue.mark_request_as_handled` - to mark the request as handled in the queue. If there was some error in processing the request, call - `RequestQueue.reclaim_request` instead, so that the queue will give the request to some other consumer - in another call to the `fetch_next_request` method. 
- - Returns: - The request or `None` if there are no more pending requests. - """ async with self._lock: # Refresh cache if needed or if it's empty. if self._request_cache_needs_refresh or not self._request_cache: @@ -478,16 +453,6 @@ async def fetch_next_request(self) -> Request | None: @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: - """Mark a request as handled after successful processing. - - Handled requests will never again be returned by the `fetch_next_request` method. - - Args: - request: The request to mark as handled. - - Returns: - Information about the queue operation. `None` if the given request was not in progress. - """ async with self._lock: self._is_empty_cache = None @@ -535,17 +500,6 @@ async def reclaim_request( *, forefront: bool = False, ) -> ProcessedRequest | None: - """Reclaim a failed request back to the queue. - - The request will be returned for processing later again by another call to `fetch_next_request`. - - Args: - request: The request to return to the queue. - forefront: Whether to add the request to the head or the end of the queue. - - Returns: - Information about the queue operation. `None` if the given request was not in progress. - """ async with self._lock: self._is_empty_cache = None @@ -599,7 +553,6 @@ async def reclaim_request( @override async def is_empty(self) -> bool: - """Check if the queue is empty, using a cached value if available and valid.""" async with self._lock: # If we have a cached value, return it immediately. if self._is_empty_cache is not None: diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index e48da40382..3827de7bb4 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -95,10 +95,6 @@ async def drop(self) -> None: @override async def purge(self) -> None: - """Delete all records from the dataset, but keep the dataset itself. - - This method clears all data items from the dataset while preserving the dataset structure. - """ self._records.clear() await self._update_metadata( update_accessed_at=True, diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index d138047549..39e3f326e1 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -86,10 +86,6 @@ async def drop(self) -> None: @override async def purge(self) -> None: - """Delete all stored values from the key-value store, but keep the store itself. - - This method clears all key-value pairs while preserving the store structure. - """ self._records.clear() await self._update_metadata(update_accessed_at=True, update_modified_at=True) diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 3302a7aaed..cdf7a86dd2 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -1,5 +1,7 @@ from __future__ import annotations +from collections import deque +from contextlib import suppress from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING @@ -22,10 +24,9 @@ class MemoryRequestQueueClient(RequestQueueClient): """Memory implementation of the request queue client. 
- This client stores requests in memory using a Python list and dictionary. No data is persisted between - process runs, which means all requests are lost when the program terminates. This implementation - is primarily useful for testing, development, and short-lived crawler runs where persistence - is not required. + No data is persisted between process runs, which means all requests are lost when + the program terminates. This implementation is primarily useful for testing, + development, and short-lived crawler runs where persistence is not required. This client provides fast access to request data but is limited by available memory and does not support data sharing across different processes. @@ -62,11 +63,20 @@ def __init__( total_request_count=total_request_count, ) - # List to hold RQ items - self._records = list[Request]() + self._pending_requests = deque[Request]() + """Pending requests are those that have been added to the queue but not yet fetched for processing.""" - # Dictionary to track in-progress requests (fetched but not yet handled or reclaimed) - self._in_progress = dict[str, Request]() + self._handled_requests = dict[str, Request]() + """Handled requests are those that have been processed and marked as handled.""" + + self._in_progress_requests = dict[str, Request]() + """In-progress requests are those that have been fetched but not yet marked as handled or reclaimed.""" + + self._requests_by_id = dict[str, Request]() + """ID -> Request mapping for fast lookup by request ID.""" + + self._requests_by_unique_key = dict[str, Request]() + """Unique key -> Request mapping for fast lookup by unique key.""" @property @override @@ -101,9 +111,11 @@ async def open( @override async def drop(self) -> None: - # Clear all data - self._records.clear() - self._in_progress.clear() + self._pending_requests.clear() + self._handled_requests.clear() + self._requests_by_id.clear() + self._requests_by_unique_key.clear() + self._in_progress_requests.clear() await self._update_metadata( update_modified_at=True, @@ -115,13 +127,11 @@ async def drop(self) -> None: @override async def purge(self) -> None: - """Delete all requests from the queue, but keep the queue itself. - - This method clears all requests including both pending and handled ones, - but preserves the queue structure. - """ - self._records.clear() - self._in_progress.clear() + self._pending_requests.clear() + self._handled_requests.clear() + self._requests_by_id.clear() + self._requests_by_unique_key.clear() + self._in_progress_requests.clear() await self._update_metadata( update_modified_at=True, @@ -136,28 +146,15 @@ async def add_batch_of_requests( *, forefront: bool = False, ) -> AddRequestsResponse: - """Add a batch of requests to the queue. - - Args: - requests: The requests to add. - forefront: Whether to add the requests to the beginning of the queue. - - Returns: - Response containing information about the added requests. - """ processed_requests = [] for request in requests: - # Ensure the request has an ID - if not request.id: - request.id = crypto_random_object_id() - - # Check if the request is already in the queue by unique_key - existing_request = next((r for r in self._records if r.unique_key == request.unique_key), None) + # Check if the request is already in the queue by unique_key. 
+ existing_request = self._requests_by_unique_key.get(request.unique_key) was_already_present = existing_request is not None was_already_handled = was_already_present and existing_request and existing_request.handled_at is not None - # If the request is already in the queue and handled, don't add it again + # If the request is already in the queue and handled, don't add it again. if was_already_handled: processed_requests.append( ProcessedRequest( @@ -169,23 +166,37 @@ async def add_batch_of_requests( ) continue - # If the request is already in the queue but not handled, update it - if was_already_present: - # Update the existing request with any new data - for idx, rec in enumerate(self._records): - if rec.unique_key == request.unique_key: - self._records[idx] = request - break + # If the request is already in the queue but not handled, update it. + if was_already_present and existing_request: + # Update the existing request with any new data and + # remove old request from pending queue if it's there. + with suppress(ValueError): + self._pending_requests.remove(existing_request) + + # Update indexes. + self._requests_by_id[request.id] = request + self._requests_by_unique_key[request.unique_key] = request + + # Add updated request back to queue. + if forefront: + self._pending_requests.appendleft(request) + else: + self._pending_requests.append(request) + # Add the new request to the queue. else: - # Add the new request to the queue if forefront: - self._records.insert(0, request) + self._pending_requests.appendleft(request) else: - self._records.append(request) + self._pending_requests.append(request) - # Update metadata counts - self._metadata.total_request_count += 1 - self._metadata.pending_request_count += 1 + # Update indexes. + self._requests_by_id[request.id] = request + self._requests_by_unique_key[request.unique_key] = request + + await self._update_metadata( + new_total_request_count=self._metadata.total_request_count + 1, + new_pending_request_count=self._metadata.pending_request_count + 1, + ) processed_requests.append( ProcessedRequest( @@ -205,81 +216,55 @@ async def add_batch_of_requests( @override async def fetch_next_request(self) -> Request | None: - """Return the next request in the queue to be processed. + while self._pending_requests: + request = self._pending_requests.popleft() - Returns: - The request or `None` if there are no more pending requests. - """ - # Find the first request that's not handled or in progress - for request in self._records: + # Skip if already handled (shouldn't happen, but safety check). if request.was_already_handled: continue - if request.id in self._in_progress: - continue + # Skip if already in progress (shouldn't happen, but safety check). + if request.id in self._in_progress_requests: + self._pending_requests.appendleft(request) + break - # Mark as in progress - self._in_progress[request.id] = request + # Mark as in progress. + self._in_progress_requests[request.id] = request return request return None @override async def get_request(self, request_id: str) -> Request | None: - """Retrieve a request from the queue. - - Args: - request_id: ID of the request to retrieve. - - Returns: - The retrieved request, or None, if it did not exist. 
- """ - # Check in-progress requests first - if request_id in self._in_progress: - return self._in_progress[request_id] - - # Otherwise search in the records - for request in self._records: - if request.id == request_id: - return request - - return None + await self._update_metadata(update_accessed_at=True) + return self._requests_by_id.get(request_id) @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: - """Mark a request as handled after successful processing. - - Handled requests will never again be returned by the `fetch_next_request` method. - - Args: - request: The request to mark as handled. - - Returns: - Information about the queue operation. `None` if the given request was not in progress. - """ - # Check if the request is in progress - if request.id not in self._in_progress: + # Check if the request is in progress. + if request.id not in self._in_progress_requests: return None - # Set handled_at timestamp if not already set + # Set handled_at timestamp if not already set. if not request.was_already_handled: request.handled_at = datetime.now(timezone.utc) - # Update the request in records - for idx, rec in enumerate(self._records): - if rec.id == request.id: - self._records[idx] = request - break + # Move request to handled storage. + self._handled_requests[request.id] = request - # Remove from in-progress - del self._in_progress[request.id] + # Update indexes (keep the request in indexes for get_request to work). + self._requests_by_id[request.id] = request + self._requests_by_unique_key[request.unique_key] = request - # Update metadata counts - self._metadata.handled_request_count += 1 - self._metadata.pending_request_count -= 1 + # Remove from in-progress. + del self._in_progress_requests[request.id] - # Update metadata timestamps - await self._update_metadata(update_modified_at=True) + # Update metadata. + await self._update_metadata( + new_handled_request_count=self._metadata.handled_request_count + 1, + new_pending_request_count=self._metadata.pending_request_count - 1, + update_modified_at=True, + ) return ProcessedRequest( id=request.id, @@ -295,36 +280,20 @@ async def reclaim_request( *, forefront: bool = False, ) -> ProcessedRequest | None: - """Reclaim a failed request back to the queue. - - The request will be returned for processing later again by another call to `fetch_next_request`. - - Args: - request: The request to return to the queue. - forefront: Whether to add the request to the head or the end of the queue. - - Returns: - Information about the queue operation. `None` if the given request was not in progress. - """ - # Check if the request is in progress - if request.id not in self._in_progress: + # Check if the request is in progress. + if request.id not in self._in_progress_requests: return None - # Remove from in-progress - del self._in_progress[request.id] + # Remove from in-progress. + del self._in_progress_requests[request.id] - # If forefront is true, move the request to the beginning of the queue + # Add request back to pending queue. if forefront: - # First remove the request from its current position - for idx, rec in enumerate(self._records): - if rec.id == request.id: - self._records.pop(idx) - break - - # Then insert it at the beginning - self._records.insert(0, request) + self._pending_requests.appendleft(request) + else: + self._pending_requests.append(request) - # Update metadata timestamps + # Update metadata timestamps. 
await self._update_metadata(update_modified_at=True) return ProcessedRequest( @@ -343,9 +312,8 @@ async def is_empty(self) -> bool: """ await self._update_metadata(update_accessed_at=True) - # Queue is empty if there are no pending requests and no requests in progress - pending_requests = [request for request in self._records if not request.was_already_handled] - return len(pending_requests) == 0 and len(self._in_progress) == 0 + # Queue is empty if there are no pending requests and no requests in progress. + return len(self._pending_requests) == 0 and len(self._in_progress_requests) == 0 async def _update_metadata( self, diff --git a/uv.lock b/uv.lock index 7a57e06cc9..a68c7037d1 100644 --- a/uv.lock +++ b/uv.lock @@ -763,7 +763,7 @@ provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impe dev = [ { name = "apify-client" }, { name = "build", specifier = "~=1.2.2" }, - { name = "dycw-pytest-only", specifier = ">=2.1.1" }, + { name = "dycw-pytest-only", specifier = "~=2.1.0" }, { name = "mypy", specifier = "~=1.16.0" }, { name = "pre-commit", specifier = "~=4.2.0" }, { name = "proxy-py", specifier = "~=2.4.0" }, @@ -771,7 +771,7 @@ dev = [ { name = "pytest", specifier = "~=8.4.0" }, { name = "pytest-asyncio", specifier = "~=1.0.0" }, { name = "pytest-cov", specifier = "~=6.1.0" }, - { name = "pytest-timeout", specifier = ">=2.4.0" }, + { name = "pytest-timeout", specifier = "~=2.4.0" }, { name = "pytest-xdist", specifier = "~=3.7.0" }, { name = "ruff", specifier = "~=0.11.0" }, { name = "setuptools" }, From 79cdfc0bab0ce27415cee90becf8502ce10d7b04 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 11 Jun 2025 13:59:49 +0200 Subject: [PATCH 16/43] Add upgrading guide and skip problematic test --- docs/upgrading/upgrading_to_v1.md | 122 ++++++++++++++++++ .../crawlers/_basic/test_basic_crawler.py | 3 +- 2 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 docs/upgrading/upgrading_to_v1.md diff --git a/docs/upgrading/upgrading_to_v1.md b/docs/upgrading/upgrading_to_v1.md new file mode 100644 index 0000000000..7a2b65601c --- /dev/null +++ b/docs/upgrading/upgrading_to_v1.md @@ -0,0 +1,122 @@ +--- +id: upgrading-to-v1 +title: Upgrading to v1 +--- + +This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0. + +## Storage clients + +In v1.0, we are introducing a new storage clients system. We have completely reworked their interface, +making it much simpler to write your own storage clients. This allows you to easily store your request queues, +key-value stores, and datasets in various destinations. + +### New storage clients + +Previously, the `MemoryStorageClient` handled both in-memory storage and file system persistence, depending +on configuration. In v1.0, we've split this into two dedicated classes: + +- `MemoryStorageClient` - stores all data in memory only. +- `FileSystemStorageClient` - persists data on the file system, with in-memory caching for improved performance. + For details about the new interface, see the `BaseStorageClient` documentation. You can also check out + the [Storage clients guide](https://crawlee.dev/python/docs/guides/TODO) for more information on available + storage clients and instructions on writing your own. 
+ +### Memory storage client + +Before: + +```python +from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient + +configuration = Configuration(persist_storage=False) +storage_client = MemoryStorageClient.from_config(configuration) +``` + +Now: + +```python +from crawlee.storage_clients import MemoryStorageClient + +storage_client = MemoryStorageClient() +``` + +### File-system storage client + +Before: + +```python +from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient + +configuration = Configuration(persist_storage=True) +storage_client = MemoryStorageClient.from_config(configuration) +``` + +Now: + +```python +from crawlee.storage_clients import FileSystemStorageClient + +storage_client = FileSystemStorageClient() +``` + +The way you register storage clients remains the same: + +```python +from crawlee import service_locator +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient + +storage_client = MemoryStorageClient() + +# Either via the service locator: +service_locator.set_storage_client(storage_client) + +# Or provide it directly to the crawler: +crawler = ParselCrawler(storage_client=storage_client) +``` + +### Breaking changes + +The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class. +Persistence is now determined solely by the storage client class you use. + +### Writing custom storage clients + +The storage client interface has been fully reworked. Collection storage clients have been removed - now there is +one storage client class per storage type (`RequestQueue`, `KeyValueStore`, and `Dataset`). Writing your own storage +clients is now much simpler, allowing you to store your request queues, key-value stores, and datasets in any +destination you choose. + +## Dataset + +- There are two new methods: + - `purge` + - `list_items` +- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. +- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. +- The `set_metadata` method has been removed. +- The `write_to_json` and `write_to_csv` methods have been removed - use `export_to` instead. + +## Key-value store + +- There are three new methods: + - `purge` + - `delete_value` + - `list_keys` +- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. +- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. +- The `set_metadata` method has been removed. + +## Request queue + +- There are two new methods: + - `purge` + - `add_requests` (renamed from `add_requests_batched`) +- The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. +- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. +- The `set_metadata` method has been removed. +- `resource_directory` from `RequestQueueMetadata` removed – use `path_to_...` property. +- `RequestQueueHead` model replaced with `RequestQueueHeadWithLocks`. 
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index d90efba086..c068d49063 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1110,7 +1110,8 @@ async def handler(context: BasicCrawlingContext) -> None: assert (await store.get_value(BasicCrawler._CRAWLEE_STATE_KEY))['counter'] == 2 -@pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.timeout was introduced in Python 3.11.') +# @pytest.mark.skipif(sys.version_info[:3] < (3, 11), reason='asyncio.timeout was introduced in Python 3.11.') +@pytest.mark.skip @pytest.mark.parametrize( 'sleep_type', [ From e818585f5950648f9c7b95f8da00818e0a94567e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Wed, 11 Jun 2025 15:19:12 +0200 Subject: [PATCH 17/43] chore: update `docusaurus-plugin-typedoc-api`, fix failing docs build --- website/package.json | 2 +- website/yarn.lock | 51 ++++++++++++++++---------------------------- 2 files changed, 19 insertions(+), 34 deletions(-) diff --git a/website/package.json b/website/package.json index 6533d38ef7..5aaab5f9aa 100644 --- a/website/package.json +++ b/website/package.json @@ -34,7 +34,7 @@ "typescript": "5.8.3" }, "dependencies": { - "@apify/docusaurus-plugin-typedoc-api": "^4.4.2", + "@apify/docusaurus-plugin-typedoc-api": "^4.4.5", "@apify/utilities": "^2.8.0", "@docusaurus/core": "^3.8.0", "@docusaurus/faster": "^3.8.0", diff --git a/website/yarn.lock b/website/yarn.lock index 6cf2deb23e..d906b5f6d6 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -231,14 +231,10 @@ __metadata: languageName: node linkType: hard -"@apify/docusaurus-plugin-typedoc-api@npm:^4.4.2": - version: 4.4.3 - resolution: "@apify/docusaurus-plugin-typedoc-api@npm:4.4.3" - dependencies: - "@docusaurus/plugin-content-docs": "npm:^3.5.2" - "@docusaurus/types": "npm:^3.5.2" - "@docusaurus/utils": "npm:^3.5.2" - "@types/react": "npm:^18.3.11" +"@apify/docusaurus-plugin-typedoc-api@npm:^4.4.5": + version: 4.4.5 + resolution: "@apify/docusaurus-plugin-typedoc-api@npm:4.4.5" + dependencies: "@vscode/codicons": "npm:^0.0.35" html-entities: "npm:2.3.2" marked: "npm:^9.1.6" @@ -246,11 +242,17 @@ __metadata: typedoc: "npm:^0.26.11" zx: "npm:^8.1.4" peerDependencies: - "@docusaurus/core": ^3.5.2 - "@docusaurus/mdx-loader": ^3.5.2 - react: ">=18.0.0" + "@docusaurus/core": ^3.8.1 + "@docusaurus/mdx-loader": ^3.8.1 + "@docusaurus/plugin-content-docs": ^3.8.1 + "@docusaurus/preset-classic": ^3.8.1 + "@docusaurus/types": ^3.8.1 + "@docusaurus/utils": ^3.8.1 + "@types/react": ^18.3.11 || >=19.0.0 + react: ">=18.0.0 || >=19.0.0" + react-dom: ^18.2.0 || >=19.0.0 typescript: ^5.0.0 - checksum: 10c0/df92ab55554def51f37725c6a3b6401d8e09b25ac06feedb2d0ea5e2a0fea6279f7c41c01d398f9826eb91a4f83bb3b36ed7af4c2a92d49d8b08dc31079b2665 + checksum: 10c0/f0a17fdc96a4f69c711e6a40ada229f707379de35a06d4fa6dddd00fa9b8714e8a4ae0046eafc919fc4188d13a2d7acb7b364d14dc88f75ba03d483def90d9bd languageName: node linkType: hard @@ -2425,7 +2427,7 @@ __metadata: languageName: node linkType: hard -"@docusaurus/plugin-content-docs@npm:3.8.0, @docusaurus/plugin-content-docs@npm:^3.5.2": +"@docusaurus/plugin-content-docs@npm:3.8.0": version: 3.8.0 resolution: "@docusaurus/plugin-content-docs@npm:3.8.0" dependencies: @@ -2729,7 +2731,7 @@ __metadata: languageName: node linkType: hard -"@docusaurus/types@npm:3.8.0, @docusaurus/types@npm:^3.5.2": +"@docusaurus/types@npm:3.8.0": version: 3.8.0 resolution: 
"@docusaurus/types@npm:3.8.0" dependencies: @@ -2775,7 +2777,7 @@ __metadata: languageName: node linkType: hard -"@docusaurus/utils@npm:3.8.0, @docusaurus/utils@npm:^3.5.2": +"@docusaurus/utils@npm:3.8.0": version: 3.8.0 resolution: "@docusaurus/utils@npm:3.8.0" dependencies: @@ -4527,13 +4529,6 @@ __metadata: languageName: node linkType: hard -"@types/prop-types@npm:*": - version: 15.7.14 - resolution: "@types/prop-types@npm:15.7.14" - checksum: 10c0/1ec775160bfab90b67a782d735952158c7e702ca4502968aa82565bd8e452c2de8601c8dfe349733073c31179116cf7340710160d3836aa8a1ef76d1532893b1 - languageName: node - linkType: hard - "@types/qs@npm:*": version: 6.14.0 resolution: "@types/qs@npm:6.14.0" @@ -4589,16 +4584,6 @@ __metadata: languageName: node linkType: hard -"@types/react@npm:^18.3.11": - version: 18.3.23 - resolution: "@types/react@npm:18.3.23" - dependencies: - "@types/prop-types": "npm:*" - csstype: "npm:^3.0.2" - checksum: 10c0/49331800b76572eb2992a5c44801dbf8c612a5f99c8f4e4200f06c7de6f3a6e9455c661784a6c5469df96fa45622cb4a9d0982c44e6a0d5719be5f2ef1f545ed - languageName: node - linkType: hard - "@types/retry@npm:0.12.0": version: 0.12.0 resolution: "@types/retry@npm:0.12.0" @@ -6630,7 +6615,7 @@ __metadata: version: 0.0.0-use.local resolution: "crawlee@workspace:." dependencies: - "@apify/docusaurus-plugin-typedoc-api": "npm:^4.4.2" + "@apify/docusaurus-plugin-typedoc-api": "npm:^4.4.5" "@apify/eslint-config-ts": "npm:^0.4.0" "@apify/tsconfig": "npm:^0.1.0" "@apify/utilities": "npm:^2.8.0" From 65db9ac4097b9d55f94a13a4a6e9190839f1fe00 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 11 Jun 2025 15:31:12 +0200 Subject: [PATCH 18/43] fix docs --- docs/upgrading/upgrading_to_v1.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/upgrading/upgrading_to_v1.md b/docs/upgrading/upgrading_to_v1.md index 7a2b65601c..00df2586a5 100644 --- a/docs/upgrading/upgrading_to_v1.md +++ b/docs/upgrading/upgrading_to_v1.md @@ -18,9 +18,10 @@ on configuration. In v1.0, we've split this into two dedicated classes: - `MemoryStorageClient` - stores all data in memory only. - `FileSystemStorageClient` - persists data on the file system, with in-memory caching for improved performance. - For details about the new interface, see the `BaseStorageClient` documentation. You can also check out - the [Storage clients guide](https://crawlee.dev/python/docs/guides/TODO) for more information on available - storage clients and instructions on writing your own. + +For details about the new interface, see the `BaseStorageClient` documentation. You can also check out +the [Storage clients guide](https://crawlee.dev/python/docs/guides/) for more information on available +storage clients and instructions on writing your own. ### Memory storage client @@ -93,8 +94,8 @@ destination you choose. ## Dataset - There are two new methods: - - `purge` - - `list_items` + - `purge` + - `list_items` - The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. - The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. - The `set_metadata` method has been removed. @@ -103,9 +104,9 @@ destination you choose. ## Key-value store - There are three new methods: - - `purge` - - `delete_value` - - `list_keys` + - `purge` + - `delete_value` + - `list_keys` - The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. 
- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. - The `set_metadata` method has been removed. @@ -113,8 +114,8 @@ destination you choose. ## Request queue - There are two new methods: - - `purge` - - `add_requests` (renamed from `add_requests_batched`) + - `purge` + - `add_requests` (renamed from `add_requests_batched`) - The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. - The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. - The `set_metadata` method has been removed. From 2b786f741428dcc67019d27adb8773afb2ac557a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 12 Jun 2025 11:29:17 +0200 Subject: [PATCH 19/43] add retries to atomic write --- src/crawlee/_utils/file.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index ab7288dc3f..da6ef8f7e4 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -60,6 +60,7 @@ async def atomic_write( data: str, *, is_binary: bool = False, + retry_count: int = 0, ) -> None: ... @@ -69,6 +70,7 @@ async def atomic_write( data: bytes, *, is_binary: bool = True, + retry_count: int = 0, ) -> None: ... @@ -77,6 +79,7 @@ async def atomic_write( data: str | bytes, *, is_binary: bool = False, + retry_count: int = 0, ) -> None: """Write data to a file atomically to prevent data corruption or partial writes. @@ -84,18 +87,17 @@ async def atomic_write( a temporary file and then atomically replacing the target file, which prevents data corruption if the process is interrupted during the write operation. - For example, if a process (crawler) is interrupted while writing a file, the file may end up in an - incomplete or corrupted state. This might be especially unwanted for metadata files. - Args: path: The path to the destination file. data: The data to write to the file (string or bytes). is_binary: If True, write in binary mode. If False (default), write in text mode. + retry_count: Internal parameter to track the number of retry attempts (default: 0). """ + max_retries = 3 dir_path = path.parent def _sync_write() -> str: - # create a temp file in the target dir, return its name + # Create a tmp file in the target dir, return its name. fd, tmp_path = tempfile.mkstemp( suffix=f'{path.suffix}.tmp', prefix=f'{path.name}.', @@ -108,21 +110,26 @@ def _sync_write() -> str: else: with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: tmp_file.write(data) # type: ignore[arg-type] - except Exception: # broader exception handling + except Exception: Path(tmp_path).unlink(missing_ok=True) raise return tmp_path - tmp_path = await asyncio.to_thread(_sync_write) - try: + tmp_path = await asyncio.to_thread(_sync_write) await asyncio.to_thread(os.replace, tmp_path, str(path)) except (FileNotFoundError, PermissionError): - # fallback if tmp went missing - if is_binary: - await asyncio.to_thread(path.write_bytes, data) # type: ignore[arg-type] - else: - await asyncio.to_thread(path.write_text, data, encoding='utf-8') # type: ignore[arg-type] + if retry_count < max_retries: + await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) + return await atomic_write( + path, + data, + is_binary=is_binary, + retry_count=retry_count + 1, + ) + # If we reach the maximum number of retries, raise the exception. 
+ raise + finally: await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) From 2cb04c5661b8b3b7adcc87683dac50db8fde5ba5 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 12 Jun 2025 11:01:46 +0200 Subject: [PATCH 20/43] chore(deps): update dependency pytest-cov to ~=6.2.0 (#1244) --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 394d7ec218..49ab8221a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,7 +102,7 @@ dev = [ "proxy-py~=2.4.0", "pydoc-markdown~=4.8.0", "pytest-asyncio~=1.0.0", - "pytest-cov~=6.1.0", + "pytest-cov~=6.2.0", "pytest-timeout~=2.4.0", "pytest-xdist~=3.7.0", "pytest~=8.4.0", diff --git a/uv.lock b/uv.lock index a68c7037d1..f2d8040fe2 100644 --- a/uv.lock +++ b/uv.lock @@ -770,7 +770,7 @@ dev = [ { name = "pydoc-markdown", specifier = "~=4.8.0" }, { name = "pytest", specifier = "~=8.4.0" }, { name = "pytest-asyncio", specifier = "~=1.0.0" }, - { name = "pytest-cov", specifier = "~=6.1.0" }, + { name = "pytest-cov", specifier = "~=6.2.0" }, { name = "pytest-timeout", specifier = "~=2.4.0" }, { name = "pytest-xdist", specifier = "~=3.7.0" }, { name = "ruff", specifier = "~=0.11.0" }, @@ -2260,15 +2260,15 @@ wheels = [ [[package]] name = "pytest-cov" -version = "6.1.1" +version = "6.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "coverage", extra = ["toml"] }, { name = "pytest" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/25/69/5f1e57f6c5a39f81411b550027bf72842c4567ff5fd572bed1edc9e4b5d9/pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a", size = 66857, upload-time = "2025-04-05T14:07:51.592Z" } +sdist = { url = "https://files.pythonhosted.org/packages/88/17/139b134cb36e496a62780b2ff19ea47fd834f2d180a32e6dd9210f4a8a77/pytest_cov-6.2.0.tar.gz", hash = "sha256:9a4331e087a0f5074dc1e19fe0485a07a462b346cbb91e2ac903ec5504abce10", size = 68872, upload-time = "2025-06-11T21:55:02.68Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/28/d0/def53b4a790cfb21483016430ed828f64830dd981ebe1089971cd10cab25/pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde", size = 23841, upload-time = "2025-04-05T14:07:49.641Z" }, + { url = "https://files.pythonhosted.org/packages/aa/66/a38138fbf711b2b93592dfd7303bba561f6bc05f85361a0388c105ceb727/pytest_cov-6.2.0-py3-none-any.whl", hash = "sha256:bd19301caf600ead1169db089ed0ad7b8f2b962214330a696b8c85a0b497b2ff", size = 24448, upload-time = "2025-06-11T21:55:00.938Z" }, ] [[package]] From 0c8c4ec1fb61a2403b28e01017713776ebbb7af5 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 12 Jun 2025 11:57:46 +0200 Subject: [PATCH 21/43] Fix atomic write on Windows --- src/crawlee/_utils/file.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index da6ef8f7e4..f476f3bcd1 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -4,6 +4,7 @@ import csv import json import os +import sys import tempfile from pathlib import Path from typing import TYPE_CHECKING, overload @@ -95,14 +96,21 @@ async def atomic_write( """ max_retries = 3 dir_path = path.parent + tmp_path: str | None = None - def _sync_write() -> str: - # Create a tmp file in the target dir, return its name. 
+ def _write_windows() -> None: + if is_binary: + path.write_bytes(data) # type: ignore[arg-type] + else: + path.write_text(data, encoding='utf-8') # type: ignore[arg-type] + + def _write_linux() -> str: fd, tmp_path = tempfile.mkstemp( suffix=f'{path.suffix}.tmp', prefix=f'{path.name}.', dir=str(dir_path), ) + try: if is_binary: with os.fdopen(fd, 'wb') as tmp_file: @@ -116,11 +124,17 @@ def _sync_write() -> str: return tmp_path try: - tmp_path = await asyncio.to_thread(_sync_write) - await asyncio.to_thread(os.replace, tmp_path, str(path)) + # We have to differentiate between Windows and Linux due to the permissions errors + # in Windows when working with temporary files. + if sys.platform == 'win32': + await asyncio.to_thread(_write_windows) + else: + tmp_path = await asyncio.to_thread(_write_linux) + await asyncio.to_thread(os.replace, tmp_path, str(path)) except (FileNotFoundError, PermissionError): if retry_count < max_retries: - await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) + if tmp_path is not None: + await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) return await atomic_write( path, data, @@ -131,7 +145,8 @@ def _sync_write() -> str: raise finally: - await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) + if tmp_path is not None: + await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) async def export_json_to_stream( From ce1eeb14df5a3753e991d9ebaa83ca98b62beb94 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sat, 14 Jun 2025 18:24:22 +0200 Subject: [PATCH 22/43] resolve write function during import time --- src/crawlee/_utils/file.py | 83 +++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index f476f3bcd1..c580f09e3f 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -17,6 +17,52 @@ from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs +if sys.platform == 'win32': + + def _write_file( + path: Path, + data: str | bytes, + *, + is_binary: bool, + ) -> str | None: + """Windows-specific file write implementation. + + This implementation writes directly to the file without using a temporary file, because + they are problematic due to permissions issues on Windows. + """ + if is_binary: + path.write_bytes(data) # type: ignore[arg-type] + else: + path.write_text(data, encoding='utf-8') # type: ignore[arg-type] + return None +else: + + def _write_file( + path: Path, + data: str | bytes, + *, + is_binary: bool, + ) -> str | None: + """Linux/Unix-specific file write implementation using temporary files.""" + dir_path = path.parent + fd, tmp_path = tempfile.mkstemp( + suffix=f'{path.suffix}.tmp', + prefix=f'{path.name}.', + dir=str(dir_path), + ) + + try: + if is_binary: + with os.fdopen(fd, 'wb') as tmp_file: + tmp_file.write(data) # type: ignore[arg-type] + else: + with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: + tmp_file.write(data) # type: ignore[arg-type] + except Exception: + Path(tmp_path).unlink(missing_ok=True) + raise + return tmp_path + def infer_mime_type(value: Any) -> str: """Infer the MIME content type from the value. @@ -95,41 +141,14 @@ async def atomic_write( retry_count: Internal parameter to track the number of retry attempts (default: 0). 
""" max_retries = 3 - dir_path = path.parent tmp_path: str | None = None - def _write_windows() -> None: - if is_binary: - path.write_bytes(data) # type: ignore[arg-type] - else: - path.write_text(data, encoding='utf-8') # type: ignore[arg-type] - - def _write_linux() -> str: - fd, tmp_path = tempfile.mkstemp( - suffix=f'{path.suffix}.tmp', - prefix=f'{path.name}.', - dir=str(dir_path), - ) - - try: - if is_binary: - with os.fdopen(fd, 'wb') as tmp_file: - tmp_file.write(data) # type: ignore[arg-type] - else: - with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: - tmp_file.write(data) # type: ignore[arg-type] - except Exception: - Path(tmp_path).unlink(missing_ok=True) - raise - return tmp_path - try: - # We have to differentiate between Windows and Linux due to the permissions errors - # in Windows when working with temporary files. - if sys.platform == 'win32': - await asyncio.to_thread(_write_windows) - else: - tmp_path = await asyncio.to_thread(_write_linux) + # Use the platform-specific write function resolved at import time. + tmp_path = await asyncio.to_thread(_write_file, path, data, is_binary=is_binary) + + # On Linux/Unix, replace the destination file with tmp file. + if tmp_path is not None: await asyncio.to_thread(os.replace, tmp_path, str(path)) except (FileNotFoundError, PermissionError): if retry_count < max_retries: From 8c805139d3e15537215104780d1b9172d25337ef Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 16 Jun 2025 14:20:45 +0200 Subject: [PATCH 23/43] Update file utils --- src/crawlee/_utils/file.py | 61 +++++++------------ .../_file_system/_key_value_store_client.py | 2 +- 2 files changed, 22 insertions(+), 41 deletions(-) diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index c580f09e3f..4199cc27f9 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -19,30 +19,21 @@ if sys.platform == 'win32': - def _write_file( - path: Path, - data: str | bytes, - *, - is_binary: bool, - ) -> str | None: + def _write_file(path: Path, data: str | bytes) -> None: """Windows-specific file write implementation. This implementation writes directly to the file without using a temporary file, because they are problematic due to permissions issues on Windows. """ - if is_binary: - path.write_bytes(data) # type: ignore[arg-type] + if isinstance(data, bytes): + path.write_bytes(data) + elif isinstance(data, str): + path.write_text(data, encoding='utf-8') else: - path.write_text(data, encoding='utf-8') # type: ignore[arg-type] - return None + raise TypeError(f'Unsupported data type: {type(data)}. Expected str or bytes.') else: - def _write_file( - path: Path, - data: str | bytes, - *, - is_binary: bool, - ) -> str | None: + def _write_file(path: Path, data: str | bytes) -> None: """Linux/Unix-specific file write implementation using temporary files.""" dir_path = path.parent fd, tmp_path = tempfile.mkstemp( @@ -51,17 +42,22 @@ def _write_file( dir=str(dir_path), ) + if not isinstance(data, (str, bytes)): + raise TypeError(f'Unsupported data type: {type(data)}. 
Expected str or bytes.') + try: - if is_binary: + if isinstance(data, bytes): with os.fdopen(fd, 'wb') as tmp_file: - tmp_file.write(data) # type: ignore[arg-type] + tmp_file.write(data) else: with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file: - tmp_file.write(data) # type: ignore[arg-type] + tmp_file.write(data) + + # Atomically replace the destination file with the temporary file + Path(tmp_path).replace(path) except Exception: Path(tmp_path).unlink(missing_ok=True) raise - return tmp_path def infer_mime_type(value: Any) -> str: @@ -106,7 +102,6 @@ async def atomic_write( path: Path, data: str, *, - is_binary: bool = False, retry_count: int = 0, ) -> None: ... @@ -116,7 +111,6 @@ async def atomic_write( path: Path, data: bytes, *, - is_binary: bool = True, retry_count: int = 0, ) -> None: ... @@ -125,48 +119,35 @@ async def atomic_write( path: Path, data: str | bytes, *, - is_binary: bool = False, retry_count: int = 0, ) -> None: """Write data to a file atomically to prevent data corruption or partial writes. - This function handles both text and binary data. It ensures atomic writing by creating - a temporary file and then atomically replacing the target file, which prevents data - corruption if the process is interrupted during the write operation. + This function handles both text and binary data. The binary mode is automatically + detected based on the data type (bytes = binary, str = text). It ensures atomic + writing by creating a temporary file and then atomically replacing the target file, + which prevents data corruption if the process is interrupted during the write operation. Args: path: The path to the destination file. data: The data to write to the file (string or bytes). - is_binary: If True, write in binary mode. If False (default), write in text mode. retry_count: Internal parameter to track the number of retry attempts (default: 0). """ max_retries = 3 - tmp_path: str | None = None try: # Use the platform-specific write function resolved at import time. - tmp_path = await asyncio.to_thread(_write_file, path, data, is_binary=is_binary) - - # On Linux/Unix, replace the destination file with tmp file. - if tmp_path is not None: - await asyncio.to_thread(os.replace, tmp_path, str(path)) + await asyncio.to_thread(_write_file, path, data) except (FileNotFoundError, PermissionError): if retry_count < max_retries: - if tmp_path is not None: - await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) return await atomic_write( path, data, - is_binary=is_binary, retry_count=retry_count + 1, ) # If we reach the maximum number of retries, raise the exception. raise - finally: - if tmp_path is not None: - await asyncio.to_thread(Path(tmp_path).unlink, missing_ok=True) - async def export_json_to_stream( iterator: AsyncIterator[dict[str, Any]], diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 1e417e36df..8464772fb5 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -321,7 +321,7 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No await asyncio.to_thread(self.path_to_kvs.mkdir, parents=True, exist_ok=True) # Write the value to the file. - await atomic_write(record_path, value_bytes, is_binary=True) + await atomic_write(record_path, value_bytes) # Write the record metadata to the file. 
await atomic_write(record_metadata_filepath, record_metadata_content) From 70bc071c69c5a06da34b12bf7e9d7392a0bc5301 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 16 Jun 2025 15:20:01 +0200 Subject: [PATCH 24/43] revert un-intentionally makefile changes --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 79767f3656..00a796d909 100644 --- a/Makefile +++ b/Makefile @@ -55,4 +55,4 @@ build-docs: cd website && corepack enable && yarn && uv run yarn build run-docs: build-api-reference - cd website && corepack enable && yarn && uv run yarn start \ No newline at end of file + cd website && corepack enable && yarn && uv run yarn start From 78efb4ddf234e731a1c784a2280a8b1bec812573 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 18 Jun 2025 12:01:11 +0200 Subject: [PATCH 25/43] Address Honza's comments (p1) --- docs/upgrading/upgrading_to_v1.md | 12 ++++++++++ .../request_loaders/_request_loader.py | 4 ++-- .../storage_clients/_base/_storage_client.py | 12 +++++----- .../_file_system/_storage_client.py | 9 +++++--- .../_memory/_storage_client.py | 6 ++--- src/crawlee/storages/_dataset.py | 2 +- src/crawlee/storages/_key_value_store.py | 2 +- src/crawlee/storages/_request_queue.py | 2 +- .../_file_system/test_fs_dataset_client.py | 16 +++++++------- .../_file_system/test_fs_kvs_client.py | 16 +++++++------- .../_file_system/test_fs_rq_client.py | 22 +++++++++---------- .../_memory/test_memory_dataset_client.py | 10 ++++----- .../_memory/test_memory_kvs_client.py | 10 ++++----- .../_memory/test_memory_rq_client.py | 10 ++++----- 14 files changed, 74 insertions(+), 59 deletions(-) diff --git a/docs/upgrading/upgrading_to_v1.md b/docs/upgrading/upgrading_to_v1.md index 00df2586a5..894104d85e 100644 --- a/docs/upgrading/upgrading_to_v1.md +++ b/docs/upgrading/upgrading_to_v1.md @@ -84,6 +84,18 @@ crawler = ParselCrawler(storage_client=storage_client) The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class. Persistence is now determined solely by the storage client class you use. +### Storage client instance behavior + +Instance caching is implemented for the storage open methods: `Dataset.open()`, `KeyValueStore.open()`, +and `RequestQueue.open()`. This means that when you call these methods with the same arguments, +the same instance is returned each time. + +In contrast, when using client methods such as `StorageClient.open_dataset_client()`, each call creates +a new `DatasetClient` instance, even if the arguments are identical. These methods do not use instance caching. + +This usage pattern is not common, and it is generally recommended to open storages using the standard storage +open methods rather than the storage client methods. + ### Writing custom storage clients The storage client interface has been fully reworked. Collection storage clients have been removed - now there is diff --git a/src/crawlee/request_loaders/_request_loader.py b/src/crawlee/request_loaders/_request_loader.py index e33707b7ae..1f9e4aa641 100644 --- a/src/crawlee/request_loaders/_request_loader.py +++ b/src/crawlee/request_loaders/_request_loader.py @@ -31,11 +31,11 @@ async def get_handled_count(self) -> int: @abstractmethod async def get_total_count(self) -> int: - """Get the total number of requests in the loader (i.e. pending + handled).""" + """Get an offline approximation of the total number of requests in the loader (i.e. 
pending + handled).""" @abstractmethod async def is_empty(self) -> bool: - """Return True if there are no more requests in the source (there might still be unfinished requests).""" + """Return True if there are no more requests in the loader (there might still be unfinished requests).""" @abstractmethod async def is_finished(self) -> bool: diff --git a/src/crawlee/storage_clients/_base/_storage_client.py b/src/crawlee/storage_clients/_base/_storage_client.py index 745613d068..de49500d20 100644 --- a/src/crawlee/storage_clients/_base/_storage_client.py +++ b/src/crawlee/storage_clients/_base/_storage_client.py @@ -29,34 +29,34 @@ class StorageClient(ABC): """ @abstractmethod - async def open_dataset_client( + async def create_dataset_client( self, *, id: str | None = None, name: str | None = None, configuration: Configuration | None = None, ) -> DatasetClient: - """Open a dataset client.""" + """Create a dataset client.""" @abstractmethod - async def open_key_value_store_client( + async def create_kvs_client( self, *, id: str | None = None, name: str | None = None, configuration: Configuration | None = None, ) -> KeyValueStoreClient: - """Open a key-value store client.""" + """Create a key-value store client.""" @abstractmethod - async def open_request_queue_client( + async def create_rq_client( self, *, id: str | None = None, name: str | None = None, configuration: Configuration | None = None, ) -> RequestQueueClient: - """Open a request queue client.""" + """Create a request queue client.""" def get_rate_limit_errors(self) -> dict[int, int]: """Return statistics about rate limit errors encountered by the HTTP client in storage client.""" diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py index c4edd6f83c..9c293725d3 100644 --- a/src/crawlee/storage_clients/_file_system/_storage_client.py +++ b/src/crawlee/storage_clients/_file_system/_storage_client.py @@ -24,10 +24,13 @@ class FileSystemStorageClient(StorageClient): All data persists between program runs but is limited to access from the local machine where the files are stored. + + Warning: This storage client is not safe for concurrent access from multiple crawler processes. + Use it only when running a single crawler process at a time. 
""" @override - async def open_dataset_client( + async def create_dataset_client( self, *, id: str | None = None, @@ -40,7 +43,7 @@ async def open_dataset_client( return client @override - async def open_key_value_store_client( + async def create_kvs_client( self, *, id: str | None = None, @@ -53,7 +56,7 @@ async def open_key_value_store_client( return client @override - async def open_request_queue_client( + async def create_rq_client( self, *, id: str | None = None, diff --git a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py index d23458c9f9..9e3a2a4d2f 100644 --- a/src/crawlee/storage_clients/_memory/_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_storage_client.py @@ -28,7 +28,7 @@ class MemoryStorageClient(StorageClient): """ @override - async def open_dataset_client( + async def create_dataset_client( self, *, id: str | None = None, @@ -41,7 +41,7 @@ async def open_dataset_client( return client @override - async def open_key_value_store_client( + async def create_kvs_client( self, *, id: str | None = None, @@ -54,7 +54,7 @@ async def open_key_value_store_client( return client @override - async def open_request_queue_client( + async def create_rq_client( self, *, id: str | None = None, diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index dcd57a5c95..cb796fee11 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -120,7 +120,7 @@ async def open( cache_by_id=cls._cache_by_id, cache_by_name=cls._cache_by_name, default_instance_attr='_default_instance', - client_opener=storage_client.open_dataset_client, + client_opener=storage_client.create_dataset_client, ) @override diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 659cd67384..0863193873 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -128,7 +128,7 @@ async def open( cache_by_id=cls._cache_by_id, cache_by_name=cls._cache_by_name, default_instance_attr='_default_instance', - client_opener=storage_client.open_key_value_store_client, + client_opener=storage_client.create_kvs_client, ) @override diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index d08243b650..cda808bb5e 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -136,7 +136,7 @@ async def open( cache_by_id=cls._cache_by_id, cache_by_name=cls._cache_by_name, default_instance_attr='_default_instance', - client_opener=storage_client.open_request_queue_client, + client_opener=storage_client.create_rq_client, ) @override diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py index c531ffdf41..450a9073e8 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -28,7 +28,7 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]: """A fixture for a file system dataset client.""" - client = await FileSystemStorageClient().open_dataset_client( + client = await FileSystemStorageClient().create_dataset_client( name='test_dataset', configuration=configuration, ) @@ -38,7 +38,7 @@ async def dataset_client(configuration: Configuration) -> 
AsyncGenerator[FileSys async def test_open_creates_new_dataset(configuration: Configuration) -> None: """Test that open() creates a new dataset with proper metadata when it doesn't exist.""" - client = await FileSystemStorageClient().open_dataset_client( + client = await FileSystemStorageClient().create_dataset_client( name='new_dataset', configuration=configuration, ) @@ -69,7 +69,7 @@ async def test_open_dataset_by_id(configuration: Configuration) -> None: storage_client = FileSystemStorageClient() # First create a dataset by name - original_client = await storage_client.open_dataset_client( + original_client = await storage_client.create_dataset_client( name='open-by-id-test', configuration=configuration, ) @@ -81,7 +81,7 @@ async def test_open_dataset_by_id(configuration: Configuration) -> None: await original_client.push_data({'test_item': 'test_value'}) # Now try to open the same dataset using just the ID - reopened_client = await storage_client.open_dataset_client( + reopened_client = await storage_client.create_dataset_client( id=dataset_id, configuration=configuration, ) @@ -104,7 +104,7 @@ async def test_dataset_client_purge_on_start(configuration: Configuration) -> No configuration.purge_on_start = True # Create dataset and add data - dataset_client1 = await FileSystemStorageClient().open_dataset_client( + dataset_client1 = await FileSystemStorageClient().create_dataset_client( configuration=configuration, ) await dataset_client1.push_data({'item': 'initial data'}) @@ -114,7 +114,7 @@ async def test_dataset_client_purge_on_start(configuration: Configuration) -> No assert len(items.items) == 1 # Reopen - dataset_client2 = await FileSystemStorageClient().open_dataset_client( + dataset_client2 = await FileSystemStorageClient().create_dataset_client( configuration=configuration, ) @@ -128,14 +128,14 @@ async def test_dataset_client_no_purge_on_start(configuration: Configuration) -> configuration.purge_on_start = False # Create dataset and add data - dataset_client1 = await FileSystemStorageClient().open_dataset_client( + dataset_client1 = await FileSystemStorageClient().create_dataset_client( name='test-no-purge-dataset', configuration=configuration, ) await dataset_client1.push_data({'item': 'preserved data'}) # Reopen - dataset_client2 = await FileSystemStorageClient().open_dataset_client( + dataset_client2 = await FileSystemStorageClient().create_dataset_client( name='test-no-purge-dataset', configuration=configuration, ) diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index 0f0a31e9d9..765059d305 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -28,7 +28,7 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]: """A fixture for a file system key-value store client.""" - client = await FileSystemStorageClient().open_key_value_store_client( + client = await FileSystemStorageClient().create_kvs_client( name='test_kvs', configuration=configuration, ) @@ -38,7 +38,7 @@ async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemK async def test_open_creates_new_kvs(configuration: Configuration) -> None: """Test that open() creates a new key-value store with proper metadata and files on disk.""" - client = await 
FileSystemStorageClient().open_key_value_store_client( + client = await FileSystemStorageClient().create_kvs_client( name='new_kvs', configuration=configuration, ) @@ -67,7 +67,7 @@ async def test_open_kvs_by_id(configuration: Configuration) -> None: storage_client = FileSystemStorageClient() # First create a key-value store by name - original_client = await storage_client.open_key_value_store_client( + original_client = await storage_client.create_kvs_client( name='open-by-id-test', configuration=configuration, ) @@ -79,7 +79,7 @@ async def test_open_kvs_by_id(configuration: Configuration) -> None: await original_client.set_value(key='test-key', value='test-value') # Now try to open the same key-value store using just the ID - reopened_client = await storage_client.open_key_value_store_client( + reopened_client = await storage_client.create_kvs_client( id=kvs_id, configuration=configuration, ) @@ -102,7 +102,7 @@ async def test_kvs_client_purge_on_start(configuration: Configuration) -> None: configuration.purge_on_start = True # Create KVS and add data - kvs_client1 = await FileSystemStorageClient().open_key_value_store_client( + kvs_client1 = await FileSystemStorageClient().create_kvs_client( configuration=configuration, ) await kvs_client1.set_value(key='test-key', value='initial value') @@ -113,7 +113,7 @@ async def test_kvs_client_purge_on_start(configuration: Configuration) -> None: assert record.value == 'initial value' # Reopen - kvs_client2 = await FileSystemStorageClient().open_key_value_store_client( + kvs_client2 = await FileSystemStorageClient().create_kvs_client( configuration=configuration, ) @@ -127,14 +127,14 @@ async def test_kvs_client_no_purge_on_start(configuration: Configuration) -> Non configuration.purge_on_start = False # Create KVS and add data - kvs_client1 = await FileSystemStorageClient().open_key_value_store_client( + kvs_client1 = await FileSystemStorageClient().create_kvs_client( name='test-no-purge-kvs', configuration=configuration, ) await kvs_client1.set_value(key='test-key', value='preserved value') # Reopen - kvs_client2 = await FileSystemStorageClient().open_key_value_store_client( + kvs_client2 = await FileSystemStorageClient().create_kvs_client( name='test-no-purge-kvs', configuration=configuration, ) diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 5147beadfa..52413d7858 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -28,7 +28,7 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def rq_client(configuration: Configuration) -> AsyncGenerator[FileSystemRequestQueueClient, None]: """A fixture for a file system request queue client.""" - client = await FileSystemStorageClient().open_request_queue_client( + client = await FileSystemStorageClient().create_rq_client( name='test_request_queue', configuration=configuration, ) @@ -41,7 +41,7 @@ async def test_open_request_queue_by_id(configuration: Configuration) -> None: storage_client = FileSystemStorageClient() # First create a request queue by name - original_client = await storage_client.open_request_queue_client( + original_client = await storage_client.create_rq_client( name='open-by-id-test', configuration=configuration, ) @@ -53,7 +53,7 @@ async def test_open_request_queue_by_id(configuration: Configuration) -> None: await 
original_client.add_batch_of_requests([Request.from_url('https://example.com/test')]) # Now try to open the same request queue using just the ID - reopened_client = await storage_client.open_request_queue_client( + reopened_client = await storage_client.create_rq_client( id=rq_id, configuration=configuration, ) @@ -73,7 +73,7 @@ async def test_open_request_queue_by_id(configuration: Configuration) -> None: async def test_open_creates_new_rq(configuration: Configuration) -> None: """Test that open() creates a new request queue with proper metadata and files on disk.""" - client = await FileSystemStorageClient().open_request_queue_client( + client = await FileSystemStorageClient().create_rq_client( name='new_request_queue', configuration=configuration, ) @@ -105,7 +105,7 @@ async def test_rq_client_purge_on_start(configuration: Configuration) -> None: configuration.purge_on_start = True # Create request queue and add data - rq_client1 = await FileSystemStorageClient().open_request_queue_client(configuration=configuration) + rq_client1 = await FileSystemStorageClient().create_rq_client(configuration=configuration) await rq_client1.add_batch_of_requests([Request.from_url('https://example.com')]) # Verify request was added @@ -114,7 +114,7 @@ async def test_rq_client_purge_on_start(configuration: Configuration) -> None: assert rq_client1.metadata.handled_request_count == 0 # Reopen - rq_client2 = await FileSystemStorageClient().open_request_queue_client(configuration=configuration) + rq_client2 = await FileSystemStorageClient().create_rq_client(configuration=configuration) # Verify data was purged assert rq_client2.metadata.pending_request_count == 0 @@ -127,14 +127,14 @@ async def test_rq_client_no_purge_on_start(configuration: Configuration) -> None configuration.purge_on_start = False # Create request queue and add data - rq_client1 = await FileSystemStorageClient().open_request_queue_client( + rq_client1 = await FileSystemStorageClient().create_rq_client( name='test-no-purge-rq', configuration=configuration, ) await rq_client1.add_batch_of_requests([Request.from_url('https://example.com')]) # Reopen - rq_client2 = await FileSystemStorageClient().open_request_queue_client( + rq_client2 = await FileSystemStorageClient().create_rq_client( name='test-no-purge-rq', configuration=configuration, ) @@ -383,7 +383,7 @@ async def test_get_request(rq_client: FileSystemRequestQueueClient) -> None: async def test_drop(configuration: Configuration) -> None: """Test dropping the queue removes files from the filesystem.""" - client = await FileSystemStorageClient().open_request_queue_client( + client = await FileSystemStorageClient().create_rq_client( name='drop_test', configuration=configuration, ) @@ -413,7 +413,7 @@ async def test_file_persistence(configuration: Configuration) -> None: configuration.purge_on_start = False # Create a client and add requests - client1 = await FileSystemStorageClient().open_request_queue_client( + client1 = await FileSystemStorageClient().create_rq_client( name='persistence_test', configuration=configuration, ) @@ -439,7 +439,7 @@ async def test_file_persistence(configuration: Configuration) -> None: assert len(request_files) > 0, 'Request files should exist' # Create a new client with same name (which will load from files) - client2 = await FileSystemStorageClient().open_request_queue_client( + client2 = await FileSystemStorageClient().create_rq_client( name='persistence_test', configuration=configuration, ) diff --git 
a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py index c25074e5c0..6cb77556f4 100644 --- a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py @@ -18,14 +18,14 @@ @pytest.fixture async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]: """Fixture that provides a fresh memory dataset client for each test.""" - client = await MemoryStorageClient().open_dataset_client(name='test_dataset') + client = await MemoryStorageClient().create_dataset_client(name='test_dataset') yield client await client.drop() async def test_open_creates_new_dataset() -> None: """Test that open() creates a new dataset with proper metadata and adds it to the cache.""" - client = await MemoryStorageClient().open_dataset_client(name='new_dataset') + client = await MemoryStorageClient().create_dataset_client(name='new_dataset') # Verify correct client type and properties assert isinstance(client, MemoryDatasetClient) @@ -42,7 +42,7 @@ async def test_dataset_client_purge_on_start() -> None: configuration = Configuration(purge_on_start=True) # Create dataset and add data - dataset_client1 = await MemoryStorageClient().open_dataset_client( + dataset_client1 = await MemoryStorageClient().create_dataset_client( name='test_purge_dataset', configuration=configuration, ) @@ -53,7 +53,7 @@ async def test_dataset_client_purge_on_start() -> None: assert len(items.items) == 1 # Reopen - dataset_client2 = await MemoryStorageClient().open_dataset_client( + dataset_client2 = await MemoryStorageClient().create_dataset_client( name='test_purge_dataset', configuration=configuration, ) @@ -65,7 +65,7 @@ async def test_dataset_client_purge_on_start() -> None: async def test_open_with_id_and_name() -> None: """Test that open() can be used with both id and name parameters.""" - client = await MemoryStorageClient().open_dataset_client( + client = await MemoryStorageClient().create_dataset_client( id='some-id', name='some-name', ) diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py index 0af70285c1..ee699d4230 100644 --- a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -18,14 +18,14 @@ @pytest.fixture async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]: """Fixture that provides a fresh memory key-value store client for each test.""" - client = await MemoryStorageClient().open_key_value_store_client(name='test_kvs') + client = await MemoryStorageClient().create_kvs_client(name='test_kvs') yield client await client.drop() async def test_open_creates_new_kvs() -> None: """Test that open() creates a new key-value store with proper metadata and adds it to the cache.""" - client = await MemoryStorageClient().open_key_value_store_client(name='new_kvs') + client = await MemoryStorageClient().create_kvs_client(name='new_kvs') # Verify correct client type and properties assert isinstance(client, MemoryKeyValueStoreClient) @@ -41,7 +41,7 @@ async def test_kvs_client_purge_on_start() -> None: configuration = Configuration(purge_on_start=True) # Create KVS and add data - kvs_client1 = await MemoryStorageClient().open_key_value_store_client( + kvs_client1 = await MemoryStorageClient().create_kvs_client( name='test_purge_kvs', configuration=configuration, ) @@ -53,7 +53,7 @@ async def 
test_kvs_client_purge_on_start() -> None: assert record.value == 'initial value' # Reopen - kvs_client2 = await MemoryStorageClient().open_key_value_store_client( + kvs_client2 = await MemoryStorageClient().create_kvs_client( name='test_purge_kvs', configuration=configuration, ) @@ -65,7 +65,7 @@ async def test_kvs_client_purge_on_start() -> None: async def test_open_with_id_and_name() -> None: """Test that open() can be used with both id and name parameters.""" - client = await MemoryStorageClient().open_key_value_store_client( + client = await MemoryStorageClient().create_kvs_client( id='some-id', name='some-name', ) diff --git a/tests/unit/storage_clients/_memory/test_memory_rq_client.py b/tests/unit/storage_clients/_memory/test_memory_rq_client.py index 028c53ccd2..f5ef1060e5 100644 --- a/tests/unit/storage_clients/_memory/test_memory_rq_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_rq_client.py @@ -18,14 +18,14 @@ @pytest.fixture async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]: """Fixture that provides a fresh memory request queue client for each test.""" - client = await MemoryStorageClient().open_request_queue_client(name='test_rq') + client = await MemoryStorageClient().create_rq_client(name='test_rq') yield client await client.drop() async def test_open_creates_new_rq() -> None: """Test that open() creates a new request queue with proper metadata and adds it to the cache.""" - client = await MemoryStorageClient().open_request_queue_client(name='new_rq') + client = await MemoryStorageClient().create_rq_client(name='new_rq') # Verify correct client type and properties assert isinstance(client, MemoryRequestQueueClient) @@ -45,7 +45,7 @@ async def test_rq_client_purge_on_start() -> None: configuration = Configuration(purge_on_start=True) # Create RQ and add data - rq_client1 = await MemoryStorageClient().open_request_queue_client( + rq_client1 = await MemoryStorageClient().create_rq_client( name='test_purge_rq', configuration=configuration, ) @@ -56,7 +56,7 @@ async def test_rq_client_purge_on_start() -> None: assert await rq_client1.is_empty() is False # Reopen - rq_client2 = await MemoryStorageClient().open_request_queue_client( + rq_client2 = await MemoryStorageClient().create_rq_client( name='test_purge_rq', configuration=configuration, ) @@ -67,7 +67,7 @@ async def test_rq_client_purge_on_start() -> None: async def test_open_with_id_and_name() -> None: """Test that open() can be used with both id and name parameters.""" - client = await MemoryStorageClient().open_request_queue_client( + client = await MemoryStorageClient().create_rq_client( id='some-id', name='some-name', ) From fa18d199e8cf02a3213fc43d4d342f022779954c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 19 Jun 2025 15:34:57 +0200 Subject: [PATCH 26/43] Introduce storage instance manager --- src/crawlee/_service_locator.py | 17 +++ src/crawlee/_utils/recoverable_state.py | 8 +- src/crawlee/storages/_dataset.py | 26 +--- src/crawlee/storages/_key_value_store.py | 28 +--- src/crawlee/storages/_request_queue.py | 25 +--- .../storages/_storage_instance_manager.py | 124 ++++++++++++++++++ src/crawlee/storages/_utils.py | 44 ------- tests/unit/conftest.py | 20 +-- tests/unit/storages/test_request_queue.py | 11 +- 9 files changed, 177 insertions(+), 126 deletions(-) create mode 100644 src/crawlee/storages/_storage_instance_manager.py delete mode 100644 src/crawlee/storages/_utils.py diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index 
2cb8f8302a..427a5538aa 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -1,11 +1,16 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.errors import ServiceConflictError from crawlee.events import EventManager, LocalEventManager from crawlee.storage_clients import FileSystemStorageClient, StorageClient +if TYPE_CHECKING: + from crawlee.storages._storage_instance_manager import StorageInstanceManager + @docs_group('Classes') class ServiceLocator: @@ -18,6 +23,7 @@ def __init__(self) -> None: self._configuration: Configuration | None = None self._event_manager: EventManager | None = None self._storage_client: StorageClient | None = None + self._storage_instance_manager: StorageInstanceManager | None = None # Flags to check if the services were already set. self._configuration_was_retrieved = False @@ -94,5 +100,16 @@ def set_storage_client(self, storage_client: StorageClient) -> None: self._storage_client = storage_client + @property + def storage_instance_manager(self) -> StorageInstanceManager: + """Get the storage instance manager.""" + if self._storage_instance_manager is None: + # Import here to avoid circular imports + from crawlee.storages._storage_instance_manager import StorageInstanceManager + + self._storage_instance_manager = StorageInstanceManager() + + return self._storage_instance_manager + service_locator = ServiceLocator() diff --git a/src/crawlee/_utils/recoverable_state.py b/src/crawlee/_utils/recoverable_state.py index 2cfdcd9ec7..03c6e399c4 100644 --- a/src/crawlee/_utils/recoverable_state.py +++ b/src/crawlee/_utils/recoverable_state.py @@ -6,11 +6,12 @@ from crawlee import service_locator from crawlee.events._types import Event, EventPersistStateData -from crawlee.storages._key_value_store import KeyValueStore if TYPE_CHECKING: import logging + from crawlee.storages._key_value_store import KeyValueStore + TStateModel = TypeVar('TStateModel', bound=BaseModel) @@ -59,7 +60,7 @@ def __init__( self._persist_state_key = persist_state_key self._persist_state_kvs_name = persist_state_kvs_name self._persist_state_kvs_id = persist_state_kvs_id - self._key_value_store: KeyValueStore | None = None + self._key_value_store: 'KeyValueStore | None' = None # noqa: UP037 self._log = logger async def initialize(self) -> TStateModel: @@ -75,6 +76,9 @@ async def initialize(self) -> TStateModel: self._state = self._default_state.model_copy(deep=True) return self.current_value + # Import here to avoid circular imports + from crawlee.storages._key_value_store import KeyValueStore + self._key_value_store = await KeyValueStore.open( name=self._persist_state_kvs_name, id=self._persist_state_kvs_id ) diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index cb796fee11..50badbf246 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -12,11 +12,10 @@ from ._base import Storage from ._key_value_store import KeyValueStore -from ._utils import open_storage_instance if TYPE_CHECKING: from collections.abc import AsyncIterator - from typing import Any, ClassVar, Literal + from typing import Any, Literal from typing_extensions import Unpack @@ -66,15 +65,6 @@ class Dataset(Storage): ``` """ - _cache_by_id: ClassVar[dict[str, Dataset]] = {} - """A dictionary to cache datasets by ID.""" - - _cache_by_name: ClassVar[dict[str, Dataset]] = {} - """A dictionary to cache datasets by 
name.""" - - _default_instance: ClassVar[Dataset | None] = None - """Cache for the default dataset instance.""" - def __init__(self, client: DatasetClient) -> None: """Initialize a new instance. @@ -112,25 +102,19 @@ async def open( ) -> Dataset: configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client - return await open_storage_instance( + + return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, configuration=configuration, - cache_by_id=cls._cache_by_id, - cache_by_name=cls._cache_by_name, - default_instance_attr='_default_instance', client_opener=storage_client.create_dataset_client, ) @override async def drop(self) -> None: - if self.id in self._cache_by_id: - del self._cache_by_id[self.id] - - if self.name in self._cache_by_name: - del self._cache_by_name[self.name] - + storage_instance_manager = service_locator.storage_instance_manager + storage_instance_manager.remove_from_cache(self) await self._client.drop() @override diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 0863193873..9a9974c20e 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -11,19 +11,20 @@ from crawlee import service_locator from crawlee._types import JsonSerializable # noqa: TC001 from crawlee._utils.docs import docs_group +from crawlee._utils.recoverable_state import RecoverableState from crawlee.storage_clients.models import KeyValueStoreMetadata from ._base import Storage -from ._utils import open_storage_instance if TYPE_CHECKING: from collections.abc import AsyncIterator - from crawlee._utils.recoverable_state import RecoverableState from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata +else: + from crawlee._utils.recoverable_state import RecoverableState T = TypeVar('T') @@ -65,15 +66,6 @@ class KeyValueStore(Storage): ``` """ - _cache_by_id: ClassVar[dict[str, KeyValueStore]] = {} - """A dictionary to cache key-value stores by ID.""" - - _cache_by_name: ClassVar[dict[str, KeyValueStore]] = {} - """A dictionary to cache key-value stores by name.""" - - _default_instance: ClassVar[KeyValueStore | None] = None - """Cache for the default key-value store instance.""" - _autosaved_values: ClassVar[ dict[ str, @@ -120,23 +112,19 @@ async def open( ) -> KeyValueStore: configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client - return await open_storage_instance( + + return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, configuration=configuration, - cache_by_id=cls._cache_by_id, - cache_by_name=cls._cache_by_name, - default_instance_attr='_default_instance', client_opener=storage_client.create_kvs_client, ) @override async def drop(self) -> None: - if self.id in self._cache_by_id: - del self._cache_by_id[self.id] - if self.name is not None and self.name in self._cache_by_name: - del self._cache_by_name[self.name] + storage_instance_manager = service_locator.storage_instance_manager + storage_instance_manager.remove_from_cache(self) await 
self._clear_cache() # Clear cache with persistent values. await self._client.drop() @@ -259,8 +247,6 @@ async def get_auto_saved_value( Returns: Return the value of the key. """ - from crawlee._utils.recoverable_state import RecoverableState - default_value = {} if default_value is None else default_value async with self._autosave_lock: diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index cda808bb5e..36551a6e16 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -3,7 +3,7 @@ import asyncio from datetime import timedelta from logging import getLogger -from typing import TYPE_CHECKING, ClassVar, TypeVar +from typing import TYPE_CHECKING, TypeVar from typing_extensions import override @@ -13,7 +13,6 @@ from crawlee.request_loaders import RequestManager from ._base import Storage -from ._utils import open_storage_instance if TYPE_CHECKING: from collections.abc import Sequence @@ -71,15 +70,6 @@ class RequestQueue(Storage, RequestManager): ``` """ - _cache_by_id: ClassVar[dict[str, RequestQueue]] = {} - """A dictionary to cache request queues by ID.""" - - _cache_by_name: ClassVar[dict[str, RequestQueue]] = {} - """A dictionary to cache request queues by name.""" - - _default_instance: ClassVar[RequestQueue | None] = None - """Cache for the default request queue instance.""" - def __init__(self, client: RequestQueueClient) -> None: """Initialize a new instance. @@ -128,25 +118,20 @@ async def open( ) -> RequestQueue: configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client - return await open_storage_instance( + + return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, configuration=configuration, - cache_by_id=cls._cache_by_id, - cache_by_name=cls._cache_by_name, - default_instance_attr='_default_instance', client_opener=storage_client.create_rq_client, ) @override async def drop(self) -> None: # Remove from cache before dropping - if self.id in self._cache_by_id: - del self._cache_by_id[self.id] - - if self.name is not None and self.name in self._cache_by_name: - del self._cache_by_name[self.name] + storage_instance_manager = service_locator.storage_instance_manager + storage_instance_manager.remove_from_cache(self) await self._client.drop() diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py new file mode 100644 index 0000000000..44d5f17fdd --- /dev/null +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +from typing import Any, Callable, TypeVar, cast + +from crawlee._utils.docs import docs_group + +from ._base import Storage + +T = TypeVar('T', bound='Storage') + + +@docs_group('Classes') +class StorageInstanceManager: + """Manager for caching and managing storage instances. + + This class centralizes the caching logic for all storage types (Dataset, KeyValueStore, RequestQueue) + and provides a unified interface for opening and managing storage instances. 
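+
+    A sketch of the resulting caching behavior (assuming storages are opened via `Dataset.open`,
+    `KeyValueStore.open` or `RequestQueue.open`, which delegate to this manager):
+
+    ```python
+    from crawlee.storages import Dataset
+
+    async def main() -> None:
+        dataset_1 = await Dataset.open(name='my-dataset')
+        # The second call is served from the cache, so both names refer to the same instance.
+        dataset_2 = await Dataset.open(name='my-dataset')
+        assert dataset_1 is dataset_2
+    ```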
+ """ + + def __init__(self) -> None: + self._cache_by_id = dict[type[Storage], dict[str, Storage]]() + """Cache for storage instances by ID, separated by storage type.""" + + self._cache_by_name = dict[type[Storage], dict[str, Storage]]() + """Cache for storage instances by name, separated by storage type.""" + + self._default_instances = dict[type[Storage], Storage]() + """Cache for default instances of each storage type.""" + + async def open_storage_instance( + self, + cls: type[T], + *, + id: str | None, + name: str | None, + configuration: Any, + client_opener: Callable[..., Any], + ) -> T: + """Open a storage instance with caching support. + + Args: + cls: The storage class to instantiate. + id: Storage ID. + name: Storage name. + configuration: Configuration object. + client_opener: Function to create the storage client. + + Returns: + The storage instance. + + Raises: + ValueError: If both id and name are specified. + """ + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Check for default instance + if id is None and name is None and cls in self._default_instances: + return cast('T', self._default_instances[cls]) + + # Check cache + if id is not None: + type_cache_by_id = self._cache_by_id.get(cls, {}) + if id in type_cache_by_id: + cached_instance = type_cache_by_id[id] + if isinstance(cached_instance, cls): + return cached_instance + + if name is not None: + type_cache_by_name = self._cache_by_name.get(cls, {}) + if name in type_cache_by_name: + cached_instance = type_cache_by_name[name] + if isinstance(cached_instance, cls): + return cached_instance + + # Create new instance + client = await client_opener(id=id, name=name, configuration=configuration) + instance = cls(client) # type: ignore[call-arg] + instance_name = getattr(instance, 'name', None) + + # Cache the instance + if cls not in self._cache_by_id: + self._cache_by_id[cls] = {} + if cls not in self._cache_by_name: + self._cache_by_name[cls] = {} + + self._cache_by_id[cls][instance.id] = instance + if instance_name is not None: + self._cache_by_name[cls][instance_name] = instance + + # Set as default if no id/name specified + if id is None and name is None: + self._default_instances[cls] = instance + + return instance + + def remove_from_cache(self, storage_instance: Storage) -> None: + """Remove a storage instance from the cache. + + Args: + storage_instance: The storage instance to remove. 
+ """ + storage_type = type(storage_instance) + + # Remove from ID cache + type_cache_by_id = self._cache_by_id.get(storage_type, {}) + if storage_instance.id in type_cache_by_id: + del type_cache_by_id[storage_instance.id] + + # Remove from name cache + if storage_instance.name is not None: + type_cache_by_name = self._cache_by_name.get(storage_type, {}) + if storage_instance.name in type_cache_by_name: + del type_cache_by_name[storage_instance.name] + + # Remove from default instances + if storage_type in self._default_instances and self._default_instances[storage_type] is storage_instance: + del self._default_instances[storage_type] + + def clear_cache(self) -> None: + """Clear all cached storage instances.""" + self._cache_by_id.clear() + self._cache_by_name.clear() + self._default_instances.clear() diff --git a/src/crawlee/storages/_utils.py b/src/crawlee/storages/_utils.py deleted file mode 100644 index e8c190dfa2..0000000000 --- a/src/crawlee/storages/_utils.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import annotations - -from typing import Any, Callable, TypeVar, cast - -from ._base import Storage - -T = TypeVar('T', bound=Storage) - - -async def open_storage_instance( - cls: type[T], - *, - id: str | None, - name: str | None, - configuration: Any, - cache_by_id: dict[str, T], - cache_by_name: dict[str, T], - default_instance_attr: str, - client_opener: Callable[..., Any], -) -> T: - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - - default_instance = getattr(cls, default_instance_attr) - if id is None and name is None and default_instance is not None: - return cast('T', default_instance) - - if id is not None and id in cache_by_id: - return cache_by_id[id] - if name is not None and name in cache_by_name: - return cache_by_name[name] - - client = await client_opener(id=id, name=name, configuration=configuration) - instance = cls(client) # type: ignore[call-arg] - instance_name = getattr(instance, 'name', None) - - cache_by_id[instance.id] = instance - if instance_name is not None: - cache_by_name[instance_name] = instance - - if id is None and name is None: - setattr(cls, default_instance_attr, instance) - - return instance diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f7aa551dd7..0d139c9372 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -16,7 +16,7 @@ from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_network from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient from crawlee.proxy_configuration import ProxyInfo -from crawlee.storages import Dataset, KeyValueStore, RequestQueue +from crawlee.storages import KeyValueStore from tests.unit.server import TestServer, app, serve_in_thread if TYPE_CHECKING: @@ -61,6 +61,12 @@ def _prepare_test_env() -> None: service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None + service_locator._storage_instance_manager = None + + # Reset the retrieval flags + service_locator._configuration_was_retrieved = False + service_locator._event_manager_was_retrieved = False + service_locator._storage_client_was_retrieved = False # Verify that the test environment was set up correctly. 
assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) @@ -68,18 +74,6 @@ def _prepare_test_env() -> None: assert service_locator._storage_client_was_retrieved is False assert service_locator._event_manager_was_retrieved is False - Dataset._cache_by_id.clear() - Dataset._cache_by_name.clear() - Dataset._default_instance = None - - KeyValueStore._cache_by_id.clear() - KeyValueStore._cache_by_name.clear() - KeyValueStore._default_instance = None - - RequestQueue._cache_by_id.clear() - RequestQueue._cache_by_name.clear() - RequestQueue._default_instance = None - return _prepare_test_env diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 98236c3a49..1f3d1936df 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -8,7 +8,7 @@ import pytest -from crawlee import Request +from crawlee import Request, service_locator from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient from crawlee.storages import RequestQueue @@ -17,6 +17,8 @@ from collections.abc import AsyncGenerator from pathlib import Path + from crawlee.storage_clients import StorageClient + @pytest.fixture(params=['memory', 'file_system']) def storage_client(request: pytest.FixtureRequest) -> StorageClient: @@ -526,10 +528,9 @@ async def test_reopen_default( configuration: Configuration, ) -> None: """Test reopening the default request queue.""" - # First clean up any class-level caches - RequestQueue._cache_by_id.clear() - RequestQueue._cache_by_name.clear() - RequestQueue._default_instance = None + # First clean up any storage instance caches + storage_instance_manager = service_locator.storage_instance_manager + storage_instance_manager.clear_cache() # Open the default request queue rq1 = await RequestQueue.open( From c783dac894ab4c7cc30af61e0217874b3250cb58 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 20 Jun 2025 13:13:40 +0200 Subject: [PATCH 27/43] Utilize recoverable state for the FS RQ state --- src/crawlee/_service_locator.py | 2 +- src/crawlee/_utils/recoverable_state.py | 9 +- .../_file_system/_request_queue_client.py | 290 +++++++++--------- .../_file_system/test_fs_rq_client.py | 9 - 4 files changed, 156 insertions(+), 154 deletions(-) diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index 427a5538aa..52f934a881 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -104,7 +104,7 @@ def set_storage_client(self, storage_client: StorageClient) -> None: def storage_instance_manager(self) -> StorageInstanceManager: """Get the storage instance manager.""" if self._storage_instance_manager is None: - # Import here to avoid circular imports + # Import here to avoid circular imports. 
from crawlee.storages._storage_instance_manager import StorageInstanceManager self._storage_instance_manager = StorageInstanceManager() diff --git a/src/crawlee/_utils/recoverable_state.py b/src/crawlee/_utils/recoverable_state.py index 03c6e399c4..35ee0a1d3f 100644 --- a/src/crawlee/_utils/recoverable_state.py +++ b/src/crawlee/_utils/recoverable_state.py @@ -4,7 +4,6 @@ from pydantic import BaseModel -from crawlee import service_locator from crawlee.events._types import Event, EventPersistStateData if TYPE_CHECKING: @@ -76,7 +75,7 @@ async def initialize(self) -> TStateModel: self._state = self._default_state.model_copy(deep=True) return self.current_value - # Import here to avoid circular imports + # Import here to avoid circular imports. from crawlee.storages._key_value_store import KeyValueStore self._key_value_store = await KeyValueStore.open( @@ -85,6 +84,9 @@ async def initialize(self) -> TStateModel: await self._load_saved_state() + # Import here to avoid circular imports. + from crawlee import service_locator + event_manager = service_locator.get_event_manager() event_manager.on(event=Event.PERSIST_STATE, listener=self.persist_state) @@ -99,6 +101,9 @@ async def teardown(self) -> None: if not self._persistence_enabled: return + # Import here to avoid circular imports. + from crawlee import service_locator + event_manager = service_locator.get_event_manager() event_manager.off(event=Event.PERSIST_STATE, listener=self.persist_state) await self.persist_state() diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 7c4e1ca50f..084d9abda0 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -9,13 +9,14 @@ from pathlib import Path from typing import TYPE_CHECKING -from pydantic import ValidationError +from pydantic import BaseModel, ValidationError from typing_extensions import override from crawlee import Request from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.file import atomic_write, json_dumps +from crawlee._utils.recoverable_state import RecoverableState from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import ( AddRequestsResponse, @@ -32,6 +33,28 @@ logger = getLogger(__name__) +class RequestQueueState(BaseModel): + """State model for the `FileSystemRequestQueueClient`.""" + + sequence_counter: int = 0 + """Counter for regular request ordering.""" + + forefront_sequence_counter: int = 0 + """Counter for forefront request ordering.""" + + forefront_requests: dict[str, int] = {} + """Mapping of forefront request IDs to their sequence numbers.""" + + regular_requests: dict[str, int] = {} + """Mapping of regular request IDs to their sequence numbers.""" + + in_progress_requests: set[str] = set() + """Set of request IDs currently being processed.""" + + handled_requests: set[str] = set() + """Set of request IDs that have been handled.""" + + class FileSystemRequestQueueClient(RequestQueueClient): """A file system implementation of the request queue client. @@ -43,9 +66,10 @@ class FileSystemRequestQueueClient(RequestQueueClient): {STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json ``` - The implementation uses sequence numbers embedded in request files for FIFO ordering of regular requests. 
- It maintains in-memory data structures for tracking in-progress requests and prioritizing forefront requests. - File system storage provides durability at the cost of slower I/O operations compared to memory-based storage. + The implementation uses `RecoverableState` to maintain ordering information, in-progress status, and + request handling status. This allows for proper state recovery across process restarts without + embedding metadata in individual request files. File system storage provides durability at the cost of + slower I/O operations compared to memory only-based storage. This implementation is ideal for long-running crawlers where persistence is important and for situations where you need to resume crawling after process termination. @@ -74,8 +98,6 @@ def __init__( stats: dict, total_request_count: int, storage_dir: Path, - sequence_counter: int, - forefront_sequence_counter: int, ) -> None: """Initialize a new instance. @@ -97,24 +119,9 @@ def __init__( self._storage_dir = storage_dir """The base directory where the request queue is stored.""" - self._sequence_counter = sequence_counter - """A counter to track the order of (normal) requests added to the queue. - - This number is going to be used as a sequence number for next request. - """ - - self._forefront_sequence_counter = forefront_sequence_counter - """A counter to track the order of forefront requests added to the queue. - - This number is going to be used as a sequence number for next forefront request. - """ - self._lock = asyncio.Lock() """A lock to ensure that only one operation is performed at a time.""" - self._in_progress = set[str]() - """A set of request IDs that are currently being processed.""" - self._request_cache = deque[Request]() """Cache for requests: forefront requests at the beginning, regular requests at the end.""" @@ -124,6 +131,15 @@ def __init__( self._is_empty_cache: bool | None = None """Cache for is_empty result: None means unknown, True/False is cached state.""" + self._state = RecoverableState[RequestQueueState]( + default_state=RequestQueueState(), + persist_state_key='request_queue_state', + persistence_enabled=True, + persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}', + logger=logger, + ) + """Recoverable state to maintain request ordering, in-progress status, and handled status.""" + @property @override def metadata(self) -> RequestQueueMetadata: @@ -174,20 +190,13 @@ async def open( file_content = json.load(file) metadata = RequestQueueMetadata(**file_content) - rq_path = ( - rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT - if metadata.name is None - else rq_base_path / metadata.name - ) - sequence_counter, forefront_sequence_counter = await cls._get_sequence_counters(rq_path) - if metadata.id == id: client = cls( **metadata.model_dump(), storage_dir=storage_dir, - sequence_counter=sequence_counter, - forefront_sequence_counter=forefront_sequence_counter, ) + await client._state.initialize() + await client._discover_existing_requests() await client._update_metadata(update_accessed_at=True) found = True break @@ -217,15 +226,14 @@ async def open( raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc metadata.name = name - sequence_counter, forefront_sequence_counter = await cls._get_sequence_counters(rq_path) client = cls( **metadata.model_dump(), storage_dir=storage_dir, - sequence_counter=sequence_counter, - forefront_sequence_counter=forefront_sequence_counter, ) + await client._state.initialize() + await client._discover_existing_requests() await 
client._update_metadata(update_accessed_at=True) # Otherwise, create a new dataset client. @@ -243,9 +251,8 @@ async def open( stats={}, total_request_count=0, storage_dir=storage_dir, - sequence_counter=0, - forefront_sequence_counter=0, ) + await client._state.initialize() await client._update_metadata() return client @@ -257,7 +264,9 @@ async def drop(self) -> None: if self.path_to_rq.exists(): await asyncio.to_thread(shutil.rmtree, self.path_to_rq) - self._in_progress.clear() + # Clear recoverable state + await self._state.reset() + await self._state.teardown() self._request_cache.clear() self._request_cache_needs_refresh = True @@ -272,7 +281,8 @@ async def purge(self) -> None: for file_path in request_files: await asyncio.to_thread(file_path.unlink, missing_ok=True) - self._in_progress.clear() + # Clear recoverable state + await self._state.reset() self._request_cache.clear() self._request_cache_needs_refresh = True @@ -298,6 +308,7 @@ async def add_batch_of_requests( new_pending_request_count = self._metadata.pending_request_count processed_requests = list[ProcessedRequest]() unprocessed_requests = list[UnprocessedRequest]() + state = self._state.current_value # Prepare a dictionary to track existing requests by their unique keys. existing_unique_keys: dict[str, Path] = {} @@ -321,19 +332,18 @@ async def add_batch_of_requests( if existing_request is None: request_path = self._get_request_path(request.id) - # Add sequence number to ensure FIFO ordering. + # Add sequence number to ensure FIFO ordering using state. if forefront: - sequence_number = self._forefront_sequence_counter - self._forefront_sequence_counter += 1 + sequence_number = state.forefront_sequence_counter + state.forefront_sequence_counter += 1 + state.forefront_requests[request.id] = sequence_number else: - sequence_number = self._sequence_counter - self._sequence_counter += 1 - - # Update the request data and dump it to the file. - request_dict = request.model_dump() - request_dict['__sequence'] = sequence_number - request_dict['__forefront'] = forefront - request_data = await json_dumps(request_dict) + sequence_number = state.sequence_counter + state.sequence_counter += 1 + state.regular_requests[request.id] = sequence_number + + # Save the clean request without extra fields + request_data = await json_dumps(request.model_dump()) await atomic_write(request_path, request_data) # Update the metadata counts. @@ -356,7 +366,7 @@ async def add_batch_of_requests( else: # Set the processed request flags. was_already_present = existing_request is not None - was_already_handled = existing_request.was_already_handled if existing_request else False + was_already_handled = existing_request.id in state.handled_requests # If the request is already in the RQ and handled, just continue with the next one. if was_already_present and was_already_handled: @@ -371,11 +381,21 @@ async def add_batch_of_requests( # If the request is already in the RQ but not handled yet, update it. 
elif was_already_present and not was_already_handled: - request_path = self._get_request_path(existing_request.id) - request_dict = existing_request.model_dump() - request_dict['__forefront'] = forefront - request_data = await json_dumps(request_dict) - await atomic_write(request_path, request_data) + # Update request type (forefront vs regular) in state + if forefront: + # Move from regular to forefront if needed + if existing_request.id in state.regular_requests: + state.regular_requests.pop(existing_request.id) + if existing_request.id not in state.forefront_requests: + state.forefront_requests[existing_request.id] = state.forefront_sequence_counter + state.forefront_sequence_counter += 1 + elif ( + existing_request.id not in state.forefront_requests + and existing_request.id not in state.regular_requests + ): + # Keep as regular if not already forefront + state.regular_requests[existing_request.id] = state.sequence_counter + state.sequence_counter += 1 processed_requests.append( ProcessedRequest( @@ -425,7 +445,8 @@ async def get_request(self, request_id: str) -> Request | None: logger.warning(f'Request with ID "{request_id}" not found in the queue.') return None - self._in_progress.add(request.id) + state = self._state.current_value + state.in_progress_requests.add(request.id) await self._update_metadata(update_accessed_at=True) return request @@ -437,17 +458,18 @@ async def fetch_next_request(self) -> Request | None: await self._refresh_cache() next_request: Request | None = None + state = self._state.current_value # Fetch from the front of the deque (forefront requests are at the beginning). while self._request_cache and next_request is None: candidate = self._request_cache.popleft() # Skip requests that are already in progress, however this should not happen. - if candidate.id not in self._in_progress: + if candidate.id not in state.in_progress_requests: next_request = candidate if next_request is not None: - self._in_progress.add(next_request.id) + state.in_progress_requests.add(next_request.id) return next_request @@ -455,9 +477,10 @@ async def fetch_next_request(self) -> Request | None: async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: async with self._lock: self._is_empty_cache = None + state = self._state.current_value # Check if the request is in progress. - if request.id not in self._in_progress: + if request.id not in state.in_progress_requests: logger.warning(f'Marking request {request.id} as handled that is not in progress.') return None @@ -475,8 +498,9 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | request_data = await json_dumps(request.model_dump()) await atomic_write(request_path, request_data) - # Remove from in-progress. - self._in_progress.discard(request.id) + # Update state: remove from in-progress and add to handled. + state.in_progress_requests.discard(request.id) + state.handled_requests.add(request.id) # Update RQ metadata. await self._update_metadata( @@ -502,9 +526,10 @@ async def reclaim_request( ) -> ProcessedRequest | None: async with self._lock: self._is_empty_cache = None + state = self._state.current_value # Check if the request is in progress. 
- if request.id not in self._in_progress: + if request.id not in state.in_progress_requests: logger.info(f'Reclaiming request {request.id} that is not in progress.') return None @@ -514,23 +539,26 @@ async def reclaim_request( logger.warning(f'Request file for {request.id} does not exist, cannot reclaim.') return None - # Update sequence number to ensure proper ordering. + # Update sequence number and state to ensure proper ordering. if forefront: - sequence_number = self._forefront_sequence_counter - self._forefront_sequence_counter += 1 + # Remove from regular requests if it was there + state.regular_requests.pop(request.id, None) + sequence_number = state.forefront_sequence_counter + state.forefront_sequence_counter += 1 + state.forefront_requests[request.id] = sequence_number else: - sequence_number = self._sequence_counter - self._sequence_counter += 1 + # Remove from forefront requests if it was there + state.forefront_requests.pop(request.id, None) + sequence_number = state.sequence_counter + state.sequence_counter += 1 + state.regular_requests[request.id] = sequence_number - # Dump the updated request to the file. - request_dict = request.model_dump() - request_dict['__forefront'] = forefront - request_dict['__sequence'] = sequence_number - request_data = await json_dumps(request_dict) + # Save the clean request without extra fields + request_data = await json_dumps(request.model_dump()) await atomic_write(request_path, request_data) # Remove from in-progress. - self._in_progress.discard(request.id) + state.in_progress_requests.discard(request.id) # Update RQ metadata. await self._update_metadata( @@ -558,31 +586,32 @@ async def is_empty(self) -> bool: if self._is_empty_cache is not None: return self._is_empty_cache + state = self._state.current_value + # If there are in-progress requests, return False immediately. - if len(self._in_progress) > 0: + if len(state.in_progress_requests) > 0: self._is_empty_cache = False return False # If we have a cached requests, check them first (fast path). if self._request_cache: for req in self._request_cache: - if req.handled_at is None: + if req.id not in state.handled_requests: self._is_empty_cache = False return False self._is_empty_cache = True - return len(self._in_progress) == 0 + return len(state.in_progress_requests) == 0 - # Fallback: check files on disk (slow path). + # Fallback: check state for unhandled requests. await self._update_metadata(update_accessed_at=True) - request_files = await self._get_request_files(self.path_to_rq) - for request_file in request_files: - request = await self._parse_request_file(request_file) - if request is None: - continue - if request.handled_at is None: - self._is_empty_cache = False - return False + # Check if there are any requests that are not handled + all_requests = set(state.forefront_requests.keys()) | set(state.regular_requests.keys()) + unhandled_requests = all_requests - state.handled_requests + + if unhandled_requests: + self._is_empty_cache = False + return False self._is_empty_cache = True return True @@ -655,56 +684,54 @@ async def _refresh_cache(self) -> None: prioritizing forefront requests and maintaining proper ordering. 
""" self._request_cache.clear() + state = self._state.current_value - forefront_requests = list[Request]() - regular_requests = list[Request]() + forefront_requests = list[tuple[Request, int]]() # (request, sequence) + regular_requests = list[tuple[Request, int]]() # (request, sequence) request_files = await self._get_request_files(self.path_to_rq) for request_file in request_files: request = await self._parse_request_file(request_file) - if request is None or request.was_already_handled: - continue - - if request.id in self._in_progress: + if request is None: continue - if request.model_extra is None: - logger.warning(f'Request file "{request_file}" does not contain model_extra field.') + # Skip handled requests + if request.id in state.handled_requests: continue - forefront = request.model_extra.get('__forefront') - if forefront is None: - logger.warning(f'Request file "{request_file}" does not contain "__forefront" field.') + # Skip in-progress requests + if request.id in state.in_progress_requests: continue - if forefront: - forefront_requests.append(request) + # Determine if request is forefront or regular based on state + if request.id in state.forefront_requests: + sequence = state.forefront_requests[request.id] + forefront_requests.append((request, sequence)) + elif request.id in state.regular_requests: + sequence = state.regular_requests[request.id] + regular_requests.append((request, sequence)) else: - regular_requests.append(request) + # Request not in state, skip it (might be orphaned) + logger.warning(f'Request {request.id} not found in state, skipping.') + continue # Sort forefront requests by sequence (newest first for LIFO behavior). - forefront_requests.sort( - key=lambda request: request.model_extra.get('__sequence', 0) if request.model_extra else 0, - reverse=True, - ) + forefront_requests.sort(key=lambda item: item[1], reverse=True) # Sort regular requests by sequence (oldest first for FIFO behavior). - regular_requests.sort( - key=lambda request: request.model_extra.get('__sequence', 0) if request.model_extra else 0, - reverse=False, - ) + regular_requests.sort(key=lambda item: item[1], reverse=False) # Add forefront requests to the beginning of the cache (left side). Since forefront_requests are sorted # by sequence (newest first), we need to add them in reverse order to maintain correct priority. - for request in reversed(forefront_requests): + for request, _ in reversed(forefront_requests): if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE: break self._request_cache.appendleft(request) # Add regular requests to the end of the cache (right side). - for request in regular_requests: + for request, _ in regular_requests: if len(self._request_cache) >= self._MAX_REQUESTS_IN_CACHE: break self._request_cache.append(request) @@ -768,43 +795,22 @@ async def _parse_request_file(cls, file_path: Path) -> Request | None: logger.warning(f'Failed to validate request file {file_path}: {exc!s}') return None - @classmethod - async def _get_sequence_counters(cls, path_to_rq: Path) -> tuple[int, int]: - """Get the current sequence counters for the request queue. - - Args: - path_to_rq: The path to the request queue directory. - - Returns: - A tuple containing the current sequence counter for regular requests and for forefront requests. - """ - max_sequence = -1 - max_forefront_sequence = -1 - - # Get all request files. 
- request_files = await cls._get_request_files(path_to_rq) + async def _discover_existing_requests(self) -> None: + """Discover and load existing requests into the state when opening an existing request queue.""" + request_files = await self._get_request_files(self.path_to_rq) + state = self._state.current_value for request_file in request_files: - request = await cls._parse_request_file(request_file) + request = await self._parse_request_file(request_file) if request is None: continue - # Extract sequence number and forefront flag from model_extra. - if request.model_extra: - sequence = request.model_extra.get('__sequence') - is_forefront = request.model_extra.get('__forefront') - - if sequence is None: - logger.warning(f'Request file "{request_file}" does not contain "__sequence" field.') - continue - - if is_forefront is None: - logger.warning(f'Request file "{request_file}" does not contain "__forefront" field.') - continue - - if is_forefront: - max_forefront_sequence = max(max_forefront_sequence, sequence) - else: - max_sequence = max(max_sequence, sequence) + # Add request to state as regular request (assign sequence numbers) + if request.id not in state.regular_requests and request.id not in state.forefront_requests: + # Assign as regular request with current sequence counter + state.regular_requests[request.id] = state.sequence_counter + state.sequence_counter += 1 - return max_sequence, max_forefront_sequence + # Check if request was already handled + if request.handled_at is not None: + state.handled_requests.add(request.id) diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 52413d7858..e1f737ea58 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -222,9 +222,6 @@ async def test_fetch_next_request(rq_client: FileSystemRequestQueueClient) -> No assert first_request is not None assert first_request.url == 'https://example.com/1' - # Check that it's marked as in-progress - assert first_request.id in rq_client._in_progress - # Fetch the second request second_request = await rq_client.fetch_next_request() assert second_request is not None @@ -284,9 +281,6 @@ async def test_mark_request_as_handled(rq_client: FileSystemRequestQueueClient) assert result is not None assert result.was_already_handled is True - # Verify it's no longer in-progress - assert request.id not in rq_client._in_progress - # Verify metadata was updated assert rq_client.metadata.handled_request_count == 1 assert rq_client.metadata.pending_request_count == 0 @@ -313,9 +307,6 @@ async def test_reclaim_request(rq_client: FileSystemRequestQueueClient) -> None: assert result is not None assert result.was_already_handled is False - # Verify it's no longer in-progress - assert request.id not in rq_client._in_progress - # Should be able to fetch it again reclaimed_request = await rq_client.fetch_next_request() assert reclaimed_request is not None From 437071e5942c19599c4231882da341d240c59bb9 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 20 Jun 2025 13:27:57 +0200 Subject: [PATCH 28/43] Details --- src/crawlee/_request.py | 2 +- src/crawlee/storages/_storage_instance_manager.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index 3637ca70c2..a3581e7ebf 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -158,7 +158,7 @@ class 
Request(BaseModel): ``` """ - model_config = ConfigDict(populate_by_name=True, extra='allow') + model_config = ConfigDict(populate_by_name=True) id: str """A unique identifier for the request. Note that this is not used for deduplication, and should not be confused diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 44d5f17fdd..e4cdc5587f 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -79,14 +79,12 @@ async def open_storage_instance( instance_name = getattr(instance, 'name', None) # Cache the instance - if cls not in self._cache_by_id: - self._cache_by_id[cls] = {} - if cls not in self._cache_by_name: - self._cache_by_name[cls] = {} + type_cache_by_id = self._cache_by_id.setdefault(cls, {}) + type_cache_by_name = self._cache_by_name.setdefault(cls, {}) - self._cache_by_id[cls][instance.id] = instance + type_cache_by_id[instance.id] = instance if instance_name is not None: - self._cache_by_name[cls][instance_name] = instance + type_cache_by_name[instance_name] = instance # Set as default if no id/name specified if id is None and name is None: From df4bfa78224e805c0f140c231672a9bf1463e577 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 23 Jun 2025 14:31:20 +0200 Subject: [PATCH 29/43] Rm default_"storage"_id options (were not used at all) --- docs/guides/storages.mdx | 6 ++-- src/crawlee/configuration.py | 36 ------------------- .../_playwright/test_playwright_crawler.py | 2 +- 3 files changed, 3 insertions(+), 41 deletions(-) diff --git a/docs/guides/storages.mdx b/docs/guides/storages.mdx index 37815bde59..b0c8424d48 100644 --- a/docs/guides/storages.mdx +++ b/docs/guides/storages.mdx @@ -41,21 +41,19 @@ Each storage client is responsible for maintaining the storages in a specific en The `MemoryStorageClient` is the default and currently the only one storage client in Crawlee. It stores data in memory and persists it to the local file system. The data are stored in the following directory structure: ```text -{CRAWLEE_STORAGE_DIR}/{storage_type}/{STORAGE_ID}/ +{CRAWLEE_STORAGE_DIR}/{storage_type}/{STORAGE_NAME}/ ``` where: - `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage, specified by the `CRAWLEE_STORAGE_DIR` environment variable (default: `./storage`). - `{storage_type}`: The type of storage (e.g., `datasets`, `key_value_stores`, `request_queues`). -- `{STORAGE_ID}`: The ID of the specific storage instance (default: `default`). +- `{STORAGE_NAME}`: The name of the specific storage instance (default: `default`). :::info NOTE The current `MemoryStorageClient` and its interface is quite old and not great. We plan to refactor it, together with the whole `StorageClient` interface in the near future and it better and and easier to use. We also plan to introduce new storage clients for different storage backends - e.g. for [SQLite](https://sqlite.org/). ::: -You can override default storage IDs using these environment variables: `CRAWLEE_DEFAULT_DATASET_ID`, `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`, or `CRAWLEE_DEFAULT_REQUEST_QUEUE_ID`. - ## Request queue The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition and removal of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). 
Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The `RequestQueue` is highly useful for large-scale and complex crawls. diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index e3ef39f486..cc1f10a491 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -73,42 +73,6 @@ class Configuration(BaseSettings): ] = 'INFO' """The logging level.""" - default_dataset_id: Annotated[ - str, - Field( - validation_alias=AliasChoices( - 'actor_default_dataset_id', - 'apify_default_dataset_id', - 'crawlee_default_dataset_id', - ) - ), - ] = 'default' - """The default `Dataset` ID. This option is utilized by the storage client.""" - - default_key_value_store_id: Annotated[ - str, - Field( - validation_alias=AliasChoices( - 'actor_default_key_value_store_id', - 'apify_default_key_value_store_id', - 'crawlee_default_key_value_store_id', - ) - ), - ] = 'default' - """The default `KeyValueStore` ID. This option is utilized by the storage client.""" - - default_request_queue_id: Annotated[ - str, - Field( - validation_alias=AliasChoices( - 'actor_default_request_queue_id', - 'apify_default_request_queue_id', - 'crawlee_default_request_queue_id', - ) - ), - ] = 'default' - """The default `RequestQueue` ID. This option is utilized by the storage client.""" - purge_on_start: Annotated[ bool, Field( diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index fc6fb282a9..a8d422c8e7 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -708,7 +708,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: async def test_overwrite_configuration() -> None: """Check that the configuration is allowed to be passed to the Playwrightcrawler.""" - configuration = Configuration(default_dataset_id='my_dataset_id') + configuration = Configuration(log_level='WARNING') PlaywrightCrawler(configuration=configuration) used_configuration = service_locator.get_configuration() assert used_configuration is configuration From e133fcdffa18ee0e86bdc7342440335431163dac Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 23 Jun 2025 14:36:53 +0200 Subject: [PATCH 30/43] Update storages guide and add storage clients guide --- .../cleaning_purge_explicitly_example.py | 20 ++ .../storages/dataset_basic_example.py | 2 +- .../dataset_with_crawler_explicit_example.py | 2 +- .../storages/kvs_basic_example.py | 2 +- .../kvs_with_crawler_explicit_example.py | 2 +- .../rq_with_crawler_explicit_example.py | 2 +- docs/guides/request_loaders.mdx | 4 +- docs/guides/storage_clients.mdx | 197 ++++++++++++++++++ docs/guides/storages.mdx | 96 ++++----- 9 files changed, 260 insertions(+), 67 deletions(-) create mode 100644 docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py create mode 100644 docs/guides/storage_clients.mdx diff --git a/docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py b/docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py new file mode 100644 index 0000000000..17911b79d7 --- /dev/null +++ b/docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py @@ -0,0 +1,20 @@ +import asyncio + +from crawlee.storages import Dataset + + +async def main() -> None: + # Create storage client with configuration + dataset = await Dataset.open(name='my-dataset') + + # Purge the dataset explicitly - 
purging will remove all items from the dataset. + # But keeps the dataset itself and its metadata. + await dataset.purge() + + # Or you can drop the dataset completely, which will remove the dataset + # and all its items. + await dataset.drop() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/code_examples/storages/dataset_basic_example.py b/docs/guides/code_examples/storages/dataset_basic_example.py index 9b67f36eb0..03b7581f85 100644 --- a/docs/guides/code_examples/storages/dataset_basic_example.py +++ b/docs/guides/code_examples/storages/dataset_basic_example.py @@ -6,7 +6,7 @@ async def main() -> None: # Open the dataset, if it does not exist, it will be created. # Leave name empty to use the default dataset. - dataset = await Dataset.open() + dataset = await Dataset.open(name='my-dataset') # Push a single row of data. await dataset.push_data({'foo': 'bar'}) diff --git a/docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py b/docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py index 7c6a613b8f..2b19c86994 100644 --- a/docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py +++ b/docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py @@ -7,7 +7,7 @@ async def main() -> None: # Open the dataset, if it does not exist, it will be created. # Leave name empty to use the default dataset. - dataset = await Dataset.open() + dataset = await Dataset.open(name='my-dataset') # Create a new crawler (it can be any subclass of BasicCrawler). crawler = BeautifulSoupCrawler() diff --git a/docs/guides/code_examples/storages/kvs_basic_example.py b/docs/guides/code_examples/storages/kvs_basic_example.py index 7821fa75de..9cc66c59a5 100644 --- a/docs/guides/code_examples/storages/kvs_basic_example.py +++ b/docs/guides/code_examples/storages/kvs_basic_example.py @@ -6,7 +6,7 @@ async def main() -> None: # Open the key-value store, if it does not exist, it will be created. # Leave name empty to use the default KVS. - kvs = await KeyValueStore.open() + kvs = await KeyValueStore.open(name='my-key-value-store') # Set a value associated with 'some-key'. await kvs.set_value(key='some-key', value={'foo': 'bar'}) diff --git a/docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py b/docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py index 66a921bd04..4c965457c3 100644 --- a/docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py +++ b/docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py @@ -7,7 +7,7 @@ async def main() -> None: # Open the key-value store, if it does not exist, it will be created. # Leave name empty to use the default KVS. - kvs = await KeyValueStore.open() + kvs = await KeyValueStore.open(name='my-key-value-store') # Create a new Playwright crawler. crawler = PlaywrightCrawler() diff --git a/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py b/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py index bfece2eca5..aac7b0bcb8 100644 --- a/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py +++ b/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py @@ -13,7 +13,7 @@ async def main() -> None: await request_queue.add_requests(['https://apify.com/', 'https://crawlee.dev/']) # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request - # list as request manager to it. It will be managed by the crawler. 
+ # queue as request manager to it. It will be managed by the crawler. crawler = HttpCrawler(request_manager=request_queue) # Define the default request handler, which will be called for every request. diff --git a/docs/guides/request_loaders.mdx b/docs/guides/request_loaders.mdx index 8816f2a388..289d7c07ff 100644 --- a/docs/guides/request_loaders.mdx +++ b/docs/guides/request_loaders.mdx @@ -42,7 +42,7 @@ classDiagram %% Abstract classes %% ======================== -class BaseStorage { +class Storage { <> + id + name @@ -92,7 +92,7 @@ class RequestManagerTandem { %% Inheritance arrows %% ======================== -BaseStorage <|-- RequestQueue +Storage <|-- RequestQueue RequestManager <|-- RequestQueue RequestLoader <|-- RequestManager diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx new file mode 100644 index 0000000000..24cec03976 --- /dev/null +++ b/docs/guides/storage_clients.mdx @@ -0,0 +1,197 @@ +--- +id: storage-clients +title: Storage clients +description: How to work with storage clients in Crawlee, including the built-in clients and how to create your own. +--- + +import ApiLink from '@site/src/components/ApiLink'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +Storage clients in Crawlee are subclasses of `StorageClient`. They handle interactions with different storage backends. For instance: + +- `MemoryStorageClient`: Stores data purely in memory with no persistence. +- `FileSystemStorageClient`: Provides persistent file system storage with in-memory caching for better performance. +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com). Apify storage client is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). + +Each storage client is responsible for maintaining the storages in a specific environment. This abstraction makes it easier to switch between different environments, e.g. between local development and cloud production setup. + +Storage clients provide a unified interface for interacting with `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying storage implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. + +## Built-in storage clients + +Crawlee Python currently provides two main storage client implementations: + +### Memory storage client + +The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. + +```python +from crawlee.storage_clients import MemoryStorageClient +from crawlee.crawlers import ParselCrawler + +# Create memory storage client. +storage_client = MemoryStorageClient() + +# Or pass it directly to the crawler. +crawler = ParselCrawler(storage_client=storage_client) +``` + +The `MemoryStorageClient` is a good choice for testing, development, or short-lived operations where speed is more important than data persistence. It is not suitable for production use or long-running crawls, as all data will be lost when the program exits. + +### File system storage client + +The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. 
It uses smart caching and batch processing for better performance while storing data in human-readable JSON format. + +This storage client is ideal for large datasets, and long-running operations where data persistence is required. Data can be easily inspected and shared with other tools. + +```python +from crawlee.storage_clients import FileSystemStorageClient +from crawlee.crawlers import ParselCrawler + +# Create file system storage client. +storage_client = FileSystemStorageClient() + +# Or pass it directly to the crawler. +crawler = ParselCrawler(storage_client=storage_client) +``` + +Configuration options for the `FileSystemStorageClient` can be set through environment variables or the `Configuration` class. + - **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`): The root directory for all storage data. + - **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`): Whether to purge default storages on start. + +Data are stored using the following directory structure: + +```text +{CRAWLEE_STORAGE_DIR}/ +├── datasets/ +│ └── {DATASET_NAME}/ +│ ├── __metadata__.json +│ ├── 000000001.json +│ └── 000000002.json +├── key_value_stores/ +│ └── {KVS_NAME}/ +│ ├── __metadata__.json +│ ├── key1.json +│ ├── key2.txt +│ └── key3.json +└── request_queues/ + └── {RQ_NAME}/ + ├── __metadata__.json + ├── {REQUEST_ID_1}.json + └── {REQUEST_ID_2}.json +``` + +Where: +- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage +- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}`: The unique names for each storage instance (defaults to `"default"`) +- Files are stored directly without additional metadata files for simpler structure + +```python +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient +from crawlee.crawlers import ParselCrawler + +configuration = Configuration( + storage_dir='./my_storage', + purge_on_start=False, +) +storage_client = FileSystemStorageClient(configuration=configuration) +crawler = ParselCrawler(storage_client=storage_client) +``` + +:::warning Concurrency limitation +The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. +::: + +## Creating a custom storage client + +A custom storage client consists of two parts: the storage client factory and individual storage type clients. The `StorageClient` acts as a factory that creates specific clients (`DatasetClient`, `KeyValueStoreClient`, `RequestQueueClient`) where the actual storage logic is implemented. + +```python +# First, implement the specific storage clients by subclassing the abstract base classes: + +from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient + +class CustomDatasetClient(DatasetClient): + # Implement all abstract methods for dataset operations. + pass + +class CustomKeyValueStoreClient(KeyValueStoreClient): + # Implement all abstract methods for key-value store operations. + pass + +class CustomRequestQueueClient(RequestQueueClient): + # Implement all abstract methods for request queue operations. 
+ pass + +# Then implement the storage client that provides these specific clients: + +from crawlee.storage_clients import StorageClient +from crawlee.configuration import Configuration + +class CustomStorageClient(StorageClient): + async def create_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomDatasetClient: + # Create an instance of custom dataset client and return it. + pass + + async def create_kvs_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomKeyValueStoreClient: + # Create an instance of custom key-value store client and return it. + pass + + async def create_rq_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomRequestQueueClient: + # Create an instance of custom request queue client and return it. + pass +``` + +Custom storage clients can implement any storage logic, such as connecting to a database, using a cloud storage service, or integrating with other systems. They must implement the required methods for creating, reading, updating, and deleting data in the respective storages. + +## Registering storage clients + +Custom storage clients can be registered with the `ServiceLocator` or passed directly to the crawler or specific storage. This allows you to use your custom storage implementation seamlessly with Crawlee's abstractions. + +```python +from crawlee.storage_clients import CustomStorageClient +from crawlee.service_locator import service_locator +from crawlee.crawlers import ParselCrawler +from crawlee.storages import Dataset + +# Create custom storage client. +storage_client = CustomStorageClient() +storage_client = CustomStorageClient() + +# Register it either with the service locator. +service_locator.set_storage_client(storage_client) + +# Or pass it directly to the crawler. +crawler = ParselCrawler(storage_client=storage_client) + +# Or just provide it when opening a storage (e.g. dataset). +dataset = await Dataset.open( + name='my_dataset', + storage_client=storage_client, +) +``` + +## Conclusion + +Storage clients in Crawlee provide different backends for storages. Use `MemoryStorageClient` for testing and fast operations without persistence, or `FileSystemStorageClient` for environments where data needs to persist. You can also create custom storage clients for specialized backends by implementing the `StorageClient` interface. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
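To connect the directory layout above with actual code, here is a brief sketch that writes a single record through the file-system client; the store name and key are illustrative, and it assumes `KeyValueStore.open` accepts `storage_client` the same way `Dataset.open` does in the examples above.

```python
import asyncio

from crawlee.storage_clients import FileSystemStorageClient
from crawlee.storages import KeyValueStore


async def main() -> None:
    # Open a named key-value store backed by the file system client.
    kvs = await KeyValueStore.open(
        name='my-kvs',
        storage_client=FileSystemStorageClient(),
    )

    # With the default CRAWLEE_STORAGE_DIR, this record should land at
    # ./storage/key_value_stores/my-kvs/greeting.json (next to __metadata__.json).
    await kvs.set_value(key='greeting', value={'hello': 'world'})


if __name__ == '__main__':
    asyncio.run(main())
```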
diff --git a/docs/guides/storages.mdx b/docs/guides/storages.mdx index b0c8424d48..22626e7143 100644 --- a/docs/guides/storages.mdx +++ b/docs/guides/storages.mdx @@ -17,57 +17,27 @@ import RqHelperEnqueueLinksExample from '!!raw-loader!roa-loader!./code_examples import DatasetBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_basic_example.py'; import DatasetWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_example.py'; -import DatasetWithCrawerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_explicit_example.py'; +import DatasetWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/dataset_with_crawler_explicit_example.py'; import KvsBasicExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_basic_example.py'; import KvsWithCrawlerExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_example.py'; import KvsWithCrawlerExplicitExample from '!!raw-loader!roa-loader!./code_examples/storages/kvs_with_crawler_explicit_example.py'; import CleaningDoNotPurgeExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_do_not_purge_example.py'; +import CleaningPurgeExplicitlyExample from '!!raw-loader!roa-loader!./code_examples/storages/cleaning_purge_explicitly_example.py'; -Crawlee offers multiple storage types for managing and persisting your crawling data. Request-oriented storages, such as the `RequestQueue`, help you store and deduplicate URLs, while result-oriented storages, like `Dataset` and `KeyValueStore`, focus on storing and retrieving scraping results. This guide helps you choose the storage type that suits your needs. +Crawlee offers several storage types for managing and persisting your crawling data. Request-oriented storages, such as the `RequestQueue`, help you store and deduplicate URLs, while result-oriented storages, like `Dataset` and `KeyValueStore`, focus on storing and retrieving scraping results. This guide helps you choose the storage type that suits your needs. -## Storage clients +Crawlee's storage system consists of two main layers: +- **Storages** (`Dataset`, `KeyValueStore`, `RequestQueue`): High-level interfaces for interacting with different storage types. +- **Storage clients** (`MemoryStorageClient`, `FileSystemStorageClient`, etc.): Backend implementations that handle the actual data persistence and management. -Storage clients in Crawlee are subclasses of `StorageClient`. They handle interactions with different storage backends. For instance: - -- `MemoryStorageClient`: Stores data in memory and persists it to the local file system. -- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com). Apify storage client is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). - -Each storage client is responsible for maintaining the storages in a specific environment. This abstraction makes it easier to switch between different environments, e.g. between local development and cloud production setup. - -### Memory storage client - -The `MemoryStorageClient` is the default and currently the only one storage client in Crawlee. It stores data in memory and persists it to the local file system. 
The data are stored in the following directory structure: - -```text -{CRAWLEE_STORAGE_DIR}/{storage_type}/{STORAGE_NAME}/ -``` - -where: - -- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage, specified by the `CRAWLEE_STORAGE_DIR` environment variable (default: `./storage`). -- `{storage_type}`: The type of storage (e.g., `datasets`, `key_value_stores`, `request_queues`). -- `{STORAGE_NAME}`: The name of the specific storage instance (default: `default`). - -:::info NOTE -The current `MemoryStorageClient` and its interface is quite old and not great. We plan to refactor it, together with the whole `StorageClient` interface in the near future and it better and and easier to use. We also plan to introduce new storage clients for different storage backends - e.g. for [SQLite](https://sqlite.org/). -::: +For more information about storage clients and their configuration, see the [Storage clients guide](./storage-clients). ## Request queue The `RequestQueue` is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition and removal of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The `RequestQueue` is highly useful for large-scale and complex crawls. -By default, data are stored using the following path structure: - -```text -{CRAWLEE_STORAGE_DIR}/request_queues/{QUEUE_ID}/{INDEX}.json -``` - -- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data, specified by the environment variable. -- `{QUEUE_ID}`: The ID of the request queue, "default" by default. -- `{INDEX}`: Represents the zero-based index of the record within the queue. - The following code demonstrates the usage of the `RequestQueue`: @@ -120,15 +90,6 @@ For a detailed explanation of the `RequestMan The `Dataset` is designed for storing structured data, where each entry has a consistent set of attributes, such as products in an online store or real estate listings. Think of a `Dataset` as a table: each entry corresponds to a row, with attributes represented as columns. Datasets are append-only, allowing you to add new records but not modify or delete existing ones. Every Crawlee project run is associated with a default dataset, typically used to store results specific to that crawler execution. However, using this dataset is optional. -By default, data are stored using the following path structure: - -```text -{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json -``` -- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable. -- `{DATASET_ID}`: The dataset's ID, "default" by default. -- `{INDEX}`: Represents the zero-based index of the record within the dataset. - The following code demonstrates basic operations of the dataset: @@ -144,7 +105,7 @@ The following code demonstrates basic operations of the dataset: - {DatasetWithCrawerExplicitExample} + {DatasetWithCrawlerExplicitExample} @@ -159,16 +120,6 @@ Crawlee provides the following helper function to simplify interactions with the The `KeyValueStore` is designed to save and retrieve data records or files efficiently. Each record is uniquely identified by a key and is associated with a specific MIME type, making the `KeyValueStore` ideal for tasks like saving web page screenshots, PDFs, or tracking the state of crawlers. 
-By default, data are stored using the following path structure: - -```text -{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT} -``` -- `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable. -- `{STORE_ID}`: The KVS's ID, "default" by default. -- `{KEY}`: The unique key for the record. -- `{EXT}`: The file extension corresponding to the MIME type of the content. - The following code demonstrates the usage of the `KeyValueStore`: @@ -199,14 +150,39 @@ Crawlee provides the following helper function to simplify interactions with the ## Cleaning up the storages -Default storages are purged before the crawler starts, unless explicitly configured otherwise. For that case, see `Configuration.purge_on_start`. This cleanup happens as soon as a storage is accessed, either when you open a storage (e.g. using `RequestQueue.open`, `Dataset.open`, `KeyValueStore.open`) or when interacting with a storage through one of the helper functions (e.g. `push_data`), which implicitly opens the result storage. +By default, Crawlee automatically cleans up **default storages** before each crawler run to ensure a clean state. This behavior is controlled by the `Configuration.purge_on_start` setting (default: `True`). + +### What gets purged + +- **Default storages** are completely removed and recreated at the start of each run, ensuring that you start with a clean slate. +- **Named storages** are never automatically purged and persist across runs. +- The behavior depends on the storage client implementation. + +### When purging happens + +The cleanup occurs as soon as a storage is accessed: +- When opening a storage explicitly (e.g., `RequestQueue.open`, `Dataset.open`, `KeyValueStore.open`). +- When using helper functions that implicitly open storages (e.g., `push_data`). +- Automatically when `BasicCrawler.run` is invoked. + +### Disabling automatic purging + +To disable automatic purging, set `purge_on_start=False` in your configuration: {CleaningDoNotPurgeExample} -If you do not explicitly interact with storages in your code, the purging will occur automatically when the `BasicCrawler.run` method is invoked. +### Manual purging + +Purge on start behavior just triggers the storage's `purge` method, which removes all data from the storage. If you want to purge the storage manually, you can do so by calling the `purge` method on the storage instance. Or if you want to delete the storage completely, you can call the `drop` method on the storage instance, which will remove the storage, including metadata and all its data. + + + {CleaningPurgeExplicitlyExample} + + +Note that purging behavior may vary between storage client implementations. For more details on storage configuration and client implementations, see the [Storage clients guide](./storage-clients). ## Conclusion -This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests and store and retrieve scraping results using the `RequestQueue`, `Dataset`, and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run and how to purge them explicitly. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
+This guide introduced you to the different storage types available in Crawlee and how to interact with them. You learned how to manage requests using the `RequestQueue` and store and retrieve scraping results using the `Dataset` and `KeyValueStore`. You also discovered how to use helper functions to simplify interactions with these storages. Finally, you learned how to clean up storages before starting a crawler run. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! From 76f1ffb7f5630442d04d2d31f0c1717ca6684461 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 24 Jun 2025 10:38:58 +0200 Subject: [PATCH 31/43] Docs guides - code examples --- .../custom_storage_client_example.py | 66 +++++++++ ...ile_system_storage_client_basic_example.py | 8 ++ ...em_storage_client_configuration_example.py | 16 +++ .../memory_storage_client_basic_example.py | 8 ++ .../registering_storage_client_example.py | 21 +++ docs/guides/storage_clients.mdx | 128 +++--------------- 6 files changed, 140 insertions(+), 107 deletions(-) create mode 100644 docs/guides/code_examples/storage_clients/custom_storage_client_example.py create mode 100644 docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py create mode 100644 docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py create mode 100644 docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py create mode 100644 docs/guides/code_examples/storage_clients/registering_storage_client_example.py diff --git a/docs/guides/code_examples/storage_clients/custom_storage_client_example.py b/docs/guides/code_examples/storage_clients/custom_storage_client_example.py new file mode 100644 index 0000000000..c42a5a58a7 --- /dev/null +++ b/docs/guides/code_examples/storage_clients/custom_storage_client_example.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee.storage_clients import StorageClient +from crawlee.storage_clients._base import ( + DatasetClient, + KeyValueStoreClient, + RequestQueueClient, +) + +if TYPE_CHECKING: + from crawlee.configuration import Configuration + + +# Implement the storage type clients with your backend logic. + + +class CustomDatasetClient(DatasetClient): + # Implement abstract methods like get_items, add_item, drop, purge, etc. + ... + + +class CustomKeyValueStoreClient(KeyValueStoreClient): + # Implement abstract methods like get_value, set_value, delete, etc. + ... + + +class CustomRequestQueueClient(RequestQueueClient): + # Implement abstract methods like add_request, fetch_next_request, etc. + ... + + +# Implement the storage client factory. + + +class CustomStorageClient(StorageClient): + async def create_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomDatasetClient: + # Create and return your custom dataset client. + ... + + async def create_kvs_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomKeyValueStoreClient: + # Create and return your custom key-value store client. + ... 
+ + async def create_rq_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> CustomRequestQueueClient: + # Create and return your custom request queue client. + ... diff --git a/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py b/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py new file mode 100644 index 0000000000..0501f83f5b --- /dev/null +++ b/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py @@ -0,0 +1,8 @@ +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import FileSystemStorageClient + +# Create file system storage client. +storage_client = FileSystemStorageClient() + +# Or pass it directly to the crawler. +crawler = ParselCrawler(storage_client=storage_client) diff --git a/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py b/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py new file mode 100644 index 0000000000..5235b496dc --- /dev/null +++ b/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py @@ -0,0 +1,16 @@ +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import FileSystemStorageClient + +# Create configuration with custom settings. +configuration = Configuration( + storage_dir='./my_storage', + purge_on_start=False, +) + +storage_client = FileSystemStorageClient() + +crawler = ParselCrawler( + storage_client=storage_client, + configuration=configuration, +) diff --git a/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py b/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py new file mode 100644 index 0000000000..9a9aeedeb5 --- /dev/null +++ b/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py @@ -0,0 +1,8 @@ +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient + +# Create memory storage client. +storage_client = MemoryStorageClient() + +# Or pass it directly to the crawler. +crawler = ParselCrawler(storage_client=storage_client) diff --git a/docs/guides/code_examples/storage_clients/registering_storage_client_example.py b/docs/guides/code_examples/storage_clients/registering_storage_client_example.py new file mode 100644 index 0000000000..99f0cf37ab --- /dev/null +++ b/docs/guides/code_examples/storage_clients/registering_storage_client_example.py @@ -0,0 +1,21 @@ +from crawlee._service_locator import service_locator +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + +# Create custom storage client (using MemoryStorageClient as example). +storage_client = MemoryStorageClient() + +# Register it either with the service locator. +service_locator.set_storage_client(storage_client) + +# Or pass it directly to the crawler. +crawler = ParselCrawler(storage_client=storage_client) + + +# Or just provide it when opening a storage (e.g. dataset). 
+async def example_usage() -> None: + dataset = await Dataset.open( + name='my_dataset', + storage_client=storage_client, + ) diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx index 24cec03976..6e27acdbdb 100644 --- a/docs/guides/storage_clients.mdx +++ b/docs/guides/storage_clients.mdx @@ -9,6 +9,12 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py'; +import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py'; +import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py'; +import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py'; +import RegisteringStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_client_example.py'; + Storage clients in Crawlee are subclasses of `StorageClient`. They handle interactions with different storage backends. For instance: - `MemoryStorageClient`: Stores data purely in memory with no persistence. @@ -27,16 +33,9 @@ Crawlee Python currently provides two main storage client implementations: The `MemoryStorageClient` stores all data in memory using Python data structures. It provides fast access but does not persist data between runs, meaning all data is lost when the program terminates. -```python -from crawlee.storage_clients import MemoryStorageClient -from crawlee.crawlers import ParselCrawler - -# Create memory storage client. -storage_client = MemoryStorageClient() - -# Or pass it directly to the crawler. -crawler = ParselCrawler(storage_client=storage_client) -``` + +{MemoryStorageClientBasicExample} + The `MemoryStorageClient` is a good choice for testing, development, or short-lived operations where speed is more important than data persistence. It is not suitable for production use or long-running crawls, as all data will be lost when the program exits. @@ -46,16 +45,9 @@ The `FileSystemStorageClient` +{FileSystemStorageClientBasicExample} + Configuration options for the `FileSystemStorageClient` can be set through environment variables or the `Configuration` class. - **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`): The root directory for all storage data. @@ -88,18 +80,9 @@ Where: - `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}`: The unique names for each storage instance (defaults to `"default"`) - Files are stored directly without additional metadata files for simpler structure -```python -from crawlee.configuration import Configuration -from crawlee.storage_clients import FileSystemStorageClient -from crawlee.crawlers import ParselCrawler - -configuration = Configuration( - storage_dir='./my_storage', - purge_on_start=False, -) -storage_client = FileSystemStorageClient(configuration=configuration) -crawler = ParselCrawler(storage_client=storage_client) -``` + +{FileSystemStorageClientConfigurationExample} + :::warning Concurrency limitation The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. 
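The `storage_dir` and `purge_on_start` options above can also be supplied through their environment variables instead of a `Configuration` instance. A rough sketch, assuming the variables are set before any `Configuration` object is created (the values are illustrative):

```python
import os

from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import FileSystemStorageClient

# Usually exported in the shell; set here via os.environ only for illustration.
# They must be in place before the configuration is first read.
os.environ['CRAWLEE_STORAGE_DIR'] = './my_storage'
os.environ['CRAWLEE_PURGE_ON_START'] = 'false'

crawler = ParselCrawler(storage_client=FileSystemStorageClient())
```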
@@ -109,59 +92,9 @@ The `FileSystemStorageClient` is not safe for concurrent access from multiple cr A custom storage client consists of two parts: the storage client factory and individual storage type clients. The `StorageClient` acts as a factory that creates specific clients (`DatasetClient`, `KeyValueStoreClient`, `RequestQueueClient`) where the actual storage logic is implemented. -```python -# First, implement the specific storage clients by subclassing the abstract base classes: - -from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient - -class CustomDatasetClient(DatasetClient): - # Implement all abstract methods for dataset operations. - pass - -class CustomKeyValueStoreClient(KeyValueStoreClient): - # Implement all abstract methods for key-value store operations. - pass - -class CustomRequestQueueClient(RequestQueueClient): - # Implement all abstract methods for request queue operations. - pass - -# Then implement the storage client that provides these specific clients: - -from crawlee.storage_clients import StorageClient -from crawlee.configuration import Configuration - -class CustomStorageClient(StorageClient): - async def create_dataset_client( - self, - *, - id: str | None = None, - name: str | None = None, - configuration: Configuration | None = None, - ) -> CustomDatasetClient: - # Create an instance of custom dataset client and return it. - pass - - async def create_kvs_client( - self, - *, - id: str | None = None, - name: str | None = None, - configuration: Configuration | None = None, - ) -> CustomKeyValueStoreClient: - # Create an instance of custom key-value store client and return it. - pass - - async def create_rq_client( - self, - *, - id: str | None = None, - name: str | None = None, - configuration: Configuration | None = None, - ) -> CustomRequestQueueClient: - # Create an instance of custom request queue client and return it. - pass -``` + +{CustomStorageClientExample} + Custom storage clients can implement any storage logic, such as connecting to a database, using a cloud storage service, or integrating with other systems. They must implement the required methods for creating, reading, updating, and deleting data in the respective storages. @@ -169,28 +102,9 @@ Custom storage clients can implement any storage logic, such as connecting to a Custom storage clients can be registered with the `ServiceLocator` or passed directly to the crawler or specific storage. This allows you to use your custom storage implementation seamlessly with Crawlee's abstractions. -```python -from crawlee.storage_clients import CustomStorageClient -from crawlee.service_locator import service_locator -from crawlee.crawlers import ParselCrawler -from crawlee.storages import Dataset - -# Create custom storage client. -storage_client = CustomStorageClient() -storage_client = CustomStorageClient() - -# Register it either with the service locator. -service_locator.set_storage_client(storage_client) - -# Or pass it directly to the crawler. -crawler = ParselCrawler(storage_client=storage_client) - -# Or just provide it when opening a storage (e.g. dataset). 
-dataset = await Dataset.open( - name='my_dataset', - storage_client=storage_client, -) -``` + +{RegisteringStorageClientExample} + ## Conclusion From fa4864497b11be2de1a52cd2cf585632cbe6ff78 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 24 Jun 2025 13:57:57 +0200 Subject: [PATCH 32/43] Docs guides polishment --- .../custom_storage_client_example.py | 27 +++----- ...ile_system_storage_client_basic_example.py | 4 +- ...em_storage_client_configuration_example.py | 8 ++- .../memory_storage_client_basic_example.py | 4 +- .../registering_storage_client_example.py | 24 ++++--- docs/guides/storage_clients.mdx | 62 ++++++++++++++----- 6 files changed, 81 insertions(+), 48 deletions(-) diff --git a/docs/guides/code_examples/storage_clients/custom_storage_client_example.py b/docs/guides/code_examples/storage_clients/custom_storage_client_example.py index c42a5a58a7..6ba8526c24 100644 --- a/docs/guides/code_examples/storage_clients/custom_storage_client_example.py +++ b/docs/guides/code_examples/storage_clients/custom_storage_client_example.py @@ -1,7 +1,4 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - +from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import ( DatasetClient, @@ -9,26 +6,22 @@ RequestQueueClient, ) -if TYPE_CHECKING: - from crawlee.configuration import Configuration - - # Implement the storage type clients with your backend logic. class CustomDatasetClient(DatasetClient): - # Implement abstract methods like get_items, add_item, drop, purge, etc. - ... + # Implement methods like push_data, get_data, iterate_items, etc. + pass class CustomKeyValueStoreClient(KeyValueStoreClient): - # Implement abstract methods like get_value, set_value, delete, etc. - ... + # Implement methods like get_value, set_value, delete, etc. + pass class CustomRequestQueueClient(RequestQueueClient): - # Implement abstract methods like add_request, fetch_next_request, etc. - ... + # Implement methods like add_request, fetch_next_request, etc. + pass # Implement the storage client factory. @@ -43,7 +36,7 @@ async def create_dataset_client( configuration: Configuration | None = None, ) -> CustomDatasetClient: # Create and return your custom dataset client. - ... + pass async def create_kvs_client( self, @@ -53,7 +46,7 @@ async def create_kvs_client( configuration: Configuration | None = None, ) -> CustomKeyValueStoreClient: # Create and return your custom key-value store client. - ... + pass async def create_rq_client( self, @@ -63,4 +56,4 @@ async def create_rq_client( configuration: Configuration | None = None, ) -> CustomRequestQueueClient: # Create and return your custom request queue client. - ... + pass diff --git a/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py b/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py index 0501f83f5b..62969f8024 100644 --- a/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py +++ b/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py @@ -1,8 +1,8 @@ from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import FileSystemStorageClient -# Create file system storage client. +# Create a new instance of storage client. storage_client = FileSystemStorageClient() -# Or pass it directly to the crawler. +# And pass it to the crawler. 
crawler = ParselCrawler(storage_client=storage_client) diff --git a/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py b/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py index 5235b496dc..1d3507660f 100644 --- a/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py +++ b/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py @@ -2,14 +2,16 @@ from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import FileSystemStorageClient -# Create configuration with custom settings. +# Create a new instance of storage client. +storage_client = FileSystemStorageClient() + +# Create a configuration with custom settings. configuration = Configuration( storage_dir='./my_storage', purge_on_start=False, ) -storage_client = FileSystemStorageClient() - +# And pass them to the crawler. crawler = ParselCrawler( storage_client=storage_client, configuration=configuration, diff --git a/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py b/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py index 9a9aeedeb5..fe79edc3f4 100644 --- a/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py +++ b/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py @@ -1,8 +1,8 @@ from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import MemoryStorageClient -# Create memory storage client. +# Create a new instance of storage client. storage_client = MemoryStorageClient() -# Or pass it directly to the crawler. +# And pass it to the crawler. crawler = ParselCrawler(storage_client=storage_client) diff --git a/docs/guides/code_examples/storage_clients/registering_storage_client_example.py b/docs/guides/code_examples/storage_clients/registering_storage_client_example.py index 99f0cf37ab..f91cb3574d 100644 --- a/docs/guides/code_examples/storage_clients/registering_storage_client_example.py +++ b/docs/guides/code_examples/storage_clients/registering_storage_client_example.py @@ -1,21 +1,29 @@ +import asyncio + from crawlee._service_locator import service_locator from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset -# Create custom storage client (using MemoryStorageClient as example). -storage_client = MemoryStorageClient() -# Register it either with the service locator. -service_locator.set_storage_client(storage_client) +async def main() -> None: + # Create custom storage client, MemoryStorageClient for example. + storage_client = MemoryStorageClient() -# Or pass it directly to the crawler. -crawler = ParselCrawler(storage_client=storage_client) + # Register it globally via the service locator. + service_locator.set_storage_client(storage_client) + # Or pass it directly to the crawler, it will be registered globally + # to the service locator under the hood. + crawler = ParselCrawler(storage_client=storage_client) -# Or just provide it when opening a storage (e.g. dataset). -async def example_usage() -> None: + # Or just provide it when opening a storage (e.g. dataset), it will be used + # for this storage only, not globally. 
dataset = await Dataset.open( name='my_dataset', storage_client=storage_client, ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx index 6e27acdbdb..09674f5f1e 100644 --- a/docs/guides/storage_clients.mdx +++ b/docs/guides/storage_clients.mdx @@ -13,13 +13,12 @@ import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_exam import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py'; import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py'; import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py'; -import RegisteringStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_client_example.py'; Storage clients in Crawlee are subclasses of `StorageClient`. They handle interactions with different storage backends. For instance: - `MemoryStorageClient`: Stores data purely in memory with no persistence. - `FileSystemStorageClient`: Provides persistent file system storage with in-memory caching for better performance. -- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com). Apify storage client is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). +- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient): Manages storage on the [Apify platform](https://apify.com). Apify storage client is implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python). You will find more information about it in the [Apify SDK documentation](https://docs.apify.com/sdk/python/docs/overview/introduction). Each storage client is responsible for maintaining the storages in a specific environment. This abstraction makes it easier to switch between different environments, e.g. between local development and cloud production setup. @@ -27,7 +26,7 @@ Storage clients provide a unified interface for interacting with `MemoryStorageClient` stor The `MemoryStorageClient` is a good choice for testing, development, or short-lived operations where speed is more important than data persistence. It is not suitable for production use or long-running crawls, as all data will be lost when the program exits. +:::warning Persistence limitation +The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates. +::: + ### File system storage client -The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. It uses smart caching and batch processing for better performance while storing data in human-readable JSON format. +The `FileSystemStorageClient` provides persistent storage by writing data directly to the file system. It uses smart caching and batch processing for better performance while storing data in human-readable JSON format. This is a default storage client used by Crawlee when no other storage client is specified. + +:::warning Concurrency limitation +The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. 
+::: This storage client is ideal for large datasets, and long-running operations where data persistence is required. Data can be easily inspected and shared with other tools. @@ -76,21 +83,21 @@ Data are stored using the following directory structure: ``` Where: -- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage -- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}`: The unique names for each storage instance (defaults to `"default"`) -- Files are stored directly without additional metadata files for simpler structure +- `{CRAWLEE_STORAGE_DIR}`: The root directory for local storage. +- `{DATASET_NAME}`, `{KVS_NAME}`, `{RQ_NAME}`: The unique names for each storage instance (defaults to `"default"`). +- Files are stored directly without additional metadata files for simpler structure. + +Here is an example of how to configure the `FileSystemStorageClient`: {FileSystemStorageClientConfigurationExample} -:::warning Concurrency limitation -The `FileSystemStorageClient` is not safe for concurrent access from multiple crawler processes. Use it only when running a single crawler process at a time. -::: - ## Creating a custom storage client -A custom storage client consists of two parts: the storage client factory and individual storage type clients. The `StorageClient` acts as a factory that creates specific clients (`DatasetClient`, `KeyValueStoreClient`, `RequestQueueClient`) where the actual storage logic is implemented. +A storage client consists of two parts: the storage client factory and individual storage type clients. The `StorageClient` acts as a factory that creates specific clients (`DatasetClient`, `KeyValueStoreClient`, `RequestQueueClient`) where the actual storage logic is implemented. + +Here is an example of a custom storage client that implements the `StorageClient` interface: {CustomStorageClientExample} @@ -100,11 +107,34 @@ Custom storage clients can implement any storage logic, such as connecting to a ## Registering storage clients -Custom storage clients can be registered with the `ServiceLocator` or passed directly to the crawler or specific storage. This allows you to use your custom storage implementation seamlessly with Crawlee's abstractions. +Storage clients can be registered either: +- Globally, with the `ServiceLocator` or passed directly to the crawler; +- Or storage specific, when opening a storage instance like `Dataset`, `KeyValueStore`, or `RequestQueue`. - -{RegisteringStorageClientExample} - +```python +from crawlee.storage_clients import CustomStorageClient +from crawlee.service_locator import service_locator +from crawlee.crawlers import ParselCrawler +from crawlee.storages import Dataset + +# Create custom storage client. +storage_client = CustomStorageClient() +storage_client = CustomStorageClient() + +# Register it either with the service locator. +service_locator.set_storage_client(storage_client) + +# Or pass it directly to the crawler. +crawler = ParselCrawler(storage_client=storage_client) + +# Or just provide it when opening a storage (e.g. dataset). +dataset = await Dataset.open( + name='my_dataset', + storage_client=storage_client, +) +``` + +You can also register a different storage client for each storage instance, allowing you to use different backends for different storages. This is useful when you want to use for example a fast in-memory storage for `RequestQueue` while persisting scraping results for `Dataset` or `KeyValueStore`. 
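A short sketch of that mixed setup, pairing an in-memory request queue with a file-system-backed dataset; the names are illustrative, and it assumes `RequestQueue.open` accepts `storage_client` just like `Dataset.open` does.

```python
import asyncio

from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient
from crawlee.storages import Dataset, RequestQueue


async def main() -> None:
    # URLs only need to live for the duration of a single run.
    queue = await RequestQueue.open(storage_client=MemoryStorageClient())

    # Scraped results should survive the run, so they go to disk.
    results = await Dataset.open(
        name='results',
        storage_client=FileSystemStorageClient(),
    )

    await queue.add_requests(['https://crawlee.dev/'])
    await results.push_data({'url': 'https://crawlee.dev/'})


if __name__ == '__main__':
    asyncio.run(main())
```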
## Conclusion From 5c935af14ff82d954e22ba8a7b98fd59829676f0 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 24 Jun 2025 14:01:21 +0200 Subject: [PATCH 33/43] docs fix lint & type checks for py 3.9 --- .../storage_clients/custom_storage_client_example.py | 8 +++++++- pyproject.toml | 7 +++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/guides/code_examples/storage_clients/custom_storage_client_example.py b/docs/guides/code_examples/storage_clients/custom_storage_client_example.py index 6ba8526c24..271b83d811 100644 --- a/docs/guides/code_examples/storage_clients/custom_storage_client_example.py +++ b/docs/guides/code_examples/storage_clients/custom_storage_client_example.py @@ -1,4 +1,7 @@ -from crawlee.configuration import Configuration +from __future__ import annotations + +from typing import TYPE_CHECKING + from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import ( DatasetClient, @@ -6,6 +9,9 @@ RequestQueueClient, ) +if TYPE_CHECKING: + from crawlee.configuration import Configuration + # Implement the storage type clients with your backend logic. diff --git a/pyproject.toml b/pyproject.toml index 49ab8221a4..db65c9b3ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -218,7 +218,10 @@ markers = [ [tool.mypy] python_version = "3.9" plugins = ["pydantic.mypy"] -exclude = ["src/crawlee/project_template"] +exclude = [ + "src/crawlee/project_template", + "docs/guides/code_examples/storage_clients/custom_storage_client_example.py", +] files = ["src", "tests", "docs", "website"] check_untyped_defs = true disallow_incomplete_defs = true @@ -254,7 +257,7 @@ ignore_missing_imports = true [[tool.mypy.overrides]] module = [ - "running_in_web_server.*" # False positive when fastapi not available + "running_in_web_server.*", # False positive when fastapi not available ] disable_error_code = ["misc"] From ac259ce50194f9094eb1aded07e3a9dd74edbb74 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 24 Jun 2025 15:01:39 +0200 Subject: [PATCH 34/43] Address Honza's feedback --- docs/guides/storage_clients.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx index 09674f5f1e..6175eb2785 100644 --- a/docs/guides/storage_clients.mdx +++ b/docs/guides/storage_clients.mdx @@ -36,7 +36,7 @@ The `MemoryStorageClient` stor {MemoryStorageClientBasicExample} -The `MemoryStorageClient` is a good choice for testing, development, or short-lived operations where speed is more important than data persistence. It is not suitable for production use or long-running crawls, as all data will be lost when the program exits. +The `MemoryStorageClient` is a good choice for testing, development, short-lived operations where speed is more important than data persistence, or HTTP APIs where each request should be handled with a fresh storage. It is not suitable for production use or long-running crawls, as all data will be lost when the program exits. :::warning Persistence limitation The `MemoryStorageClient` does not persist data between runs. All data is lost when the program terminates. 
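A minimal sketch of the HTTP-API use case mentioned in this change, assuming the usual `ParselCrawler` handler API; the wiring below is illustrative rather than part of the patch. Each call builds a crawler with its own fresh `MemoryStorageClient`, so nothing is shared or persisted between requests:

```python
from __future__ import annotations

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.storage_clients import MemoryStorageClient


async def scrape(url: str) -> list[dict]:
    # A fresh in-memory storage client per call; all data is discarded afterwards.
    crawler = ParselCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        })

    await crawler.run([url])
    data = await crawler.get_data()
    return data.items
```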
From 1cbf15e13af882c864b87f8ed48252bcb3747993 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 25 Jun 2025 14:02:40 +0200 Subject: [PATCH 35/43] SDK fixes --- src/crawlee/_utils/file.py | 4 ++-- src/crawlee/storage_clients/models.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index 4199cc27f9..c7190b739a 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -77,8 +77,8 @@ def infer_mime_type(value: Any) -> str: if isinstance(value, (dict, list)): return 'application/json; charset=utf-8' - # If the value is a string, assume plain text. - if isinstance(value, str): + # If the value is a string, number or boolean, assume plain text. + if isinstance(value, (str, int, float, bool)): return 'text/plain; charset=utf-8' # Default fallback. diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index 17067f2a1d..3cb5b67b7a 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -95,7 +95,7 @@ class KeyValueStoreRecordMetadata(BaseModel): Describe the format and type of data stored in the record, following the MIME specification. """ - size: Annotated[int, Field(alias='size')] + size: Annotated[int | None, Field(alias='size', default=None)] = None """The size of the record in bytes.""" From bc50990dd09eb5c2b66783b2fa62a8bc689a7737 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 26 Jun 2025 15:44:34 +0200 Subject: [PATCH 36/43] Add KVS record_exists method --- .../_base/_key_value_store_client.py | 13 ++ .../_file_system/_key_value_store_client.py | 20 ++ .../_memory/_key_value_store_client.py | 5 + src/crawlee/storages/_key_value_store.py | 11 + .../_file_system/test_fs_kvs_client.py | 193 ++++++++++++++++++ .../_memory/test_memory_kvs_client.py | 70 +++++++ tests/unit/storages/test_key_value_store.py | 113 ++++++++++ 7 files changed, 425 insertions(+) diff --git a/src/crawlee/storage_clients/_base/_key_value_store_client.py b/src/crawlee/storage_clients/_base/_key_value_store_client.py index 013830932b..df56bc1ac8 100644 --- a/src/crawlee/storage_clients/_base/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_base/_key_value_store_client.py @@ -115,3 +115,16 @@ async def get_public_url(self, *, key: str) -> str: The backend method for the `KeyValueStore.get_public_url` call. """ + + @abstractmethod + async def record_exists(self, *, key: str) -> bool: + """Check if a record with the given key exists in the key-value store. + + The backend method for the `KeyValueStore.record_exists` call. + + Args: + key: The key to check for existence. + + Returns: + True if a record with the given key exists, False otherwise. + """ diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 8464772fb5..3c16f796a9 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -427,6 +427,26 @@ async def get_public_url(self, *, key: str) -> str: absolute_path = record_path.absolute() return absolute_path.as_uri() + @override + async def record_exists(self, *, key: str) -> bool: + """Check if a record with the given key exists in the key-value store. + + Args: + key: The key to check for existence. + + Returns: + True if a record with the given key exists, False otherwise. 
+ """ + # Update the metadata to record access + async with self._lock: + await self._update_metadata(update_accessed_at=True) + + record_path = self.path_to_kvs / self._encode_key(key) + record_metadata_filepath = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + + # Both the value file and metadata file must exist for a record to be considered existing + return record_path.exists() and record_metadata_filepath.exists() + async def _update_metadata( self, *, diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index 39e3f326e1..8e68c25e81 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -152,6 +152,11 @@ async def iterate_keys( async def get_public_url(self, *, key: str) -> str: raise NotImplementedError('Public URLs are not supported for memory key-value stores.') + @override + async def record_exists(self, *, key: str) -> bool: + await self._update_metadata(update_accessed_at=True) + return key in self._records + async def _update_metadata( self, *, diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 9a9974c20e..8838f39c2b 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -222,6 +222,17 @@ async def list_keys( ) ] + async def record_exists(self, key: str) -> bool: + """Check if a record with the given key exists in the key-value store. + + Args: + key: Key of the record to check for existence. + + Returns: + True if a record with the given key exists, False otherwise. + """ + return await self._client.record_exists(key=key) + async def get_public_url(self, key: str) -> str: """Get the public URL for the given key. 
diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index 765059d305..eba8edc416 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -414,3 +414,196 @@ async def set_value(key: str, value: str) -> None: record = await kvs_client.get_value(key=key) assert record is not None assert record.value == f'value-{i}' + + +async def test_record_exists_nonexistent_key(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that record_exists returns False for nonexistent key.""" + assert await kvs_client.record_exists(key='nonexistent-key') is False + + +async def test_record_exists_after_set_dict(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test record_exists returns True after setting a dict value.""" + key = 'dict-key' + value = {'data': 'test'} + + # Initially should not exist + assert await kvs_client.record_exists(key=key) is False + + # Set the value and check existence + await kvs_client.set_value(key=key, value=value) + assert await kvs_client.record_exists(key=key) is True + + # Also verify we can retrieve the value + record = await kvs_client.get_value(key=key) + assert record is not None + assert record.value == value + + # Verify the actual files exist on disk + encoded_key = urllib.parse.quote(key, safe='') + record_path = kvs_client.path_to_kvs / encoded_key + metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + assert record_path.exists() + assert metadata_path.exists() + + +async def test_record_exists_after_set_string(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test record_exists returns True after setting a string value.""" + key = 'string-key' + value = 'test string' + + # Initially should not exist + assert await kvs_client.record_exists(key=key) is False + + # Set the value and check existence + await kvs_client.set_value(key=key, value=value) + assert await kvs_client.record_exists(key=key) is True + + # Also verify we can retrieve the value + record = await kvs_client.get_value(key=key) + assert record is not None + assert record.value == value + + # Verify the actual files exist on disk + encoded_key = urllib.parse.quote(key, safe='') + record_path = kvs_client.path_to_kvs / encoded_key + metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + assert record_path.exists() + assert metadata_path.exists() + + +async def test_record_exists_after_set_none(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test record_exists returns True after setting None value.""" + key = 'none-key' + value = None + + # Initially should not exist + assert await kvs_client.record_exists(key=key) is False + + # Set the value and check existence + await kvs_client.set_value(key=key, value=value) + assert await kvs_client.record_exists(key=key) is True + + # Also verify we can retrieve the value + record = await kvs_client.get_value(key=key) + assert record is not None + assert record.value == value + + # Verify the actual files exist on disk + encoded_key = urllib.parse.quote(key, safe='') + record_path = kvs_client.path_to_kvs / encoded_key + metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + assert record_path.exists() + assert metadata_path.exists() + + +async def test_record_exists_after_set_int(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test record_exists returns True after 
setting an int value.""" + key = 'int-key' + value = 42 + + # Initially should not exist + assert await kvs_client.record_exists(key=key) is False + + # Set the value and check existence + await kvs_client.set_value(key=key, value=value) + assert await kvs_client.record_exists(key=key) is True + + # Also verify we can retrieve the value + record = await kvs_client.get_value(key=key) + assert record is not None + # For file system storage, non-JSON scalar values get converted to strings + assert record.value == str(value) + + # Verify the actual files exist on disk + encoded_key = urllib.parse.quote(key, safe='') + record_path = kvs_client.path_to_kvs / encoded_key + metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + assert record_path.exists() + assert metadata_path.exists() + + +async def test_record_exists_after_delete(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test record_exists returns False after deleting a value.""" + key = 'delete-key' + value = 'will be deleted' + + # Initially should not exist + assert await kvs_client.record_exists(key=key) is False + + # Set the value first + await kvs_client.set_value(key=key, value=value) + assert await kvs_client.record_exists(key=key) is True + + # Then delete it + await kvs_client.delete_value(key=key) + assert await kvs_client.record_exists(key=key) is False + + # Verify the actual files are gone from disk + encoded_key = urllib.parse.quote(key, safe='') + record_path = kvs_client.path_to_kvs / encoded_key + metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + assert not record_path.exists() + assert not metadata_path.exists() + + +async def test_record_exists_none_value_distinction(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that record_exists can distinguish between None value and nonexistent key.""" + test_key = 'none-value-key' + + # Set None as value + await kvs_client.set_value(key=test_key, value=None) + + # Should still exist even though value is None + assert await kvs_client.record_exists(key=test_key) is True + + # Verify we can distinguish between None value and nonexistent key + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.value is None + assert await kvs_client.record_exists(key=test_key) is True + assert await kvs_client.record_exists(key='truly-nonexistent') is False + + +async def test_record_exists_only_value_file(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that record_exists returns False if only value file exists without metadata.""" + test_key = 'only-value-file-key' + + # Manually create only the value file without metadata + encoded_key = urllib.parse.quote(test_key, safe='') + record_path = kvs_client.path_to_kvs / encoded_key + record_path.parent.mkdir(parents=True, exist_ok=True) + record_path.write_text('orphaned value') + + # Should return False because metadata file is missing + assert await kvs_client.record_exists(key=test_key) is False + + +async def test_record_exists_only_metadata_file(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that record_exists returns False if only metadata file exists without value.""" + test_key = 'only-metadata-file-key' + + # Manually create only the metadata file without value + encoded_key = urllib.parse.quote(test_key, safe='') + record_path = kvs_client.path_to_kvs / encoded_key + metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') + + record_path.parent.mkdir(parents=True, 
exist_ok=True) + metadata_path.write_text('{"key":"test","content_type":"text/plain","size":10}') + + # Should return False because value file is missing + assert await kvs_client.record_exists(key=test_key) is False + + +async def test_record_exists_updates_metadata(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that record_exists updates the accessed_at timestamp.""" + # Record initial timestamp + initial_accessed = kvs_client.metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Check if record exists (should update accessed_at) + await kvs_client.record_exists(key='any-key') + + # Verify timestamp was updated + assert kvs_client.metadata.accessed_at > initial_accessed diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py index ee699d4230..d3c57df869 100644 --- a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -241,3 +241,73 @@ async def test_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None: assert kvs_client.metadata.created_at == initial_created assert kvs_client.metadata.modified_at > initial_modified assert kvs_client.metadata.accessed_at > accessed_after_get + + +async def test_record_exists_nonexistent(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that record_exists returns False for a nonexistent key.""" + result = await kvs_client.record_exists(key='nonexistent-key') + assert result is False + + +async def test_record_exists_after_set(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that record_exists returns True after setting a value.""" + test_key = 'exists-key' + test_value = {'data': 'test'} + + # Initially should not exist + assert await kvs_client.record_exists(key=test_key) is False + + # Set the value + await kvs_client.set_value(key=test_key, value=test_value) + + # Now should exist + assert await kvs_client.record_exists(key=test_key) is True + + +async def test_record_exists_after_delete(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that record_exists returns False after deleting a value.""" + test_key = 'exists-then-delete-key' + test_value = 'will be deleted' + + # Set a value + await kvs_client.set_value(key=test_key, value=test_value) + assert await kvs_client.record_exists(key=test_key) is True + + # Delete the value + await kvs_client.delete_value(key=test_key) + + # Should no longer exist + assert await kvs_client.record_exists(key=test_key) is False + + +async def test_record_exists_with_none_value(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that record_exists returns True even when value is None.""" + test_key = 'none-value-key' + + # Set None as value + await kvs_client.set_value(key=test_key, value=None) + + # Should still exist even though value is None + assert await kvs_client.record_exists(key=test_key) is True + + # Verify we can distinguish between None value and nonexistent key + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.value is None + assert await kvs_client.record_exists(key=test_key) is True + assert await kvs_client.record_exists(key='truly-nonexistent') is False + + +async def test_record_exists_updates_metadata(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that record_exists updates the accessed_at timestamp.""" + # Record initial timestamp + initial_accessed = 
kvs_client.metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Check if record exists (should update accessed_at) + await kvs_client.record_exists(key='any-key') + + # Verify timestamp was updated + assert kvs_client.metadata.accessed_at > initial_accessed diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 5a52cedb64..25bbcb4fc0 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -485,3 +485,116 @@ async def test_purge( # Clean up await kvs.drop() + + +async def test_record_exists_nonexistent(kvs: KeyValueStore) -> None: + """Test that record_exists returns False for a nonexistent key.""" + result = await kvs.record_exists('nonexistent-key') + assert result is False + + +async def test_record_exists_after_set(kvs: KeyValueStore) -> None: + """Test that record_exists returns True after setting a value.""" + test_key = 'exists-key' + test_value = {'data': 'test'} + + # Initially should not exist + assert await kvs.record_exists(test_key) is False + + # Set the value + await kvs.set_value(test_key, test_value) + + # Now should exist + assert await kvs.record_exists(test_key) is True + + +async def test_record_exists_after_delete(kvs: KeyValueStore) -> None: + """Test that record_exists returns False after deleting a value.""" + test_key = 'exists-then-delete-key' + test_value = 'will be deleted' + + # Set a value + await kvs.set_value(test_key, test_value) + assert await kvs.record_exists(test_key) is True + + # Delete the value + await kvs.delete_value(test_key) + + # Should no longer exist + assert await kvs.record_exists(test_key) is False + + +async def test_record_exists_with_none_value(kvs: KeyValueStore) -> None: + """Test that record_exists returns True even when value is None.""" + test_key = 'none-value-key' + + # Set None as value + await kvs.set_value(test_key, None) + + # Should still exist even though value is None + assert await kvs.record_exists(test_key) is True + + # Verify we can distinguish between None value and nonexistent key + assert await kvs.get_value(test_key) is None + assert await kvs.record_exists(test_key) is True + assert await kvs.record_exists('truly-nonexistent') is False + + +async def test_record_exists_different_content_types(kvs: KeyValueStore) -> None: + """Test record_exists with different content types.""" + test_cases = [ + ('json-key', {'data': 'json'}, 'application/json'), + ('text-key', 'plain text', 'text/plain'), + ('binary-key', b'binary data', 'application/octet-stream'), + ] + + for key, value, content_type in test_cases: + # Set value with specific content type + await kvs.set_value(key, value, content_type=content_type) + + # Should exist regardless of content type + assert await kvs.record_exists(key) is True + + +async def test_record_exists_multiple_keys(kvs: KeyValueStore) -> None: + """Test record_exists with multiple keys and batch operations.""" + keys_and_values = [ + ('key1', 'value1'), + ('key2', {'nested': 'object'}), + ('key3', [1, 2, 3]), + ('key4', None), + ] + + # Initially, none should exist + for key, _ in keys_and_values: + assert await kvs.record_exists(key) is False + + # Set all values + for key, value in keys_and_values: + await kvs.set_value(key, value) + + # All should exist now + for key, _ in keys_and_values: + assert await kvs.record_exists(key) is True + + # Test some non-existent keys + assert await kvs.record_exists('nonexistent1') is False + 
assert await kvs.record_exists('nonexistent2') is False + + +async def test_record_exists_after_purge(kvs: KeyValueStore) -> None: + """Test that record_exists returns False after purging the store.""" + # Set some values + await kvs.set_value('key1', 'value1') + await kvs.set_value('key2', 'value2') + + # Verify they exist + assert await kvs.record_exists('key1') is True + assert await kvs.record_exists('key2') is True + + # Purge the store + await kvs.purge() + + # Should no longer exist + assert await kvs.record_exists('key1') is False + assert await kvs.record_exists('key2') is False From d1cf967712862fdbca83cc893e8f6f81413c2d38 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 26 Jun 2025 17:02:59 +0200 Subject: [PATCH 37/43] reduce test duplicities for storages & storage clients --- .../_file_system/test_fs_dataset_client.py | 291 ++-------- .../_file_system/test_fs_kvs_client.py | 529 +++--------------- .../_file_system/test_fs_rq_client.py | 460 +++------------ .../_memory/test_memory_dataset_client.py | 227 +------- .../_memory/test_memory_kvs_client.py | 264 +-------- .../_memory/test_memory_rq_client.py | 400 +------------ 6 files changed, 237 insertions(+), 1934 deletions(-) diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py index 450a9073e8..0f0ca0fe19 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -2,7 +2,6 @@ import asyncio import json -from datetime import datetime from pathlib import Path from typing import TYPE_CHECKING @@ -11,12 +10,12 @@ from crawlee._consts import METADATA_FILENAME from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient -from crawlee.storage_clients._file_system import FileSystemDatasetClient -from crawlee.storage_clients.models import DatasetItemsListPage if TYPE_CHECKING: from collections.abc import AsyncGenerator + from crawlee.storage_clients._file_system import FileSystemDatasetClient + @pytest.fixture def configuration(tmp_path: Path) -> Configuration: @@ -36,280 +35,57 @@ async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSys await client.drop() -async def test_open_creates_new_dataset(configuration: Configuration) -> None: - """Test that open() creates a new dataset with proper metadata when it doesn't exist.""" +async def test_file_and_directory_creation(configuration: Configuration) -> None: + """Test that file system dataset creates proper files and directories.""" client = await FileSystemStorageClient().create_dataset_client( name='new_dataset', configuration=configuration, ) - # Verify correct client type and properties - assert isinstance(client, FileSystemDatasetClient) - assert client.metadata.id is not None - assert client.metadata.name == 'new_dataset' - assert client.metadata.item_count == 0 - assert isinstance(client.metadata.created_at, datetime) - assert isinstance(client.metadata.accessed_at, datetime) - assert isinstance(client.metadata.modified_at, datetime) - # Verify files were created assert client.path_to_dataset.exists() assert client.path_to_metadata.exists() - # Verify metadata content + # Verify metadata file structure with client.path_to_metadata.open() as f: metadata = json.load(f) assert metadata['id'] == client.metadata.id assert metadata['name'] == 'new_dataset' assert metadata['item_count'] == 0 - -async def 
test_open_dataset_by_id(configuration: Configuration) -> None: - """Test opening a dataset by ID after creating it by name.""" - storage_client = FileSystemStorageClient() - - # First create a dataset by name - original_client = await storage_client.create_dataset_client( - name='open-by-id-test', - configuration=configuration, - ) - - # Get the ID from the created client - dataset_id = original_client.metadata.id - - # Add some data to verify it persists - await original_client.push_data({'test_item': 'test_value'}) - - # Now try to open the same dataset using just the ID - reopened_client = await storage_client.create_dataset_client( - id=dataset_id, - configuration=configuration, - ) - - # Verify it's the same dataset - assert reopened_client.metadata.id == dataset_id - assert reopened_client.metadata.name == 'open-by-id-test' - - # Verify the data is still there - data = await reopened_client.get_data() - assert len(data.items) == 1 - assert data.items[0]['test_item'] == 'test_value' - - # Clean up - await reopened_client.drop() - - -async def test_dataset_client_purge_on_start(configuration: Configuration) -> None: - """Test that purge_on_start=True clears existing data in the dataset.""" - configuration.purge_on_start = True - - # Create dataset and add data - dataset_client1 = await FileSystemStorageClient().create_dataset_client( - configuration=configuration, - ) - await dataset_client1.push_data({'item': 'initial data'}) - - # Verify data was added - items = await dataset_client1.get_data() - assert len(items.items) == 1 - - # Reopen - dataset_client2 = await FileSystemStorageClient().create_dataset_client( - configuration=configuration, - ) - - # Verify data was purged - items = await dataset_client2.get_data() - assert len(items.items) == 0 - - -async def test_dataset_client_no_purge_on_start(configuration: Configuration) -> None: - """Test that purge_on_start=False keeps existing data in the dataset.""" - configuration.purge_on_start = False - - # Create dataset and add data - dataset_client1 = await FileSystemStorageClient().create_dataset_client( - name='test-no-purge-dataset', - configuration=configuration, - ) - await dataset_client1.push_data({'item': 'preserved data'}) - - # Reopen - dataset_client2 = await FileSystemStorageClient().create_dataset_client( - name='test-no-purge-dataset', - configuration=configuration, - ) - - # Verify data was preserved - items = await dataset_client2.get_data() - assert len(items.items) == 1 - assert items.items[0]['item'] == 'preserved data' + await client.drop() -async def test_push_data_single_item(dataset_client: FileSystemDatasetClient) -> None: - """Test pushing a single item to the dataset.""" +async def test_file_persistence_and_content_verification(dataset_client: FileSystemDatasetClient) -> None: + """Test that data is properly persisted to files with correct content.""" item = {'key': 'value', 'number': 42} await dataset_client.push_data(item) - # Verify item count was updated - assert dataset_client.metadata.item_count == 1 - + # Verify files are created on disk all_files = list(dataset_client.path_to_dataset.glob('*.json')) assert len(all_files) == 2 # 1 data file + 1 metadata file - # Verify item was persisted + # Verify actual file content data_files = [item for item in all_files if item.name != METADATA_FILENAME] assert len(data_files) == 1 - # Verify file content with Path(data_files[0]).open() as f: saved_item = json.load(f) assert saved_item == item - -async def test_push_data_multiple_items(dataset_client: 
FileSystemDatasetClient) -> None: - """Test pushing multiple items to the dataset.""" + # Test multiple items file creation items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}] await dataset_client.push_data(items) - # Verify item count was updated - assert dataset_client.metadata.item_count == 3 - all_files = list(dataset_client.path_to_dataset.glob('*.json')) - assert len(all_files) == 4 # 3 data files + 1 metadata file + assert len(all_files) == 5 # 4 data files + 1 metadata file - # Verify items were saved to files data_files = [f for f in all_files if f.name != METADATA_FILENAME] - assert len(data_files) == 3 - - -async def test_get_data_empty_dataset(dataset_client: FileSystemDatasetClient) -> None: - """Test getting data from an empty dataset returns empty list.""" - result = await dataset_client.get_data() + assert len(data_files) == 4 # Original item + 3 new items - assert isinstance(result, DatasetItemsListPage) - assert result.count == 0 - assert result.total == 0 - assert result.items == [] - -async def test_get_data_with_items(dataset_client: FileSystemDatasetClient) -> None: - """Test getting data from a dataset returns all items in order with correct properties.""" - # Add some items - items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}] - await dataset_client.push_data(items) - - # Get all items - result = await dataset_client.get_data() - - assert result.count == 3 - assert result.total == 3 - assert len(result.items) == 3 - assert result.items[0]['id'] == 1 - assert result.items[1]['id'] == 2 - assert result.items[2]['id'] == 3 - - -async def test_get_data_with_pagination(dataset_client: FileSystemDatasetClient) -> None: - """Test getting data with offset and limit parameters for pagination implementation.""" - # Add some items - items = [{'id': i} for i in range(1, 11)] # 10 items - await dataset_client.push_data(items) - - # Test offset - result = await dataset_client.get_data(offset=3) - assert result.count == 7 - assert result.offset == 3 - assert result.items[0]['id'] == 4 - - # Test limit - result = await dataset_client.get_data(limit=5) - assert result.count == 5 - assert result.limit == 5 - assert result.items[-1]['id'] == 5 - - # Test both offset and limit - result = await dataset_client.get_data(offset=2, limit=3) - assert result.count == 3 - assert result.offset == 2 - assert result.limit == 3 - assert result.items[0]['id'] == 3 - assert result.items[-1]['id'] == 5 - - -async def test_get_data_descending_order(dataset_client: FileSystemDatasetClient) -> None: - """Test getting data in descending order reverses the item order.""" - # Add some items - items = [{'id': i} for i in range(1, 6)] # 5 items - await dataset_client.push_data(items) - - # Get items in descending order - result = await dataset_client.get_data(desc=True) - - assert result.desc is True - assert result.items[0]['id'] == 5 - assert result.items[-1]['id'] == 1 - - -async def test_get_data_skip_empty(dataset_client: FileSystemDatasetClient) -> None: - """Test getting data with skip_empty option filters out empty items when True.""" - # Add some items including an empty one - items = [ - {'id': 1, 'name': 'Item 1'}, - {}, # Empty item - {'id': 3, 'name': 'Item 3'}, - ] - await dataset_client.push_data(items) - - # Get all items - result = await dataset_client.get_data() - assert result.count == 3 - - # Get non-empty items - result = await dataset_client.get_data(skip_empty=True) - assert result.count == 2 - 
assert all(item != {} for item in result.items) - - -async def test_iterate(dataset_client: FileSystemDatasetClient) -> None: - """Test iterating over dataset items yields each item in the original order.""" - # Add some items - items = [{'id': i} for i in range(1, 6)] # 5 items - await dataset_client.push_data(items) - - # Iterate over all items - collected_items = [item async for item in dataset_client.iterate_items()] - - assert len(collected_items) == 5 - assert collected_items[0]['id'] == 1 - assert collected_items[-1]['id'] == 5 - - -async def test_iterate_with_options(dataset_client: FileSystemDatasetClient) -> None: - """Test iterating with offset, limit and desc parameters works the same as with get_data().""" - # Add some items - items = [{'id': i} for i in range(1, 11)] # 10 items - await dataset_client.push_data(items) - - # Test with offset and limit - collected_items = [item async for item in dataset_client.iterate_items(offset=3, limit=3)] - - assert len(collected_items) == 3 - assert collected_items[0]['id'] == 4 - assert collected_items[-1]['id'] == 6 - - # Test with descending order - collected_items = [] - async for item in dataset_client.iterate_items(desc=True, limit=3): - collected_items.append(item) - - assert len(collected_items) == 3 - assert collected_items[0]['id'] == 10 - assert collected_items[-1]['id'] == 8 - - -async def test_drop(dataset_client: FileSystemDatasetClient) -> None: - """Test dropping a dataset removes the entire dataset directory from disk.""" +async def test_drop_removes_files_from_disk(dataset_client: FileSystemDatasetClient) -> None: + """Test that dropping a dataset removes the entire dataset directory from disk.""" await dataset_client.push_data({'test': 'data'}) assert dataset_client.path_to_dataset.exists() @@ -320,8 +96,8 @@ async def test_drop(dataset_client: FileSystemDatasetClient) -> None: assert not dataset_client.path_to_dataset.exists() -async def test_metadata_updates(dataset_client: FileSystemDatasetClient) -> None: - """Test that metadata timestamps are updated correctly after read and write operations.""" +async def test_metadata_file_updates(dataset_client: FileSystemDatasetClient) -> None: + """Test that metadata file is updated correctly after operations.""" # Record initial timestamps initial_created = dataset_client.metadata.created_at initial_accessed = dataset_client.metadata.accessed_at @@ -350,3 +126,36 @@ async def test_metadata_updates(dataset_client: FileSystemDatasetClient) -> None assert dataset_client.metadata.created_at == initial_created assert dataset_client.metadata.modified_at > initial_modified assert dataset_client.metadata.accessed_at > accessed_after_get + + # Verify metadata file is updated on disk + with dataset_client.path_to_metadata.open() as f: + metadata = json.load(f) + assert metadata['item_count'] == 1 + + +async def test_data_persistence_across_reopens(configuration: Configuration) -> None: + """Test that data persists correctly when reopening the same dataset.""" + storage_client = FileSystemStorageClient() + + # Create dataset and add data + original_client = await storage_client.create_dataset_client( + name='persistence-test', + configuration=configuration, + ) + + test_data = {'test_item': 'test_value', 'id': 123} + await original_client.push_data(test_data) + + dataset_id = original_client.metadata.id + + # Reopen by ID and verify data persists + reopened_client = await storage_client.create_dataset_client( + id=dataset_id, + configuration=configuration, + ) + + data = await 
reopened_client.get_data() + assert len(data.items) == 1 + assert data.items[0] == test_data + + await reopened_client.drop() diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index eba8edc416..0c36258ccc 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -2,8 +2,6 @@ import asyncio import json -import urllib.parse -from datetime import datetime from typing import TYPE_CHECKING import pytest @@ -11,12 +9,13 @@ from crawlee._consts import METADATA_FILENAME from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient -from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient if TYPE_CHECKING: from collections.abc import AsyncGenerator from pathlib import Path + from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient + @pytest.fixture def configuration(tmp_path: Path) -> Configuration: @@ -36,123 +35,33 @@ async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemK await client.drop() -async def test_open_creates_new_kvs(configuration: Configuration) -> None: - """Test that open() creates a new key-value store with proper metadata and files on disk.""" +async def test_file_and_directory_creation(configuration: Configuration) -> None: + """Test that file system KVS creates proper files and directories.""" client = await FileSystemStorageClient().create_kvs_client( name='new_kvs', configuration=configuration, ) - # Verify correct client type and properties - assert isinstance(client, FileSystemKeyValueStoreClient) - assert client.metadata.id is not None - assert client.metadata.name == 'new_kvs' - assert isinstance(client.metadata.created_at, datetime) - assert isinstance(client.metadata.accessed_at, datetime) - assert isinstance(client.metadata.modified_at, datetime) - # Verify files were created assert client.path_to_kvs.exists() assert client.path_to_metadata.exists() - # Verify metadata content + # Verify metadata file structure with client.path_to_metadata.open() as f: metadata = json.load(f) assert metadata['id'] == client.metadata.id assert metadata['name'] == 'new_kvs' - -async def test_open_kvs_by_id(configuration: Configuration) -> None: - """Test opening a key-value store by ID after creating it by name.""" - storage_client = FileSystemStorageClient() - - # First create a key-value store by name - original_client = await storage_client.create_kvs_client( - name='open-by-id-test', - configuration=configuration, - ) - - # Get the ID from the created client - kvs_id = original_client.metadata.id - - # Add some data to verify it persists - await original_client.set_value(key='test-key', value='test-value') - - # Now try to open the same key-value store using just the ID - reopened_client = await storage_client.create_kvs_client( - id=kvs_id, - configuration=configuration, - ) - - # Verify it's the same key-value store - assert reopened_client.metadata.id == kvs_id - assert reopened_client.metadata.name == 'open-by-id-test' - - # Verify the data is still there - record = await reopened_client.get_value(key='test-key') - assert record is not None - assert record.value == 'test-value' - - # Clean up - await reopened_client.drop() - - -async def test_kvs_client_purge_on_start(configuration: Configuration) -> None: - """Test that purge_on_start=True clears existing data in the key-value store.""" - 
configuration.purge_on_start = True - - # Create KVS and add data - kvs_client1 = await FileSystemStorageClient().create_kvs_client( - configuration=configuration, - ) - await kvs_client1.set_value(key='test-key', value='initial value') - - # Verify value was set - record = await kvs_client1.get_value(key='test-key') - assert record is not None - assert record.value == 'initial value' - - # Reopen - kvs_client2 = await FileSystemStorageClient().create_kvs_client( - configuration=configuration, - ) - - # Verify value was purged - record = await kvs_client2.get_value(key='test-key') - assert record is None - - -async def test_kvs_client_no_purge_on_start(configuration: Configuration) -> None: - """Test that purge_on_start=False keeps existing data in the key-value store.""" - configuration.purge_on_start = False - - # Create KVS and add data - kvs_client1 = await FileSystemStorageClient().create_kvs_client( - name='test-no-purge-kvs', - configuration=configuration, - ) - await kvs_client1.set_value(key='test-key', value='preserved value') - - # Reopen - kvs_client2 = await FileSystemStorageClient().create_kvs_client( - name='test-no-purge-kvs', - configuration=configuration, - ) - - # Verify value was preserved - record = await kvs_client2.get_value(key='test-key') - assert record is not None - assert record.value == 'preserved value' + await client.drop() -async def test_set_get_value_string(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test setting and getting a string value with correct file creation and metadata.""" - # Set a value +async def test_value_file_creation_and_content(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that values are properly persisted to files with correct content and metadata.""" test_key = 'test-key' test_value = 'Hello, world!' 
await kvs_client.set_value(key=test_key, value=test_value) - # Check if the file was created + # Check if the files were created key_path = kvs_client.path_to_kvs / test_key key_metadata_path = kvs_client.path_to_kvs / f'{test_key}.{METADATA_FILENAME}' assert key_path.exists() @@ -162,99 +71,57 @@ async def test_set_get_value_string(kvs_client: FileSystemKeyValueStoreClient) - content = key_path.read_text(encoding='utf-8') assert content == test_value - # Check record metadata + # Check record metadata file with key_metadata_path.open() as f: metadata = json.load(f) assert metadata['key'] == test_key assert metadata['content_type'] == 'text/plain; charset=utf-8' assert metadata['size'] == len(test_value.encode('utf-8')) - # Get the value - record = await kvs_client.get_value(key=test_key) - assert record is not None - assert record.key == test_key - assert record.value == test_value - assert record.content_type == 'text/plain; charset=utf-8' - assert record.size == len(test_value.encode('utf-8')) - - -async def test_set_get_value_json(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test setting and getting a JSON value with correct serialization and deserialization.""" - # Set a value - test_key = 'test-json' - test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]} - await kvs_client.set_value(key=test_key, value=test_value) - # Get the value - record = await kvs_client.get_value(key=test_key) - assert record is not None - assert record.key == test_key - assert record.value == test_value - assert 'application/json' in record.content_type - - -async def test_set_get_value_bytes(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test setting and getting binary data without corruption and with correct content type.""" - # Set a value +async def test_binary_data_persistence(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that binary data is stored correctly without corruption.""" test_key = 'test-binary' test_value = b'\x00\x01\x02\x03\x04' await kvs_client.set_value(key=test_key, value=test_value) - # Get the value - record = await kvs_client.get_value(key=test_key) - assert record is not None - assert record.key == test_key - assert record.value == test_value - assert record.content_type == 'application/octet-stream' - assert record.size == len(test_value) - - -async def test_set_value_explicit_content_type(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that an explicitly provided content type overrides the automatically inferred one.""" - test_key = 'test-explicit-content-type' - test_value = 'Hello, world!' 
- explicit_content_type = 'text/html; charset=utf-8' + # Verify binary file exists + key_path = kvs_client.path_to_kvs / test_key + assert key_path.exists() - await kvs_client.set_value(key=test_key, value=test_value, content_type=explicit_content_type) + # Verify binary content is preserved + content = key_path.read_bytes() + assert content == test_value + # Verify retrieval works correctly record = await kvs_client.get_value(key=test_key) assert record is not None - assert record.content_type == explicit_content_type - - -async def test_get_nonexistent_value(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that attempting to get a non-existent key returns None.""" - record = await kvs_client.get_value(key='nonexistent-key') - assert record is None - - -async def test_overwrite_value(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that an existing value can be overwritten and the updated value is retrieved correctly.""" - test_key = 'test-overwrite' + assert record.value == test_value + assert record.content_type == 'application/octet-stream' - # Set initial value - initial_value = 'Initial value' - await kvs_client.set_value(key=test_key, value=initial_value) - # Overwrite with new value - new_value = 'New value' - await kvs_client.set_value(key=test_key, value=new_value) +async def test_json_serialization_to_file(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that JSON objects are properly serialized to files.""" + test_key = 'test-json' + test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]} + await kvs_client.set_value(key=test_key, value=test_value) - # Verify the updated value - record = await kvs_client.get_value(key=test_key) - assert record is not None - assert record.value == new_value + # Check if file content is valid JSON + key_path = kvs_client.path_to_kvs / test_key + with key_path.open() as f: + file_content = json.load(f) + assert file_content == test_value -async def test_delete_value(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that deleting a value removes its files from disk and makes it irretrievable.""" +async def test_file_deletion_on_value_delete(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that deleting a value removes its files from disk.""" test_key = 'test-delete' test_value = 'Delete me' # Set a value await kvs_client.set_value(key=test_key, value=test_value) - # Verify it exists + # Verify files exist key_path = kvs_client.path_to_kvs / test_key metadata_path = kvs_client.path_to_kvs / f'{test_key}.{METADATA_FILENAME}' assert key_path.exists() @@ -267,66 +134,8 @@ async def test_delete_value(kvs_client: FileSystemKeyValueStoreClient) -> None: assert not key_path.exists() assert not metadata_path.exists() - # Verify value is no longer retrievable - record = await kvs_client.get_value(key=test_key) - assert record is None - - -async def test_delete_nonexistent_value(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that attempting to delete a non-existent key is a no-op and doesn't raise errors.""" - # Should not raise an error - await kvs_client.delete_value(key='nonexistent-key') - - -async def test_iterate_keys_empty_store(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that iterating over an empty store yields no keys.""" - keys = [key async for key in kvs_client.iterate_keys()] - assert len(keys) == 0 - - -async def test_iterate_keys(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that all keys can be iterated over and are returned in 
sorted order.""" - # Add some values - await kvs_client.set_value(key='key1', value='value1') - await kvs_client.set_value(key='key2', value='value2') - await kvs_client.set_value(key='key3', value='value3') - - # Iterate over keys - keys = [key.key async for key in kvs_client.iterate_keys()] - assert len(keys) == 3 - assert sorted(keys) == ['key1', 'key2', 'key3'] - -async def test_iterate_keys_with_limit(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that the limit parameter returns only the specified number of keys.""" - # Add some values - await kvs_client.set_value(key='key1', value='value1') - await kvs_client.set_value(key='key2', value='value2') - await kvs_client.set_value(key='key3', value='value3') - - # Iterate with limit - keys = [key.key async for key in kvs_client.iterate_keys(limit=2)] - assert len(keys) == 2 - - -async def test_iterate_keys_with_exclusive_start_key(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that exclusive_start_key parameter returns only keys after it alphabetically.""" - # Add some values with alphabetical keys - await kvs_client.set_value(key='a-key', value='value-a') - await kvs_client.set_value(key='d-key', value='value-d') - await kvs_client.set_value(key='c-key', value='value-c') - await kvs_client.set_value(key='b-key', value='value-b') - - # Iterate with exclusive start key - keys = [key.key async for key in kvs_client.iterate_keys(exclusive_start_key='b-key')] - assert len(keys) == 2 - assert 'c-key' in keys - assert 'd-key' in keys - assert 'a-key' not in keys - assert 'b-key' not in keys - - -async def test_drop(kvs_client: FileSystemKeyValueStoreClient) -> None: +async def test_drop_removes_directory(kvs_client: FileSystemKeyValueStoreClient) -> None: """Test that drop removes the entire store directory from disk.""" await kvs_client.set_value(key='test', value='test-value') @@ -338,8 +147,8 @@ async def test_drop(kvs_client: FileSystemKeyValueStoreClient) -> None: assert not kvs_client.path_to_kvs.exists() -async def test_metadata_updates(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that read/write operations properly update accessed_at and modified_at timestamps.""" +async def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) -> None: + """Test that read/write operations properly update metadata file timestamps.""" # Record initial timestamps initial_created = kvs_client.metadata.created_at initial_accessed = kvs_client.metadata.accessed_at @@ -348,262 +157,52 @@ async def test_metadata_updates(kvs_client: FileSystemKeyValueStoreClient) -> No # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates accessed_at + # Perform a read operation await kvs_client.get_value(key='nonexistent') - # Verify timestamps + # Verify accessed timestamp was updated assert kvs_client.metadata.created_at == initial_created assert kvs_client.metadata.accessed_at > initial_accessed assert kvs_client.metadata.modified_at == initial_modified - accessed_after_get = kvs_client.metadata.accessed_at + accessed_after_read = kvs_client.metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates modified_at - await kvs_client.set_value(key='new-key', value='new-value') + # Perform a write operation + await kvs_client.set_value(key='test', value='test-value') - # Verify timestamps again + # Verify modified timestamp was updated assert kvs_client.metadata.created_at == 
initial_created assert kvs_client.metadata.modified_at > initial_modified - assert kvs_client.metadata.accessed_at > accessed_after_get - - -async def test_get_public_url(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that get_public_url returns a valid file:// URL for the given key.""" - # Set a value first to ensure the file exists - test_key = 'test-url-key' - test_value = 'Test URL value' - await kvs_client.set_value(key=test_key, value=test_value) - - # Get the URL - url = await kvs_client.get_public_url(key=test_key) - - # Verify it's a valid file:// URL - assert url.startswith('file:///') - - # The encoded key name should be in the URL - encoded_key = urllib.parse.quote(test_key, safe='') - assert encoded_key in url - - # Verify the path in the URL points to the actual file - file_path = kvs_client.path_to_kvs / encoded_key - assert file_path.exists() - - # Verify file content without using urlopen (avoiding blocking IO) - content = file_path.read_text(encoding='utf-8') - assert content == test_value - - -async def test_concurrent_operations(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that multiple concurrent set operations can be performed safely with correct results.""" - - # Create multiple tasks to set different values concurrently - async def set_value(key: str, value: str) -> None: - await kvs_client.set_value(key=key, value=value) - - tasks = [asyncio.create_task(set_value(f'concurrent-key-{i}', f'value-{i}')) for i in range(10)] - - # Wait for all tasks to complete - await asyncio.gather(*tasks) - - # Verify all values were set correctly - for i in range(10): - key = f'concurrent-key-{i}' - record = await kvs_client.get_value(key=key) - assert record is not None - assert record.value == f'value-{i}' + assert kvs_client.metadata.accessed_at > accessed_after_read -async def test_record_exists_nonexistent_key(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that record_exists returns False for nonexistent key.""" - assert await kvs_client.record_exists(key='nonexistent-key') is False - - -async def test_record_exists_after_set_dict(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test record_exists returns True after setting a dict value.""" - key = 'dict-key' - value = {'data': 'test'} - - # Initially should not exist - assert await kvs_client.record_exists(key=key) is False - - # Set the value and check existence - await kvs_client.set_value(key=key, value=value) - assert await kvs_client.record_exists(key=key) is True - - # Also verify we can retrieve the value - record = await kvs_client.get_value(key=key) - assert record is not None - assert record.value == value - - # Verify the actual files exist on disk - encoded_key = urllib.parse.quote(key, safe='') - record_path = kvs_client.path_to_kvs / encoded_key - metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') - assert record_path.exists() - assert metadata_path.exists() - - -async def test_record_exists_after_set_string(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test record_exists returns True after setting a string value.""" - key = 'string-key' - value = 'test string' - - # Initially should not exist - assert await kvs_client.record_exists(key=key) is False - - # Set the value and check existence - await kvs_client.set_value(key=key, value=value) - assert await kvs_client.record_exists(key=key) is True - - # Also verify we can retrieve the value - record = await kvs_client.get_value(key=key) - assert record is not None - 
assert record.value == value - - # Verify the actual files exist on disk - encoded_key = urllib.parse.quote(key, safe='') - record_path = kvs_client.path_to_kvs / encoded_key - metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') - assert record_path.exists() - assert metadata_path.exists() - - -async def test_record_exists_after_set_none(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test record_exists returns True after setting None value.""" - key = 'none-key' - value = None - - # Initially should not exist - assert await kvs_client.record_exists(key=key) is False - - # Set the value and check existence - await kvs_client.set_value(key=key, value=value) - assert await kvs_client.record_exists(key=key) is True - - # Also verify we can retrieve the value - record = await kvs_client.get_value(key=key) - assert record is not None - assert record.value == value - - # Verify the actual files exist on disk - encoded_key = urllib.parse.quote(key, safe='') - record_path = kvs_client.path_to_kvs / encoded_key - metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') - assert record_path.exists() - assert metadata_path.exists() - - -async def test_record_exists_after_set_int(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test record_exists returns True after setting an int value.""" - key = 'int-key' - value = 42 - - # Initially should not exist - assert await kvs_client.record_exists(key=key) is False - - # Set the value and check existence - await kvs_client.set_value(key=key, value=value) - assert await kvs_client.record_exists(key=key) is True - - # Also verify we can retrieve the value - record = await kvs_client.get_value(key=key) - assert record is not None - # For file system storage, non-JSON scalar values get converted to strings - assert record.value == str(value) - - # Verify the actual files exist on disk - encoded_key = urllib.parse.quote(key, safe='') - record_path = kvs_client.path_to_kvs / encoded_key - metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') - assert record_path.exists() - assert metadata_path.exists() - - -async def test_record_exists_after_delete(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test record_exists returns False after deleting a value.""" - key = 'delete-key' - value = 'will be deleted' - - # Initially should not exist - assert await kvs_client.record_exists(key=key) is False - - # Set the value first - await kvs_client.set_value(key=key, value=value) - assert await kvs_client.record_exists(key=key) is True - - # Then delete it - await kvs_client.delete_value(key=key) - assert await kvs_client.record_exists(key=key) is False - - # Verify the actual files are gone from disk - encoded_key = urllib.parse.quote(key, safe='') - record_path = kvs_client.path_to_kvs / encoded_key - metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') - assert not record_path.exists() - assert not metadata_path.exists() +async def test_data_persistence_across_reopens(configuration: Configuration) -> None: + """Test that data persists correctly when reopening the same KVS.""" + storage_client = FileSystemStorageClient() + # Create KVS and add data + original_client = await storage_client.create_kvs_client( + name='persistence-test', + configuration=configuration, + ) -async def test_record_exists_none_value_distinction(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that record_exists can distinguish between None value and 
nonexistent key.""" - test_key = 'none-value-key' + test_key = 'persistent-key' + test_value = 'persistent-value' + await original_client.set_value(key=test_key, value=test_value) - # Set None as value - await kvs_client.set_value(key=test_key, value=None) + kvs_id = original_client.metadata.id - # Should still exist even though value is None - assert await kvs_client.record_exists(key=test_key) is True + # Reopen by ID and verify data persists + reopened_client = await storage_client.create_kvs_client( + id=kvs_id, + configuration=configuration, + ) - # Verify we can distinguish between None value and nonexistent key - record = await kvs_client.get_value(key=test_key) + record = await reopened_client.get_value(key=test_key) assert record is not None - assert record.value is None - assert await kvs_client.record_exists(key=test_key) is True - assert await kvs_client.record_exists(key='truly-nonexistent') is False - - -async def test_record_exists_only_value_file(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that record_exists returns False if only value file exists without metadata.""" - test_key = 'only-value-file-key' - - # Manually create only the value file without metadata - encoded_key = urllib.parse.quote(test_key, safe='') - record_path = kvs_client.path_to_kvs / encoded_key - record_path.parent.mkdir(parents=True, exist_ok=True) - record_path.write_text('orphaned value') - - # Should return False because metadata file is missing - assert await kvs_client.record_exists(key=test_key) is False - - -async def test_record_exists_only_metadata_file(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that record_exists returns False if only metadata file exists without value.""" - test_key = 'only-metadata-file-key' - - # Manually create only the metadata file without value - encoded_key = urllib.parse.quote(test_key, safe='') - record_path = kvs_client.path_to_kvs / encoded_key - metadata_path = record_path.with_name(f'{record_path.name}.{METADATA_FILENAME}') - - record_path.parent.mkdir(parents=True, exist_ok=True) - metadata_path.write_text('{"key":"test","content_type":"text/plain","size":10}') - - # Should return False because value file is missing - assert await kvs_client.record_exists(key=test_key) is False - - -async def test_record_exists_updates_metadata(kvs_client: FileSystemKeyValueStoreClient) -> None: - """Test that record_exists updates the accessed_at timestamp.""" - # Record initial timestamp - initial_accessed = kvs_client.metadata.accessed_at - - # Wait a moment to ensure timestamps can change - await asyncio.sleep(0.01) - - # Check if record exists (should update accessed_at) - await kvs_client.record_exists(key='any-key') + assert record.value == test_value - # Verify timestamp was updated - assert kvs_client.metadata.accessed_at > initial_accessed + await reopened_client.drop() diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index e1f737ea58..1b9d329b3d 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -2,21 +2,20 @@ import asyncio import json -from datetime import datetime from typing import TYPE_CHECKING import pytest from crawlee import Request -from crawlee._consts import METADATA_FILENAME from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient -from crawlee.storage_clients._file_system import 
FileSystemRequestQueueClient if TYPE_CHECKING: from collections.abc import AsyncGenerator from pathlib import Path + from crawlee.storage_clients._file_system import FileSystemRequestQueueClient + @pytest.fixture def configuration(tmp_path: Path) -> Configuration: @@ -36,421 +35,68 @@ async def rq_client(configuration: Configuration) -> AsyncGenerator[FileSystemRe await client.drop() -async def test_open_request_queue_by_id(configuration: Configuration) -> None: - """Test opening a request queue by ID after creating it by name.""" - storage_client = FileSystemStorageClient() - - # First create a request queue by name - original_client = await storage_client.create_rq_client( - name='open-by-id-test', - configuration=configuration, - ) - - # Get the ID from the created client - rq_id = original_client.metadata.id - - # Add a request to verify it persists - await original_client.add_batch_of_requests([Request.from_url('https://example.com/test')]) - - # Now try to open the same request queue using just the ID - reopened_client = await storage_client.create_rq_client( - id=rq_id, - configuration=configuration, - ) - - # Verify it's the same request queue - assert reopened_client.metadata.id == rq_id - assert reopened_client.metadata.name == 'open-by-id-test' - - # Verify the request is still there - request = await reopened_client.fetch_next_request() - assert request is not None - assert request.url == 'https://example.com/test' - - # Clean up - await reopened_client.drop() - - -async def test_open_creates_new_rq(configuration: Configuration) -> None: - """Test that open() creates a new request queue with proper metadata and files on disk.""" +async def test_file_and_directory_creation(configuration: Configuration) -> None: + """Test that file system RQ creates proper files and directories.""" client = await FileSystemStorageClient().create_rq_client( name='new_request_queue', configuration=configuration, ) - # Verify correct client type and properties - assert isinstance(client, FileSystemRequestQueueClient) - assert client.metadata.id is not None - assert client.metadata.name == 'new_request_queue' - assert client.metadata.handled_request_count == 0 - assert client.metadata.pending_request_count == 0 - assert client.metadata.total_request_count == 0 - assert isinstance(client.metadata.created_at, datetime) - assert isinstance(client.metadata.accessed_at, datetime) - assert isinstance(client.metadata.modified_at, datetime) - # Verify files were created assert client.path_to_rq.exists() assert client.path_to_metadata.exists() - # Verify metadata content + # Verify metadata file structure with client.path_to_metadata.open() as f: metadata = json.load(f) assert metadata['id'] == client.metadata.id assert metadata['name'] == 'new_request_queue' - -async def test_rq_client_purge_on_start(configuration: Configuration) -> None: - """Test that purge_on_start=True clears existing data in the request queue.""" - configuration.purge_on_start = True - - # Create request queue and add data - rq_client1 = await FileSystemStorageClient().create_rq_client(configuration=configuration) - await rq_client1.add_batch_of_requests([Request.from_url('https://example.com')]) - - # Verify request was added - assert rq_client1.metadata.pending_request_count == 1 - assert rq_client1.metadata.total_request_count == 1 - assert rq_client1.metadata.handled_request_count == 0 - - # Reopen - rq_client2 = await FileSystemStorageClient().create_rq_client(configuration=configuration) - - # Verify data was purged - assert 
rq_client2.metadata.pending_request_count == 0 - assert rq_client2.metadata.total_request_count == 1 - assert rq_client2.metadata.handled_request_count == 0 - - -async def test_rq_client_no_purge_on_start(configuration: Configuration) -> None: - """Test that purge_on_start=False keeps existing data in the request queue.""" - configuration.purge_on_start = False - - # Create request queue and add data - rq_client1 = await FileSystemStorageClient().create_rq_client( - name='test-no-purge-rq', - configuration=configuration, - ) - await rq_client1.add_batch_of_requests([Request.from_url('https://example.com')]) - - # Reopen - rq_client2 = await FileSystemStorageClient().create_rq_client( - name='test-no-purge-rq', - configuration=configuration, - ) - - # Verify data was preserved - assert rq_client2.metadata.total_request_count == 1 - - -@pytest.fixture -def rq_path(rq_client: FileSystemRequestQueueClient) -> Path: - """Return the path to the request queue directory.""" - return rq_client.path_to_rq + await client.drop() -async def test_add_requests(rq_client: FileSystemRequestQueueClient) -> None: - """Test adding requests creates proper files in the filesystem.""" - # Add a batch of requests +async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) -> None: + """Test that requests are properly persisted to files.""" requests = [ Request.from_url('https://example.com/1'), Request.from_url('https://example.com/2'), Request.from_url('https://example.com/3'), ] - response = await rq_client.add_batch_of_requests(requests) - - # Verify response - assert len(response.processed_requests) == 3 - for i, processed_request in enumerate(response.processed_requests): - assert processed_request.unique_key == f'https://example.com/{i + 1}' - assert processed_request.was_already_present is False - assert processed_request.was_already_handled is False + await rq_client.add_batch_of_requests(requests) - # Verify request files were created + # Verify request files are created request_files = list(rq_client.path_to_rq.glob('*.json')) - assert len(request_files) == 4 # 3 requests + metadata file + # Should have 3 request files + 1 metadata file + assert len(request_files) == 4 assert rq_client.path_to_metadata in request_files - # Verify metadata was updated - assert rq_client.metadata.total_request_count == 3 - assert rq_client.metadata.pending_request_count == 3 + # Verify actual request file content + data_files = [f for f in request_files if f != rq_client.path_to_metadata] + assert len(data_files) == 3 - # Verify content of the request files - for req_file in [f for f in request_files if f != rq_client.path_to_metadata]: + for req_file in data_files: with req_file.open() as f: - content = json.load(f) - assert 'url' in content - assert content['url'].startswith('https://example.com/') - assert 'id' in content - assert content['handled_at'] is None - - -async def test_add_duplicate_request(rq_client: FileSystemRequestQueueClient) -> None: - """Test adding a duplicate request.""" - request = Request.from_url('https://example.com') + request_data = json.load(f) + assert 'url' in request_data + assert request_data['url'].startswith('https://example.com/') - # Add the request the first time - await rq_client.add_batch_of_requests([request]) - # Add the same request again - second_response = await rq_client.add_batch_of_requests([request]) - - # Verify response indicates it was already present - assert second_response.processed_requests[0].was_already_present is True - - # Verify only one 
request file exists - request_files = [f for f in rq_client.path_to_rq.glob('*.json') if f.name != METADATA_FILENAME] - assert len(request_files) == 1 - - # Verify metadata counts weren't incremented - assert rq_client.metadata.total_request_count == 1 - assert rq_client.metadata.pending_request_count == 1 - - -async def test_fetch_next_request(rq_client: FileSystemRequestQueueClient) -> None: - """Test fetching the next request from the queue.""" - # Add requests - requests = [ - Request.from_url('https://example.com/1'), - Request.from_url('https://example.com/2'), - ] - await rq_client.add_batch_of_requests(requests) - - # Fetch the first request - first_request = await rq_client.fetch_next_request() - assert first_request is not None - assert first_request.url == 'https://example.com/1' - - # Fetch the second request - second_request = await rq_client.fetch_next_request() - assert second_request is not None - assert second_request.url == 'https://example.com/2' - - # There should be no more requests - empty_request = await rq_client.fetch_next_request() - assert empty_request is None - - -async def test_fetch_forefront_requests(rq_client: FileSystemRequestQueueClient) -> None: - """Test that forefront requests are fetched first.""" - # Add regular requests - await rq_client.add_batch_of_requests( - [ - Request.from_url('https://example.com/regular1'), - Request.from_url('https://example.com/regular2'), - ] - ) - - # Add forefront requests - await rq_client.add_batch_of_requests( - [ - Request.from_url('https://example.com/priority1'), - Request.from_url('https://example.com/priority2'), - ], - forefront=True, - ) - - # Fetch requests - they should come in priority order first - next_request1 = await rq_client.fetch_next_request() - assert next_request1 is not None - assert next_request1.url.startswith('https://example.com/priority') - - next_request2 = await rq_client.fetch_next_request() - assert next_request2 is not None - assert next_request2.url.startswith('https://example.com/priority') - - next_request3 = await rq_client.fetch_next_request() - assert next_request3 is not None - assert next_request3.url.startswith('https://example.com/regular') - - next_request4 = await rq_client.fetch_next_request() - assert next_request4 is not None - assert next_request4.url.startswith('https://example.com/regular') - - -async def test_mark_request_as_handled(rq_client: FileSystemRequestQueueClient) -> None: - """Test marking a request as handled.""" - # Add and fetch a request +async def test_drop_removes_directory(rq_client: FileSystemRequestQueueClient) -> None: + """Test that drop removes the entire RQ directory from disk.""" await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) - request = await rq_client.fetch_next_request() - assert request is not None - - # Mark it as handled - result = await rq_client.mark_request_as_handled(request) - assert result is not None - assert result.was_already_handled is True - - # Verify metadata was updated - assert rq_client.metadata.handled_request_count == 1 - assert rq_client.metadata.pending_request_count == 0 - - # Verify the file was updated with handled_at timestamp - request_files = [f for f in rq_client.path_to_rq.glob('*.json') if f.name != METADATA_FILENAME] - assert len(request_files) == 1 - - with request_files[0].open() as f: - content = json.load(f) - assert 'handled_at' in content - assert content['handled_at'] is not None - -async def test_reclaim_request(rq_client: FileSystemRequestQueueClient) -> None: - 
"""Test reclaiming a request that failed processing.""" - # Add and fetch a request - await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) - request = await rq_client.fetch_next_request() - assert request is not None - - # Reclaim the request - result = await rq_client.reclaim_request(request) - assert result is not None - assert result.was_already_handled is False - - # Should be able to fetch it again - reclaimed_request = await rq_client.fetch_next_request() - assert reclaimed_request is not None - assert reclaimed_request.id == request.id - - -async def test_reclaim_request_with_forefront(rq_client: FileSystemRequestQueueClient) -> None: - """Test reclaiming a request with forefront priority.""" - # Add requests - await rq_client.add_batch_of_requests( - [ - Request.from_url('https://example.com/first'), - Request.from_url('https://example.com/second'), - ] - ) - - # Fetch the first request - first_request = await rq_client.fetch_next_request() - assert first_request is not None - assert first_request.url == 'https://example.com/first' - - # Reclaim it with forefront priority - await rq_client.reclaim_request(first_request, forefront=True) - - # It should be returned before the second request - reclaimed_request = await rq_client.fetch_next_request() - assert reclaimed_request is not None - assert reclaimed_request.url == 'https://example.com/first' - - -async def test_is_empty(rq_client: FileSystemRequestQueueClient) -> None: - """Test checking if a queue is empty.""" - # Queue should start empty - assert await rq_client.is_empty() is True - - # Add a request - await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) - assert await rq_client.is_empty() is False - - # Fetch and handle the request - request = await rq_client.fetch_next_request() - assert request is not None - await rq_client.mark_request_as_handled(request) - - # Queue should be empty again - assert await rq_client.is_empty() is True - - -async def test_get_request(rq_client: FileSystemRequestQueueClient) -> None: - """Test getting a request by ID.""" - # Add a request - response = await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) - request_id = response.processed_requests[0].id - - # Get the request by ID - request = await rq_client.get_request(request_id) - assert request is not None - assert request.id == request_id - assert request.url == 'https://example.com' - - # Try to get a non-existent request - not_found = await rq_client.get_request('non-existent-id') - assert not_found is None - - -async def test_drop(configuration: Configuration) -> None: - """Test dropping the queue removes files from the filesystem.""" - client = await FileSystemStorageClient().create_rq_client( - name='drop_test', - configuration=configuration, - ) - - # Add requests to create files - await client.add_batch_of_requests( - [ - Request.from_url('https://example.com/1'), - Request.from_url('https://example.com/2'), - ] - ) - - # Verify the directory exists - rq_path = client.path_to_rq + rq_path = rq_client.path_to_rq assert rq_path.exists() - # Drop the client - await client.drop() + # Drop the request queue + await rq_client.drop() - # Verify the directory was removed assert not rq_path.exists() -async def test_file_persistence(configuration: Configuration) -> None: - """Test that requests are persisted to files and can be recovered after a 'restart'.""" - # Explicitly set purge_on_start to False to ensure files aren't deleted - 
configuration.purge_on_start = False - - # Create a client and add requests - client1 = await FileSystemStorageClient().create_rq_client( - name='persistence_test', - configuration=configuration, - ) - - await client1.add_batch_of_requests( - [ - Request.from_url('https://example.com/1'), - Request.from_url('https://example.com/2'), - ] - ) - - # Fetch and handle one request - request = await client1.fetch_next_request() - assert request is not None - await client1.mark_request_as_handled(request) - - # Get the storage directory path before clearing the cache - storage_path = client1.path_to_rq - assert storage_path.exists(), 'Request queue directory should exist' - - # Verify files exist - request_files = list(storage_path.glob('*.json')) - assert len(request_files) > 0, 'Request files should exist' - - # Create a new client with same name (which will load from files) - client2 = await FileSystemStorageClient().create_rq_client( - name='persistence_test', - configuration=configuration, - ) - - # Verify state was recovered - assert client2.metadata.total_request_count == 2 - assert client2.metadata.handled_request_count == 1 - assert client2.metadata.pending_request_count == 1 - - # Should be able to fetch the remaining request - remaining_request = await client2.fetch_next_request() - assert remaining_request is not None - assert remaining_request.url == 'https://example.com/2' - - # Clean up - await client2.drop() - - -async def test_metadata_updates(rq_client: FileSystemRequestQueueClient) -> None: - """Test that metadata timestamps are updated correctly after operations.""" +async def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> None: + """Test that metadata file is updated correctly after operations.""" # Record initial timestamps initial_created = rq_client.metadata.created_at initial_accessed = rq_client.metadata.accessed_at @@ -459,23 +105,65 @@ async def test_metadata_updates(rq_client: FileSystemRequestQueueClient) -> None # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates accessed_at + # Perform a read operation await rq_client.is_empty() - # Verify timestamps + # Verify accessed timestamp was updated assert rq_client.metadata.created_at == initial_created assert rq_client.metadata.accessed_at > initial_accessed assert rq_client.metadata.modified_at == initial_modified - accessed_after_get = rq_client.metadata.accessed_at + accessed_after_read = rq_client.metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates modified_at + # Perform a write operation await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) - # Verify timestamps again + # Verify modified timestamp was updated assert rq_client.metadata.created_at == initial_created assert rq_client.metadata.modified_at > initial_modified - assert rq_client.metadata.accessed_at > accessed_after_get + assert rq_client.metadata.accessed_at > accessed_after_read + + # Verify metadata file is updated on disk + with rq_client.path_to_metadata.open() as f: + metadata = json.load(f) + assert metadata['total_request_count'] == 1 + + +async def test_data_persistence_across_reopens(configuration: Configuration) -> None: + """Test that requests persist correctly when reopening the same RQ.""" + storage_client = FileSystemStorageClient() + + # Create RQ and add requests + original_client = await storage_client.create_rq_client( + name='persistence-test', + 
configuration=configuration, + ) + + test_requests = [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + ] + await original_client.add_batch_of_requests(test_requests) + + rq_id = original_client.metadata.id + + # Reopen by ID and verify requests persist + reopened_client = await storage_client.create_rq_client( + id=rq_id, + configuration=configuration, + ) + + assert reopened_client.metadata.total_request_count == 2 + + # Fetch requests to verify they're still there + request1 = await reopened_client.fetch_next_request() + request2 = await reopened_client.fetch_next_request() + + assert request1 is not None + assert request2 is not None + assert {request1.url, request2.url} == {'https://example.com/1', 'https://example.com/2'} + + await reopened_client.drop() diff --git a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py index 6cb77556f4..cbea599bc2 100644 --- a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py @@ -1,19 +1,18 @@ from __future__ import annotations import asyncio -from datetime import datetime from typing import TYPE_CHECKING import pytest from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient -from crawlee.storage_clients._memory import MemoryDatasetClient -from crawlee.storage_clients.models import DatasetItemsListPage if TYPE_CHECKING: from collections.abc import AsyncGenerator + from crawlee.storage_clients._memory import MemoryDatasetClient + @pytest.fixture async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]: @@ -23,22 +22,8 @@ async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]: await client.drop() -async def test_open_creates_new_dataset() -> None: - """Test that open() creates a new dataset with proper metadata and adds it to the cache.""" - client = await MemoryStorageClient().create_dataset_client(name='new_dataset') - - # Verify correct client type and properties - assert isinstance(client, MemoryDatasetClient) - assert client.metadata.id is not None - assert client.metadata.name == 'new_dataset' - assert client.metadata.item_count == 0 - assert isinstance(client.metadata.created_at, datetime) - assert isinstance(client.metadata.accessed_at, datetime) - assert isinstance(client.metadata.modified_at, datetime) - - -async def test_dataset_client_purge_on_start() -> None: - """Test that purge_on_start=True clears existing data in the dataset.""" +async def test_memory_specific_purge_behavior() -> None: + """Test memory-specific purge behavior and in-memory storage characteristics.""" configuration = Configuration(purge_on_start=True) # Create dataset and add data @@ -52,203 +37,19 @@ async def test_dataset_client_purge_on_start() -> None: items = await dataset_client1.get_data() assert len(items.items) == 1 - # Reopen + # Reopen with same storage client instance dataset_client2 = await MemoryStorageClient().create_dataset_client( name='test_purge_dataset', configuration=configuration, ) - # Verify data was purged + # Verify data was purged (memory storage specific behavior) items = await dataset_client2.get_data() assert len(items.items) == 0 -async def test_open_with_id_and_name() -> None: - """Test that open() can be used with both id and name parameters.""" - client = await MemoryStorageClient().create_dataset_client( - id='some-id', - name='some-name', - ) - assert 
client.metadata.id == 'some-id' - assert client.metadata.name == 'some-name' - - -async def test_push_data_single_item(dataset_client: MemoryDatasetClient) -> None: - """Test pushing a single item to the dataset and verifying it was stored correctly.""" - item = {'key': 'value', 'number': 42} - await dataset_client.push_data(item) - - # Verify item count was updated - assert dataset_client.metadata.item_count == 1 - - # Verify item was stored - result = await dataset_client.get_data() - assert result.count == 1 - assert result.items[0] == item - - -async def test_push_data_multiple_items(dataset_client: MemoryDatasetClient) -> None: - """Test pushing multiple items to the dataset and verifying they were stored correctly.""" - items = [ - {'id': 1, 'name': 'Item 1'}, - {'id': 2, 'name': 'Item 2'}, - {'id': 3, 'name': 'Item 3'}, - ] - await dataset_client.push_data(items) - - # Verify item count was updated - assert dataset_client.metadata.item_count == 3 - - # Verify items were stored - result = await dataset_client.get_data() - assert result.count == 3 - assert result.items == items - - -async def test_get_data_empty_dataset(dataset_client: MemoryDatasetClient) -> None: - """Test that getting data from an empty dataset returns empty results with correct metadata.""" - result = await dataset_client.get_data() - - assert isinstance(result, DatasetItemsListPage) - assert result.count == 0 - assert result.total == 0 - assert result.items == [] - - -async def test_get_data_with_items(dataset_client: MemoryDatasetClient) -> None: - """Test that all items pushed to the dataset can be retrieved with correct metadata.""" - # Add some items - items = [ - {'id': 1, 'name': 'Item 1'}, - {'id': 2, 'name': 'Item 2'}, - {'id': 3, 'name': 'Item 3'}, - ] - await dataset_client.push_data(items) - - # Get all items - result = await dataset_client.get_data() - - assert result.count == 3 - assert result.total == 3 - assert len(result.items) == 3 - assert result.items[0]['id'] == 1 - assert result.items[1]['id'] == 2 - assert result.items[2]['id'] == 3 - - -async def test_get_data_with_pagination(dataset_client: MemoryDatasetClient) -> None: - """Test that offset and limit parameters work correctly for dataset pagination.""" - # Add some items - items = [{'id': i} for i in range(1, 11)] # 10 items - await dataset_client.push_data(items) - - # Test offset - result = await dataset_client.get_data(offset=3) - assert result.count == 7 - assert result.offset == 3 - assert result.items[0]['id'] == 4 - - # Test limit - result = await dataset_client.get_data(limit=5) - assert result.count == 5 - assert result.limit == 5 - assert result.items[-1]['id'] == 5 - - # Test both offset and limit - result = await dataset_client.get_data(offset=2, limit=3) - assert result.count == 3 - assert result.offset == 2 - assert result.limit == 3 - assert result.items[0]['id'] == 3 - assert result.items[-1]['id'] == 5 - - -async def test_get_data_descending_order(dataset_client: MemoryDatasetClient) -> None: - """Test that the desc parameter correctly reverses the order of returned items.""" - # Add some items - items = [{'id': i} for i in range(1, 6)] # 5 items - await dataset_client.push_data(items) - - # Get items in descending order - result = await dataset_client.get_data(desc=True) - - assert result.desc is True - assert result.items[0]['id'] == 5 - assert result.items[-1]['id'] == 1 - - -async def test_get_data_skip_empty(dataset_client: MemoryDatasetClient) -> None: - """Test that the skip_empty parameter correctly filters out 
empty items.""" - # Add some items including an empty one - items = [ - {'id': 1, 'name': 'Item 1'}, - {}, # Empty item - {'id': 3, 'name': 'Item 3'}, - ] - await dataset_client.push_data(items) - - # Get all items - result = await dataset_client.get_data() - assert result.count == 3 - - # Get non-empty items - result = await dataset_client.get_data(skip_empty=True) - assert result.count == 2 - assert all(item != {} for item in result.items) - - -async def test_iterate(dataset_client: MemoryDatasetClient) -> None: - """Test that iterate_items yields each item in the dataset in the correct order.""" - # Add some items - items = [{'id': i} for i in range(1, 6)] # 5 items - await dataset_client.push_data(items) - - # Iterate over all items - collected_items = [item async for item in dataset_client.iterate_items()] - - assert len(collected_items) == 5 - assert collected_items[0]['id'] == 1 - assert collected_items[-1]['id'] == 5 - - -async def test_iterate_with_options(dataset_client: MemoryDatasetClient) -> None: - """Test that iterate_items respects offset, limit, and desc parameters.""" - # Add some items - items = [{'id': i} for i in range(1, 11)] # 10 items - await dataset_client.push_data(items) - - # Test with offset and limit - collected_items = [item async for item in dataset_client.iterate_items(offset=3, limit=3)] - - assert len(collected_items) == 3 - assert collected_items[0]['id'] == 4 - assert collected_items[-1]['id'] == 6 - - # Test with descending order - collected_items = [] - async for item in dataset_client.iterate_items(desc=True, limit=3): - collected_items.append(item) - - assert len(collected_items) == 3 - assert collected_items[0]['id'] == 10 - assert collected_items[-1]['id'] == 8 - - -async def test_drop(dataset_client: MemoryDatasetClient) -> None: - """Test that drop removes the dataset from cache and resets its state.""" - await dataset_client.push_data({'test': 'data'}) - - # Drop the dataset - await dataset_client.drop() - - # Verify the dataset is empty - assert dataset_client.metadata.item_count == 0 - result = await dataset_client.get_data() - assert result.count == 0 - - -async def test_metadata_updates(dataset_client: MemoryDatasetClient) -> None: - """Test that read/write operations properly update accessed_at and modified_at timestamps.""" +async def test_memory_metadata_updates(dataset_client: MemoryDatasetClient) -> None: + """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps initial_created = dataset_client.metadata.created_at initial_accessed = dataset_client.metadata.accessed_at @@ -257,23 +58,23 @@ async def test_metadata_updates(dataset_client: MemoryDatasetClient) -> None: # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates accessed_at + # Perform a read operation await dataset_client.get_data() - # Verify timestamps + # Verify timestamps (memory-specific behavior) assert dataset_client.metadata.created_at == initial_created assert dataset_client.metadata.accessed_at > initial_accessed assert dataset_client.metadata.modified_at == initial_modified - accessed_after_get = dataset_client.metadata.accessed_at + accessed_after_read = dataset_client.metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates modified_at + # Perform a write operation await dataset_client.push_data({'new': 'item'}) - # Verify timestamps again + # Verify timestamps were updated assert 
dataset_client.metadata.created_at == initial_created assert dataset_client.metadata.modified_at > initial_modified - assert dataset_client.metadata.accessed_at > accessed_after_get + assert dataset_client.metadata.accessed_at > accessed_after_read diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py index d3c57df869..6b4388984e 100644 --- a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -1,19 +1,18 @@ from __future__ import annotations import asyncio -from datetime import datetime -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import pytest from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient -from crawlee.storage_clients._memory import MemoryKeyValueStoreClient -from crawlee.storage_clients.models import KeyValueStoreRecordMetadata if TYPE_CHECKING: from collections.abc import AsyncGenerator + from crawlee.storage_clients._memory import MemoryKeyValueStoreClient + @pytest.fixture async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]: @@ -23,21 +22,8 @@ async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]: await client.drop() -async def test_open_creates_new_kvs() -> None: - """Test that open() creates a new key-value store with proper metadata and adds it to the cache.""" - client = await MemoryStorageClient().create_kvs_client(name='new_kvs') - - # Verify correct client type and properties - assert isinstance(client, MemoryKeyValueStoreClient) - assert client.metadata.id is not None - assert client.metadata.name == 'new_kvs' - assert isinstance(client.metadata.created_at, datetime) - assert isinstance(client.metadata.accessed_at, datetime) - assert isinstance(client.metadata.modified_at, datetime) - - -async def test_kvs_client_purge_on_start() -> None: - """Test that purge_on_start=True clears existing data in the KVS.""" +async def test_memory_specific_purge_behavior() -> None: + """Test memory-specific purge behavior and in-memory storage characteristics.""" configuration = Configuration(purge_on_start=True) # Create KVS and add data @@ -52,167 +38,19 @@ async def test_kvs_client_purge_on_start() -> None: assert record is not None assert record.value == 'initial value' - # Reopen + # Reopen with same storage client instance kvs_client2 = await MemoryStorageClient().create_kvs_client( name='test_purge_kvs', configuration=configuration, ) - # Verify value was purged + # Verify value was purged (memory storage specific behavior) record = await kvs_client2.get_value(key='test-key') assert record is None -async def test_open_with_id_and_name() -> None: - """Test that open() can be used with both id and name parameters.""" - client = await MemoryStorageClient().create_kvs_client( - id='some-id', - name='some-name', - ) - assert client.metadata.id == 'some-id' - assert client.metadata.name == 'some-name' - - -@pytest.mark.parametrize( - ('key', 'value', 'expected_content_type'), - [ - pytest.param('string_key', 'string value', 'text/plain; charset=utf-8', id='string'), - pytest.param('dict_key', {'name': 'test', 'value': 42}, 'application/json; charset=utf-8', id='dictionary'), - pytest.param('list_key', [1, 2, 3], 'application/json; charset=utf-8', id='list'), - pytest.param('bytes_key', b'binary data', 'application/octet-stream', id='bytes'), - ], -) -async def test_set_get_value( - kvs_client: 
MemoryKeyValueStoreClient, - key: str, - value: Any, - expected_content_type: str, -) -> None: - """Test storing and retrieving different types of values with correct content types.""" - # Set value - await kvs_client.set_value(key=key, value=value) - - # Get and verify value - record = await kvs_client.get_value(key=key) - assert record is not None - assert record.key == key - assert record.value == value - assert record.content_type == expected_content_type - - -async def test_get_nonexistent_value(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that attempting to get a non-existent key returns None.""" - record = await kvs_client.get_value(key='nonexistent') - assert record is None - - -async def test_set_value_with_explicit_content_type(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that an explicitly provided content type overrides the automatically inferred one.""" - value = 'This could be XML' - content_type = 'application/xml' - - await kvs_client.set_value(key='xml_key', value=value, content_type=content_type) - - record = await kvs_client.get_value(key='xml_key') - assert record is not None - assert record.value == value - assert record.content_type == content_type - - -async def test_delete_value(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that a stored value can be deleted and is no longer retrievable after deletion.""" - # Set a value - await kvs_client.set_value(key='delete_me', value='to be deleted') - - # Verify it exists - record = await kvs_client.get_value(key='delete_me') - assert record is not None - - # Delete it - await kvs_client.delete_value(key='delete_me') - - # Verify it's gone - record = await kvs_client.get_value(key='delete_me') - assert record is None - - -async def test_delete_nonexistent_value(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that attempting to delete a non-existent key is a no-op and doesn't raise errors.""" - # Should not raise an error - await kvs_client.delete_value(key='nonexistent') - - -async def test_iterate_keys(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that all keys can be iterated over and are returned in sorted order with correct metadata.""" - # Set some values - items = { - 'a_key': 'value A', - 'b_key': 'value B', - 'c_key': 'value C', - 'd_key': 'value D', - } - - for key, value in items.items(): - await kvs_client.set_value(key=key, value=value) - - # Get all keys - metadata_list = [metadata async for metadata in kvs_client.iterate_keys()] - - # Verify keys are returned in sorted order - assert len(metadata_list) == 4 - assert [m.key for m in metadata_list] == sorted(items.keys()) - assert all(isinstance(m, KeyValueStoreRecordMetadata) for m in metadata_list) - - -async def test_iterate_keys_with_exclusive_start_key(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that exclusive_start_key parameter returns only keys after it alphabetically.""" - # Set some values - for key in ['b_key', 'c_key', 'a_key', 'e_key', 'd_key']: - await kvs_client.set_value(key=key, value=f'value for {key}') - - # Get keys starting after 'b_key' - metadata_list = [metadata async for metadata in kvs_client.iterate_keys(exclusive_start_key='b_key')] - - # Verify only keys after 'b_key' are returned - assert len(metadata_list) == 3 - assert [m.key for m in metadata_list] == ['c_key', 'd_key', 'e_key'] - - -async def test_iterate_keys_with_limit(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that the limit parameter returns only the specified number of keys.""" - # Set 
some values - for key in ['a_key', 'e_key', 'c_key', 'b_key', 'd_key']: - await kvs_client.set_value(key=key, value=f'value for {key}') - - # Get first 3 keys - metadata_list = [metadata async for metadata in kvs_client.iterate_keys(limit=3)] - - # Verify only the first 3 keys are returned - assert len(metadata_list) == 3 - assert [m.key for m in metadata_list] == ['a_key', 'b_key', 'c_key'] - - -async def test_drop(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that drop removes the store from cache and clears all data.""" - # Add some values to the store - await kvs_client.set_value(key='test', value='data') - - # Drop the store - await kvs_client.drop() - - # Verify the store is empty - record = await kvs_client.get_value(key='test') - assert record is None - - -async def test_get_public_url(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that get_public_url raises NotImplementedError for the memory implementation.""" - with pytest.raises(NotImplementedError): - await kvs_client.get_public_url(key='any-key') - - -async def test_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that read/write operations properly update accessed_at and modified_at timestamps.""" +async def test_memory_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None: + """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps initial_created = kvs_client.metadata.created_at initial_accessed = kvs_client.metadata.accessed_at @@ -221,93 +59,23 @@ async def test_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None: # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates accessed_at + # Perform a read operation await kvs_client.get_value(key='nonexistent') - # Verify timestamps + # Verify timestamps (memory-specific behavior) assert kvs_client.metadata.created_at == initial_created assert kvs_client.metadata.accessed_at > initial_accessed assert kvs_client.metadata.modified_at == initial_modified - accessed_after_get = kvs_client.metadata.accessed_at + accessed_after_read = kvs_client.metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates modified_at and accessed_at - await kvs_client.set_value(key='new_key', value='new value') + # Perform a write operation + await kvs_client.set_value(key='test', value='test-value') - # Verify timestamps again + # Verify timestamps were updated assert kvs_client.metadata.created_at == initial_created assert kvs_client.metadata.modified_at > initial_modified - assert kvs_client.metadata.accessed_at > accessed_after_get - - -async def test_record_exists_nonexistent(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that record_exists returns False for a nonexistent key.""" - result = await kvs_client.record_exists(key='nonexistent-key') - assert result is False - - -async def test_record_exists_after_set(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that record_exists returns True after setting a value.""" - test_key = 'exists-key' - test_value = {'data': 'test'} - - # Initially should not exist - assert await kvs_client.record_exists(key=test_key) is False - - # Set the value - await kvs_client.set_value(key=test_key, value=test_value) - - # Now should exist - assert await kvs_client.record_exists(key=test_key) is True - - -async def test_record_exists_after_delete(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test 
that record_exists returns False after deleting a value.""" - test_key = 'exists-then-delete-key' - test_value = 'will be deleted' - - # Set a value - await kvs_client.set_value(key=test_key, value=test_value) - assert await kvs_client.record_exists(key=test_key) is True - - # Delete the value - await kvs_client.delete_value(key=test_key) - - # Should no longer exist - assert await kvs_client.record_exists(key=test_key) is False - - -async def test_record_exists_with_none_value(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that record_exists returns True even when value is None.""" - test_key = 'none-value-key' - - # Set None as value - await kvs_client.set_value(key=test_key, value=None) - - # Should still exist even though value is None - assert await kvs_client.record_exists(key=test_key) is True - - # Verify we can distinguish between None value and nonexistent key - record = await kvs_client.get_value(key=test_key) - assert record is not None - assert record.value is None - assert await kvs_client.record_exists(key=test_key) is True - assert await kvs_client.record_exists(key='truly-nonexistent') is False - - -async def test_record_exists_updates_metadata(kvs_client: MemoryKeyValueStoreClient) -> None: - """Test that record_exists updates the accessed_at timestamp.""" - # Record initial timestamp - initial_accessed = kvs_client.metadata.accessed_at - - # Wait a moment to ensure timestamps can change - await asyncio.sleep(0.01) - - # Check if record exists (should update accessed_at) - await kvs_client.record_exists(key='any-key') - - # Verify timestamp was updated - assert kvs_client.metadata.accessed_at > initial_accessed + assert kvs_client.metadata.accessed_at > accessed_after_read diff --git a/tests/unit/storage_clients/_memory/test_memory_rq_client.py b/tests/unit/storage_clients/_memory/test_memory_rq_client.py index f5ef1060e5..68a838d4cc 100644 --- a/tests/unit/storage_clients/_memory/test_memory_rq_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_rq_client.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -from datetime import datetime from typing import TYPE_CHECKING import pytest @@ -9,11 +8,12 @@ from crawlee import Request from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient -from crawlee.storage_clients._memory import MemoryRequestQueueClient if TYPE_CHECKING: from collections.abc import AsyncGenerator + from crawlee.storage_clients._memory import MemoryRequestQueueClient + @pytest.fixture async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]: @@ -23,25 +23,8 @@ async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]: await client.drop() -async def test_open_creates_new_rq() -> None: - """Test that open() creates a new request queue with proper metadata and adds it to the cache.""" - client = await MemoryStorageClient().create_rq_client(name='new_rq') - - # Verify correct client type and properties - assert isinstance(client, MemoryRequestQueueClient) - assert client.metadata.id is not None - assert client.metadata.name == 'new_rq' - assert isinstance(client.metadata.created_at, datetime) - assert isinstance(client.metadata.accessed_at, datetime) - assert isinstance(client.metadata.modified_at, datetime) - assert client.metadata.handled_request_count == 0 - assert client.metadata.pending_request_count == 0 - assert client.metadata.total_request_count == 0 - assert client.metadata.had_multiple_clients is False - - -async def 
test_rq_client_purge_on_start() -> None: - """Test that purge_on_start=True clears existing data in the RQ.""" +async def test_memory_specific_purge_behavior() -> None: + """Test memory-specific purge behavior and in-memory storage characteristics.""" configuration = Configuration(purge_on_start=True) # Create RQ and add data @@ -55,333 +38,18 @@ async def test_rq_client_purge_on_start() -> None: # Verify request was added assert await rq_client1.is_empty() is False - # Reopen + # Reopen with same storage client instance rq_client2 = await MemoryStorageClient().create_rq_client( name='test_purge_rq', configuration=configuration, ) - # Verify queue was purged + # Verify queue was purged (memory storage specific behavior) assert await rq_client2.is_empty() is True -async def test_open_with_id_and_name() -> None: - """Test that open() can be used with both id and name parameters.""" - client = await MemoryStorageClient().create_rq_client( - id='some-id', - name='some-name', - ) - assert client.metadata.id is not None # ID is always auto-generated - assert client.metadata.name == 'some-name' - - -async def test_add_batch_of_requests(rq_client: MemoryRequestQueueClient) -> None: - """Test adding a batch of requests to the queue.""" - requests = [ - Request.from_url(url='https://example.com/1'), - Request.from_url(url='https://example.com/2'), - Request.from_url(url='https://example.com/3'), - ] - - response = await rq_client.add_batch_of_requests(requests) - - # Verify correct response - assert len(response.processed_requests) == 3 - assert len(response.unprocessed_requests) == 0 - - # Verify each request was processed correctly - for i, req in enumerate(requests): - assert response.processed_requests[i].id == req.id - assert response.processed_requests[i].unique_key == req.unique_key - assert response.processed_requests[i].was_already_present is False - assert response.processed_requests[i].was_already_handled is False - - # Verify metadata was updated - assert rq_client.metadata.total_request_count == 3 - assert rq_client.metadata.pending_request_count == 3 - - -async def test_add_batch_of_requests_with_duplicates(rq_client: MemoryRequestQueueClient) -> None: - """Test adding requests with duplicate unique keys.""" - # Add initial requests - initial_requests = [ - Request.from_url(url='https://example.com/1', unique_key='key1'), - Request.from_url(url='https://example.com/2', unique_key='key2'), - ] - await rq_client.add_batch_of_requests(initial_requests) - - # Mark first request as handled - req1 = await rq_client.fetch_next_request() - assert req1 is not None - await rq_client.mark_request_as_handled(req1) - - # Add duplicate requests - duplicate_requests = [ - Request.from_url(url='https://example.com/1-dup', unique_key='key1'), # Same as first (handled) - Request.from_url(url='https://example.com/2-dup', unique_key='key2'), # Same as second (not handled) - Request.from_url(url='https://example.com/3', unique_key='key3'), # New request - ] - response = await rq_client.add_batch_of_requests(duplicate_requests) - - # Verify response - assert len(response.processed_requests) == 3 - - # First request should be marked as already handled - assert response.processed_requests[0].was_already_present is True - assert response.processed_requests[0].was_already_handled is True - - # Second request should be marked as already present but not handled - assert response.processed_requests[1].was_already_present is True - assert response.processed_requests[1].was_already_handled is False - - # Third 
request should be new - assert response.processed_requests[2].was_already_present is False - assert response.processed_requests[2].was_already_handled is False - - -async def test_add_batch_of_requests_to_forefront(rq_client: MemoryRequestQueueClient) -> None: - """Test adding requests to the forefront of the queue.""" - # Add initial requests - initial_requests = [ - Request.from_url(url='https://example.com/1'), - Request.from_url(url='https://example.com/2'), - ] - await rq_client.add_batch_of_requests(initial_requests) - - # Add new requests to forefront - forefront_requests = [ - Request.from_url(url='https://example.com/priority'), - ] - await rq_client.add_batch_of_requests(forefront_requests, forefront=True) - - # The priority request should be fetched first - next_request = await rq_client.fetch_next_request() - assert next_request is not None - assert next_request.url == 'https://example.com/priority' - - -async def test_fetch_next_request(rq_client: MemoryRequestQueueClient) -> None: - """Test fetching the next request from the queue.""" - # Add some requests - requests = [ - Request.from_url(url='https://example.com/1'), - Request.from_url(url='https://example.com/2'), - ] - await rq_client.add_batch_of_requests(requests) - - # Fetch first request - request1 = await rq_client.fetch_next_request() - assert request1 is not None - assert request1.url == 'https://example.com/1' - - # Fetch second request - request2 = await rq_client.fetch_next_request() - assert request2 is not None - assert request2.url == 'https://example.com/2' - - # No more requests - request3 = await rq_client.fetch_next_request() - assert request3 is None - - -async def test_fetch_skips_handled_requests(rq_client: MemoryRequestQueueClient) -> None: - """Test that fetch_next_request skips handled requests.""" - # Add requests - requests = [ - Request.from_url(url='https://example.com/1'), - Request.from_url(url='https://example.com/2'), - ] - await rq_client.add_batch_of_requests(requests) - - # Fetch and handle first request - request1 = await rq_client.fetch_next_request() - assert request1 is not None - await rq_client.mark_request_as_handled(request1) - - # Next fetch should return second request, not the handled one - request = await rq_client.fetch_next_request() - assert request is not None - assert request.url == 'https://example.com/2' - - -async def test_fetch_skips_in_progress_requests(rq_client: MemoryRequestQueueClient) -> None: - """Test that fetch_next_request skips requests that are already in progress.""" - # Add requests - requests = [ - Request.from_url(url='https://example.com/1'), - Request.from_url(url='https://example.com/2'), - ] - await rq_client.add_batch_of_requests(requests) - - # Fetch first request (it should be in progress now) - request1 = await rq_client.fetch_next_request() - assert request1 is not None - - # Next fetch should return second request, not the in-progress one - request2 = await rq_client.fetch_next_request() - assert request2 is not None - assert request2.url == 'https://example.com/2' - - # Third fetch should return None as all requests are in progress - request3 = await rq_client.fetch_next_request() - assert request3 is None - - -async def test_get_request(rq_client: MemoryRequestQueueClient) -> None: - """Test getting a request by ID.""" - # Add a request - request = Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) - - # Get the request by ID - retrieved_request = await rq_client.get_request(request.id) - 
assert retrieved_request is not None - assert retrieved_request.id == request.id - assert retrieved_request.url == request.url - - # Try to get a non-existent request - nonexistent = await rq_client.get_request('nonexistent-id') - assert nonexistent is None - - -async def test_get_in_progress_request(rq_client: MemoryRequestQueueClient) -> None: - """Test getting an in-progress request by ID.""" - # Add a request - request = Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) - - # Fetch the request to make it in-progress - fetched = await rq_client.fetch_next_request() - assert fetched is not None - - # Get the request by ID - retrieved = await rq_client.get_request(request.id) - assert retrieved is not None - assert retrieved.id == request.id - assert retrieved.url == request.url - - -async def test_mark_request_as_handled(rq_client: MemoryRequestQueueClient) -> None: - """Test marking a request as handled.""" - # Add a request - request = Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) - - # Fetch the request to make it in-progress - fetched = await rq_client.fetch_next_request() - assert fetched is not None - - # Mark as handled - result = await rq_client.mark_request_as_handled(fetched) - assert result is not None - assert result.id == fetched.id - assert result.was_already_handled is True - - # Check that metadata was updated - assert rq_client.metadata.handled_request_count == 1 - assert rq_client.metadata.pending_request_count == 0 - - # Try to mark again (should fail as it's no longer in-progress) - result = await rq_client.mark_request_as_handled(fetched) - assert result is None - - -async def test_reclaim_request(rq_client: MemoryRequestQueueClient) -> None: - """Test reclaiming a request back to the queue.""" - # Add a request - request = Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) - - # Fetch the request to make it in-progress - fetched = await rq_client.fetch_next_request() - assert fetched is not None - - # Reclaim the request - result = await rq_client.reclaim_request(fetched) - assert result is not None - assert result.id == fetched.id - assert result.was_already_handled is False - - # It should be available to fetch again - reclaimed = await rq_client.fetch_next_request() - assert reclaimed is not None - assert reclaimed.id == fetched.id - - -async def test_reclaim_request_to_forefront(rq_client: MemoryRequestQueueClient) -> None: - """Test reclaiming a request to the forefront of the queue.""" - # Add requests - requests = [ - Request.from_url(url='https://example.com/1'), - Request.from_url(url='https://example.com/2'), - ] - await rq_client.add_batch_of_requests(requests) - - # Fetch the second request to make it in-progress - await rq_client.fetch_next_request() # Skip the first one - request2 = await rq_client.fetch_next_request() - assert request2 is not None - assert request2.url == 'https://example.com/2' - - # Reclaim the request to forefront - await rq_client.reclaim_request(request2, forefront=True) - - # It should now be the first in the queue - next_request = await rq_client.fetch_next_request() - assert next_request is not None - assert next_request.url == 'https://example.com/2' - - -async def test_is_empty(rq_client: MemoryRequestQueueClient) -> None: - """Test checking if the queue is empty.""" - # Initially empty - assert await rq_client.is_empty() is True - - # Add a request - request = 
Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) - - # Not empty now - assert await rq_client.is_empty() is False - - # Fetch and handle - fetched = await rq_client.fetch_next_request() - assert fetched is not None - await rq_client.mark_request_as_handled(fetched) - - # Empty again (all requests handled) - assert await rq_client.is_empty() is True - - -async def test_is_empty_with_in_progress(rq_client: MemoryRequestQueueClient) -> None: - """Test that in-progress requests don't affect is_empty.""" - # Add a request - request = Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) - - # Fetch but don't handle - await rq_client.fetch_next_request() - - # Queue should still be considered non-empty - # This is because the request hasn't been handled yet - assert await rq_client.is_empty() is False - - -async def test_drop(rq_client: MemoryRequestQueueClient) -> None: - """Test that drop removes the queue from cache and clears all data.""" - # Add a request - request = Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) - - # Drop the queue - await rq_client.drop() - - # Verify the queue is empty - assert await rq_client.is_empty() is True - - -async def test_metadata_updates(rq_client: MemoryRequestQueueClient) -> None: - """Test that operations properly update metadata timestamps.""" +async def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> None: + """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps initial_created = rq_client.metadata.created_at initial_accessed = rq_client.metadata.accessed_at @@ -390,53 +58,23 @@ async def test_metadata_updates(rq_client: MemoryRequestQueueClient) -> None: # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Perform an operation that updates modified_at and accessed_at - request = Request.from_url(url='https://example.com/test') - await rq_client.add_batch_of_requests([request]) + # Perform a read operation + await rq_client.is_empty() - # Verify timestamps + # Verify timestamps (memory-specific behavior) assert rq_client.metadata.created_at == initial_created - assert rq_client.metadata.modified_at > initial_modified assert rq_client.metadata.accessed_at > initial_accessed + assert rq_client.metadata.modified_at == initial_modified - # Wait a moment to ensure timestamps can change - await asyncio.sleep(0.01) - - # Record timestamps after add - accessed_after_add = rq_client.metadata.accessed_at - modified_after_add = rq_client.metadata.modified_at - - # Check is_empty (should only update accessed_at) - await rq_client.is_empty() + accessed_after_read = rq_client.metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) - # Verify only accessed_at changed - assert rq_client.metadata.modified_at == modified_after_add - assert rq_client.metadata.accessed_at > accessed_after_add - - -async def test_unique_key_generation(rq_client: MemoryRequestQueueClient) -> None: - """Test that unique keys are auto-generated if not provided.""" - # Add requests without explicit unique keys - requests = [ - Request.from_url(url='https://example.com/1'), - Request.from_url(url='https://example.com/1', always_enqueue=True), - ] - response = await rq_client.add_batch_of_requests(requests) + # Perform a write operation + await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) 
- # Both should be added as their auto-generated unique keys will differ - assert len(response.processed_requests) == 2 - assert all(not pr.was_already_present for pr in response.processed_requests) - - # Add a request with explicit unique key - request = Request.from_url(url='https://example.com/2', unique_key='explicit-key') - await rq_client.add_batch_of_requests([request]) - - # Add duplicate with same unique key - duplicate = Request.from_url(url='https://example.com/different', unique_key='explicit-key') - duplicate_response = await rq_client.add_batch_of_requests([duplicate]) - - # Should be marked as already present - assert duplicate_response.processed_requests[0].was_already_present is True + # Verify timestamps were updated + assert rq_client.metadata.created_at == initial_created + assert rq_client.metadata.modified_at > initial_modified + assert rq_client.metadata.accessed_at > accessed_after_read From aa9bfd3e63dfe508de7a86c7f8485f80e0cd23f0 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 27 Jun 2025 10:30:42 +0200 Subject: [PATCH 38/43] Create locks in async context only --- .../storage_clients/_file_system/_dataset_client.py | 7 +++++-- .../_file_system/_key_value_store_client.py | 7 +++++-- .../storage_clients/_file_system/_request_queue_client.py | 6 +++++- src/crawlee/storage_clients/_memory/_dataset_client.py | 2 +- .../storage_clients/_memory/_key_value_store_client.py | 2 +- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 6650212628..dc1af4fd07 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -62,6 +62,7 @@ def __init__( modified_at: datetime, item_count: int, storage_dir: Path, + lock: asyncio.Lock, ) -> None: """Initialize a new instance. @@ -78,8 +79,7 @@ def __init__( self._storage_dir = storage_dir - # Internal attributes - self._lock = asyncio.Lock() + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @property @@ -140,6 +140,7 @@ async def open( modified_at=metadata.modified_at, item_count=metadata.item_count, storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._update_metadata(update_accessed_at=True) found = True @@ -179,6 +180,7 @@ async def open( modified_at=metadata.modified_at, item_count=metadata.item_count, storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._update_metadata(update_accessed_at=True) @@ -194,6 +196,7 @@ async def open( modified_at=now, item_count=0, storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._update_metadata() diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 3c16f796a9..435aa494a7 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -60,6 +60,7 @@ def __init__( accessed_at: datetime, modified_at: datetime, storage_dir: Path, + lock: asyncio.Lock, ) -> None: """Initialize a new instance. 
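
A minimal sketch of the pattern this patch adopts, using a hypothetical `ExampleClient` that is not part of the codebase: the lock is constructed inside the async `open()` and handed to `__init__`, so it is always created while an event loop is running. On older Python versions, asyncio primitives captured the current event loop at construction time, which is what makes constructor-level lock creation fragile.

    import asyncio


    class ExampleClient:
        """Hypothetical client mirroring how the file-system clients receive their lock."""

        def __init__(self, *, lock: asyncio.Lock) -> None:
            self._lock = lock  # Created by the async caller, not at instantiation time.

        @classmethod
        async def open(cls) -> 'ExampleClient':
            # An async method guarantees a running event loop, so the lock is created
            # in the same async context that will later use it.
            return cls(lock=asyncio.Lock())
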
@@ -75,8 +76,7 @@ def __init__( self._storage_dir = storage_dir - # Internal attributes - self._lock = asyncio.Lock() + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @property @@ -136,6 +136,7 @@ async def open( accessed_at=metadata.accessed_at, modified_at=metadata.modified_at, storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._update_metadata(update_accessed_at=True) found = True @@ -172,6 +173,7 @@ async def open( accessed_at=metadata.accessed_at, modified_at=metadata.modified_at, storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._update_metadata(update_accessed_at=True) @@ -186,6 +188,7 @@ async def open( accessed_at=now, modified_at=now, storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._update_metadata() diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 084d9abda0..a8a917cacb 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -98,6 +98,7 @@ def __init__( stats: dict, total_request_count: int, storage_dir: Path, + lock: asyncio.Lock, ) -> None: """Initialize a new instance. @@ -119,7 +120,7 @@ def __init__( self._storage_dir = storage_dir """The base directory where the request queue is stored.""" - self._lock = asyncio.Lock() + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" self._request_cache = deque[Request]() @@ -194,6 +195,7 @@ async def open( client = cls( **metadata.model_dump(), storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._state.initialize() await client._discover_existing_requests() @@ -230,6 +232,7 @@ async def open( client = cls( **metadata.model_dump(), storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._state.initialize() @@ -251,6 +254,7 @@ async def open( stats={}, total_request_count=0, storage_dir=storage_dir, + lock=asyncio.Lock(), ) await client._state.initialize() await client._update_metadata() diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 3827de7bb4..2d65e67653 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -54,8 +54,8 @@ def __init__( item_count=item_count, ) - # List to hold dataset items self._records = list[dict[str, Any]]() + """List to hold dataset items. 
Each item is a dictionary representing a record.""" @property @override diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index 8e68c25e81..6205e75b7c 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -50,8 +50,8 @@ def __init__( modified_at=modified_at, ) - # Dictionary to hold key-value records with metadata self._records = dict[str, KeyValueStoreRecord]() + """Dictionary to hold key-value records.""" @property @override From d6c9877b5e09a32db4c6b1e5541af196a9c6b4e8 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 27 Jun 2025 16:11:05 +0200 Subject: [PATCH 39/43] rm open methods from base storage clients --- .../storage_clients/_base/_dataset_client.py | 26 ------------------ .../_base/_key_value_store_client.py | 27 ------------------- .../_base/_request_queue_client.py | 21 --------------- .../_file_system/_dataset_client.py | 18 ++++++++++++- .../_file_system/_key_value_store_client.py | 18 ++++++++++++- .../_file_system/_request_queue_client.py | 18 ++++++++++++- .../_memory/_dataset_client.py | 17 +++++++++--- .../_memory/_key_value_store_client.py | 17 +++++++++--- .../_memory/_request_queue_client.py | 27 ++++++++++++------- .../_memory/_storage_client.py | 6 ++--- 10 files changed, 98 insertions(+), 97 deletions(-) diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index 3ae8d38f77..203f35f701 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -9,7 +9,6 @@ from collections.abc import AsyncIterator from typing import Any - from crawlee.configuration import Configuration from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -32,31 +31,6 @@ class DatasetClient(ABC): def metadata(self) -> DatasetMetadata: """The metadata of the dataset.""" - @classmethod - @abstractmethod - async def open( - cls, - *, - id: str | None, - name: str | None, - configuration: Configuration, - ) -> DatasetClient: - """Open existing or create a new dataset client. - - If a dataset with the given name or ID already exists, the appropriate dataset client is returned. - Otherwise, a new dataset is created and client for it is returned. - - The backend method for the `Dataset.open` call. - - Args: - id: The ID of the dataset. If not provided, an ID may be generated. - name: The name of the dataset. If not provided a default name may be used. - configuration: The configuration object. - - Returns: - A dataset client instance. - """ - @abstractmethod async def drop(self) -> None: """Drop the whole dataset and remove all its items. 
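
With the abstract `open()` gone from the base interface, each concrete client defines its own `open()` classmethod and the signatures are free to differ: later in this patch the memory clients drop the `configuration` parameter while the file-system clients keep it. A minimal sketch contrasting the two dataset clients follows; the import paths are assumed from the file layout and `'my-dataset'` is a placeholder name.

    from crawlee.configuration import Configuration
    from crawlee.storage_clients._file_system import FileSystemDatasetClient
    from crawlee.storage_clients._memory import MemoryDatasetClient


    async def open_both() -> None:
        configuration = Configuration.get_global_configuration()
        # The file-system client still needs the configuration to resolve its storage directory.
        fs_client = await FileSystemDatasetClient.open(id=None, name='my-dataset', configuration=configuration)
        # The memory client keeps everything in RAM, so no configuration is passed.
        memory_client = await MemoryDatasetClient.open(id=None, name='my-dataset')
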
diff --git a/src/crawlee/storage_clients/_base/_key_value_store_client.py b/src/crawlee/storage_clients/_base/_key_value_store_client.py index df56bc1ac8..b01c278f11 100644 --- a/src/crawlee/storage_clients/_base/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_base/_key_value_store_client.py @@ -8,7 +8,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from crawlee.configuration import Configuration from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata @@ -31,32 +30,6 @@ class KeyValueStoreClient(ABC): def metadata(self) -> KeyValueStoreMetadata: """The metadata of the key-value store.""" - @classmethod - @abstractmethod - async def open( - cls, - *, - id: str | None, - name: str | None, - configuration: Configuration, - ) -> KeyValueStoreClient: - """Open existing or create a new key-value store client. - - If a key-value store with the given name or ID already exists, the appropriate - key-value store client is returned. Otherwise, a new key-value store is created - and a client for it is returned. - - The backend method for the `KeyValueStoreClient.open` call. - - Args: - id: The ID of the key-value store. If not provided, an ID may be generated. - name: The name of the key-value store. If not provided a default name may be used. - configuration: The configuration object. - - Returns: - A key-value store client instance. - """ - @abstractmethod async def drop(self) -> None: """Drop the whole key-value store and remove all its values. diff --git a/src/crawlee/storage_clients/_base/_request_queue_client.py b/src/crawlee/storage_clients/_base/_request_queue_client.py index 64659b4d02..136c8531ee 100644 --- a/src/crawlee/storage_clients/_base/_request_queue_client.py +++ b/src/crawlee/storage_clients/_base/_request_queue_client.py @@ -9,7 +9,6 @@ from collections.abc import Sequence from crawlee import Request - from crawlee.configuration import Configuration from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata @@ -26,26 +25,6 @@ class RequestQueueClient(ABC): def metadata(self) -> RequestQueueMetadata: """The metadata of the request queue.""" - @classmethod - @abstractmethod - async def open( - cls, - *, - id: str | None, - name: str | None, - configuration: Configuration, - ) -> RequestQueueClient: - """Open a request queue client. - - Args: - id: ID of the queue to open. If not provided, a new queue will be created with a random ID. - name: Name of the queue to open. If not provided, the queue will be unnamed. - configuration: The configuration object. - - Returns: - A request queue client. - """ - @abstractmethod async def drop(self) -> None: """Drop the whole request queue and remove all its values. diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index dc1af4fd07..1717a5fd3a 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -100,7 +100,6 @@ def path_to_metadata(self) -> Path: """The full path to the dataset metadata file.""" return self.path_to_dataset / METADATA_FILENAME - @override @classmethod async def open( cls, @@ -109,6 +108,23 @@ async def open( name: str | None, configuration: Configuration, ) -> FileSystemDatasetClient: + """Open or create a file system dataset client. + + This method attempts to open an existing dataset from the file system. 
If a dataset with the specified ID + or name exists, it loads the metadata from the stored files. If no existing dataset is found, a new one + is created. + + Args: + id: The ID of the dataset to open. If provided, searches for existing dataset by ID. + name: The name of the dataset to open. If not provided, uses the default dataset. + configuration: The configuration object containing storage directory settings. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If a dataset with the specified ID is not found, or if metadata is invalid. + """ storage_dir = Path(configuration.storage_dir) dataset_base_path = storage_dir / cls._STORAGE_SUBDIR diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 435aa494a7..5188277edf 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -97,7 +97,6 @@ def path_to_metadata(self) -> Path: """The full path to the key-value store metadata file.""" return self.path_to_kvs / METADATA_FILENAME - @override @classmethod async def open( cls, @@ -106,6 +105,23 @@ async def open( name: str | None, configuration: Configuration, ) -> FileSystemKeyValueStoreClient: + """Open or create a file system key-value store client. + + This method attempts to open an existing key-value store from the file system. If a KVS with the specified + ID or name exists, it loads the metadata from the stored files. If no existing store is found, a new one + is created. + + Args: + id: The ID of the key-value store to open. If provided, searches for existing store by ID. + name: The name of the key-value store to open. If not provided, uses the default store. + configuration: The configuration object containing storage directory settings. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If a store with the specified ID is not found, or if metadata is invalid. + """ storage_dir = Path(configuration.storage_dir) kvs_base_path = storage_dir / cls._STORAGE_SUBDIR diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index a8a917cacb..700e670e36 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -159,7 +159,6 @@ def path_to_metadata(self) -> Path: """The full path to the request queue metadata file.""" return self.path_to_rq / METADATA_FILENAME - @override @classmethod async def open( cls, @@ -168,6 +167,23 @@ async def open( name: str | None, configuration: Configuration, ) -> FileSystemRequestQueueClient: + """Open or create a file system request queue client. + + This method attempts to open an existing request queue from the file system. If a queue with the specified + ID or name exists, it loads the metadata and state from the stored files. If no existing queue is found, + a new one is created. + + Args: + id: The ID of the request queue to open. If provided, searches for existing queue by ID. + name: The name of the request queue to open. If not provided, uses the default queue. + configuration: The configuration object containing storage directory settings. + + Returns: + An instance for the opened or created storage client. 
+ + Raises: + ValueError: If a queue with the specified ID is not found, or if metadata is invalid. + """ storage_dir = Path(configuration.storage_dir) rq_base_path = storage_dir / cls._STORAGE_SUBDIR diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 2d65e67653..0b639ded51 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -13,8 +13,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from crawlee.configuration import Configuration - logger = getLogger(__name__) @@ -62,15 +60,26 @@ def __init__( def metadata(self) -> DatasetMetadata: return self._metadata - @override @classmethod async def open( cls, *, id: str | None, name: str | None, - configuration: Configuration, ) -> MemoryDatasetClient: + """Open or create a new memory dataset client. + + This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory + datasets don't check for existing datasets with the same name or ID since all data exists only in memory + and is lost when the process terminates. + + Args: + id: The ID of the dataset. If not provided, a random ID will be generated. + name: The name of the dataset. If not provided, the dataset will be unnamed. + + Returns: + An instance for the opened or created storage client. + """ # Otherwise create a new dataset dataset_id = id or crypto_random_object_id() now = datetime.now(timezone.utc) diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index 6205e75b7c..f7c67daed1 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -14,8 +14,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from crawlee.configuration import Configuration - class MemoryKeyValueStoreClient(KeyValueStoreClient): """Memory implementation of the key-value store client. @@ -58,15 +56,26 @@ def __init__( def metadata(self) -> KeyValueStoreMetadata: return self._metadata - @override @classmethod async def open( cls, *, id: str | None, name: str | None, - configuration: Configuration, ) -> MemoryKeyValueStoreClient: + """Open or create a new memory key-value store client. + + This method creates a new in-memory key-value store instance. Unlike persistent storage implementations, + memory KVS don't check for existing stores with the same name or ID since all data exists only in memory + and is lost when the process terminates. + + Args: + id: The ID of the key-value store. If not provided, a random ID will be generated. + name: The name of the key-value store. If not provided, the store will be unnamed. + + Returns: + An instance for the opened or created storage client. 
+ """ # Otherwise create a new key-value store store_id = id or crypto_random_object_id() now = datetime.now(timezone.utc) diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index cdf7a86dd2..48e99c4af8 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -16,20 +16,18 @@ if TYPE_CHECKING: from collections.abc import Sequence - from crawlee.configuration import Configuration - logger = getLogger(__name__) class MemoryRequestQueueClient(RequestQueueClient): """Memory implementation of the request queue client. - No data is persisted between process runs, which means all requests are lost when - the program terminates. This implementation is primarily useful for testing, - development, and short-lived crawler runs where persistence is not required. + No data is persisted between process runs, which means all requests are lost when the program terminates. + This implementation is primarily useful for testing, development, and short-lived crawler runs where + persistence is not required. - This client provides fast access to request data but is limited by available memory and - does not support data sharing across different processes. + This client provides fast access to request data but is limited by available memory and does not support + data sharing across different processes. """ def __init__( @@ -83,15 +81,26 @@ def __init__( def metadata(self) -> RequestQueueMetadata: return self._metadata - @override @classmethod async def open( cls, *, id: str | None, name: str | None, - configuration: Configuration, ) -> MemoryRequestQueueClient: + """Open or create a new memory request queue client. + + This method creates a new in-memory request queue instance. Unlike persistent storage implementations, + memory queues don't check for existing queues with the same name or ID since all data exists only + in memory and is lost when the process terminates. + + Args: + id: The ID of the request queue. If not provided, a random ID will be generated. + name: The name of the request queue. If not provided, the queue will be unnamed. + + Returns: + An instance for the opened or created storage client. 
+ """ # Otherwise create a new queue queue_id = id or crypto_random_object_id() now = datetime.now(timezone.utc) diff --git a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py index 9e3a2a4d2f..645294cad7 100644 --- a/src/crawlee/storage_clients/_memory/_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_storage_client.py @@ -36,7 +36,7 @@ async def create_dataset_client( configuration: Configuration | None = None, ) -> MemoryDatasetClient: configuration = configuration or Configuration.get_global_configuration() - client = await MemoryDatasetClient.open(id=id, name=name, configuration=configuration) + client = await MemoryDatasetClient.open(id=id, name=name) await self._purge_if_needed(client, configuration) return client @@ -49,7 +49,7 @@ async def create_kvs_client( configuration: Configuration | None = None, ) -> MemoryKeyValueStoreClient: configuration = configuration or Configuration.get_global_configuration() - client = await MemoryKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + client = await MemoryKeyValueStoreClient.open(id=id, name=name) await self._purge_if_needed(client, configuration) return client @@ -62,6 +62,6 @@ async def create_rq_client( configuration: Configuration | None = None, ) -> MemoryRequestQueueClient: configuration = configuration or Configuration.get_global_configuration() - client = await MemoryRequestQueueClient.open(id=id, name=name, configuration=configuration) + client = await MemoryRequestQueueClient.open(id=id, name=name) await self._purge_if_needed(client, configuration) return client From 3b133ceed410bbe68a196f5937ecf843806cb25a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 30 Jun 2025 16:34:53 +0200 Subject: [PATCH 40/43] update storage clients inits --- .../_file_system/_dataset_client.py | 36 +++++-------------- .../_file_system/_key_value_store_client.py | 32 +++++------------ .../_file_system/_request_queue_client.py | 35 +++++------------- .../_memory/_dataset_client.py | 20 +++-------- .../_memory/_key_value_store_client.py | 18 +++------- .../_memory/_request_queue_client.py | 28 +++------------ 6 files changed, 42 insertions(+), 127 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 1717a5fd3a..4fedf20477 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -55,12 +55,7 @@ class FileSystemDatasetClient(DatasetClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, - item_count: int, + metadata: DatasetMetadata, storage_dir: Path, lock: asyncio.Lock, ) -> None: @@ -68,16 +63,10 @@ def __init__( Preferably use the `FileSystemDatasetClient.open` class method to create a new instance. 
""" - self._metadata = DatasetMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - item_count=item_count, - ) + self._metadata = metadata self._storage_dir = storage_dir + """The base directory where the storage data are being persisted.""" self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @@ -149,12 +138,7 @@ async def open( metadata = DatasetMetadata(**file_content) if metadata.id == id: client = cls( - id=metadata.id, - name=metadata.name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, - item_count=metadata.item_count, + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) @@ -189,12 +173,7 @@ async def open( raise ValueError(f'Invalid metadata file for dataset "{name}"') from exc client = cls( - id=metadata.id, - name=name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, - item_count=metadata.item_count, + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) @@ -204,13 +183,16 @@ async def open( # Otherwise, create a new dataset client. else: now = datetime.now(timezone.utc) - client = cls( + metadata = DatasetMetadata( id=crypto_random_object_id(), name=name, created_at=now, accessed_at=now, modified_at=now, item_count=0, + ) + client = cls( + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 5188277edf..5f505e9fe6 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -54,11 +54,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, + metadata: KeyValueStoreMetadata, storage_dir: Path, lock: asyncio.Lock, ) -> None: @@ -66,15 +62,10 @@ def __init__( Preferably use the `FileSystemKeyValueStoreClient.open` class method to create a new instance. """ - self._metadata = KeyValueStoreMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - ) + self._metadata = metadata self._storage_dir = storage_dir + """The base directory where the storage data are being persisted.""" self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @@ -146,11 +137,7 @@ async def open( metadata = KeyValueStoreMetadata(**file_content) if metadata.id == id: client = cls( - id=metadata.id, - name=metadata.name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) @@ -183,11 +170,7 @@ async def open( raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc client = cls( - id=metadata.id, - name=name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) @@ -197,12 +180,15 @@ async def open( # Otherwise, create a new key-value store client. 
else: now = datetime.now(timezone.utc) - client = cls( + metadata = KeyValueStoreMetadata( id=crypto_random_object_id(), name=name, created_at=now, accessed_at=now, modified_at=now, + ) + client = cls( + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 700e670e36..a76029d8fe 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -87,16 +87,7 @@ class FileSystemRequestQueueClient(RequestQueueClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, - had_multiple_clients: bool, - handled_request_count: int, - pending_request_count: int, - stats: dict, - total_request_count: int, + metadata: RequestQueueMetadata, storage_dir: Path, lock: asyncio.Lock, ) -> None: @@ -104,21 +95,10 @@ def __init__( Preferably use the `FileSystemRequestQueueClient.open` class method to create a new instance. """ - self._metadata = RequestQueueMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - had_multiple_clients=had_multiple_clients, - handled_request_count=handled_request_count, - pending_request_count=pending_request_count, - stats=stats, - total_request_count=total_request_count, - ) + self._metadata = metadata self._storage_dir = storage_dir - """The base directory where the request queue is stored.""" + """The base directory where the storage data are being persisted.""" self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @@ -209,7 +189,7 @@ async def open( if metadata.id == id: client = cls( - **metadata.model_dump(), + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) @@ -246,7 +226,7 @@ async def open( metadata.name = name client = cls( - **metadata.model_dump(), + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) @@ -258,7 +238,7 @@ async def open( # Otherwise, create a new dataset client. else: now = datetime.now(timezone.utc) - client = cls( + metadata = RequestQueueMetadata( id=crypto_random_object_id(), name=name, created_at=now, @@ -269,6 +249,9 @@ async def open( pending_request_count=0, stats={}, total_request_count=0, + ) + client = cls( + metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), ) diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 0b639ded51..1960cb8187 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -32,25 +32,13 @@ class MemoryDatasetClient(DatasetClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, - item_count: int, + metadata: DatasetMetadata, ) -> None: """Initialize a new instance. Preferably use the `MemoryDatasetClient.open` class method to create a new instance. """ - self._metadata = DatasetMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - item_count=item_count, - ) + self._metadata = metadata self._records = list[dict[str, Any]]() """List to hold dataset items. 
Each item is a dictionary representing a record.""" @@ -84,7 +72,7 @@ async def open( dataset_id = id or crypto_random_object_id() now = datetime.now(timezone.utc) - return cls( + metadata = DatasetMetadata( id=dataset_id, name=name, created_at=now, @@ -93,6 +81,8 @@ async def open( item_count=0, ) + return cls(metadata=metadata) + @override async def drop(self) -> None: self._records.clear() diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index f7c67daed1..ea078eb07c 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -30,23 +30,13 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, + metadata: KeyValueStoreMetadata, ) -> None: """Initialize a new instance. Preferably use the `MemoryKeyValueStoreClient.open` class method to create a new instance. """ - self._metadata = KeyValueStoreMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - ) + self._metadata = metadata self._records = dict[str, KeyValueStoreRecord]() """Dictionary to hold key-value records.""" @@ -80,7 +70,7 @@ async def open( store_id = id or crypto_random_object_id() now = datetime.now(timezone.utc) - return cls( + metadata = KeyValueStoreMetadata( id=store_id, name=name, created_at=now, @@ -88,6 +78,8 @@ async def open( modified_at=now, ) + return cls(metadata=metadata) + @override async def drop(self) -> None: self._records.clear() diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 48e99c4af8..b7ab1ce6c7 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -33,33 +33,13 @@ class MemoryRequestQueueClient(RequestQueueClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, - had_multiple_clients: bool, - handled_request_count: int, - pending_request_count: int, - stats: dict, - total_request_count: int, + metadata: RequestQueueMetadata, ) -> None: """Initialize a new instance. Preferably use the `MemoryRequestQueueClient.open` class method to create a new instance. 
""" - self._metadata = RequestQueueMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - had_multiple_clients=had_multiple_clients, - handled_request_count=handled_request_count, - pending_request_count=pending_request_count, - stats=stats, - total_request_count=total_request_count, - ) + self._metadata = metadata self._pending_requests = deque[Request]() """Pending requests are those that have been added to the queue but not yet fetched for processing.""" @@ -105,7 +85,7 @@ async def open( queue_id = id or crypto_random_object_id() now = datetime.now(timezone.utc) - return cls( + metadata = RequestQueueMetadata( id=queue_id, name=name, created_at=now, @@ -118,6 +98,8 @@ async def open( total_request_count=0, ) + return cls(metadata=metadata) + @override async def drop(self) -> None: self._pending_requests.clear() From 43b9fe920caf9129f2230ae127a4c191d3d5d77c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 1 Jul 2025 13:29:07 +0200 Subject: [PATCH 41/43] async metadata getter --- .../storage_clients/_base/_dataset_client.py | 5 +- .../_base/_key_value_store_client.py | 5 +- .../_base/_request_queue_client.py | 5 +- .../storage_clients/_base/_storage_client.py | 3 +- .../_file_system/_dataset_client.py | 7 +- .../_file_system/_key_value_store_client.py | 7 +- .../_file_system/_request_queue_client.py | 7 +- .../_memory/_dataset_client.py | 6 +- .../_memory/_key_value_store_client.py | 3 +- .../_memory/_request_queue_client.py | 3 +- src/crawlee/storages/_base.py | 3 +- src/crawlee/storages/_dataset.py | 17 +++-- src/crawlee/storages/_key_value_store.py | 19 ++++-- src/crawlee/storages/_request_queue.py | 23 ++++--- .../storages/_storage_instance_manager.py | 3 +- .../_file_system/test_fs_dataset_client.py | 32 +++++---- .../_file_system/test_fs_kvs_client.py | 27 ++++---- .../_file_system/test_fs_rq_client.py | 34 +++++----- .../_memory/test_memory_dataset_client.py | 23 ++++--- .../_memory/test_memory_kvs_client.py | 23 ++++--- .../_memory/test_memory_rq_client.py | 23 ++++--- tests/unit/storages/test_dataset.py | 24 +++++-- tests/unit/storages/test_request_queue.py | 67 +++++++++++-------- 23 files changed, 209 insertions(+), 160 deletions(-) diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index 203f35f701..840d816ea2 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -26,10 +26,9 @@ class DatasetClient(ABC): This abstract class defines the interface that all specific dataset clients must implement. """ - @property @abstractmethod - def metadata(self) -> DatasetMetadata: - """The metadata of the dataset.""" + async def get_metadata(self) -> DatasetMetadata: + """Get the metadata of the dataset.""" @abstractmethod async def drop(self) -> None: diff --git a/src/crawlee/storage_clients/_base/_key_value_store_client.py b/src/crawlee/storage_clients/_base/_key_value_store_client.py index b01c278f11..0def370551 100644 --- a/src/crawlee/storage_clients/_base/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_base/_key_value_store_client.py @@ -25,10 +25,9 @@ class KeyValueStoreClient(ABC): This abstract class defines the interface that all specific KVS clients must implement. 
""" - @property @abstractmethod - def metadata(self) -> KeyValueStoreMetadata: - """The metadata of the key-value store.""" + async def get_metadata(self) -> KeyValueStoreMetadata: + """Get the metadata of the key-value store.""" @abstractmethod async def drop(self) -> None: diff --git a/src/crawlee/storage_clients/_base/_request_queue_client.py b/src/crawlee/storage_clients/_base/_request_queue_client.py index 136c8531ee..c50b1af685 100644 --- a/src/crawlee/storage_clients/_base/_request_queue_client.py +++ b/src/crawlee/storage_clients/_base/_request_queue_client.py @@ -20,10 +20,9 @@ class RequestQueueClient(ABC): client, like a memory storage client. """ - @property @abstractmethod - def metadata(self) -> RequestQueueMetadata: - """The metadata of the request queue.""" + async def get_metadata(self) -> RequestQueueMetadata: + """Get the metadata of the request queue.""" @abstractmethod async def drop(self) -> None: diff --git a/src/crawlee/storage_clients/_base/_storage_client.py b/src/crawlee/storage_clients/_base/_storage_client.py index de49500d20..ef27e3e563 100644 --- a/src/crawlee/storage_clients/_base/_storage_client.py +++ b/src/crawlee/storage_clients/_base/_storage_client.py @@ -77,5 +77,6 @@ async def _purge_if_needed( client: The storage client to potentially purge. configuration: Configuration that determines whether purging should occur. """ - if configuration.purge_on_start and client.metadata.name is None: + metadata = await client.get_metadata() + if configuration.purge_on_start and metadata.name is None: await client.purge() diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 4fedf20477..54b0fe30ca 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -71,18 +71,17 @@ def __init__( self._lock = lock """A lock to ensure that only one operation is performed at a time.""" - @property @override - def metadata(self) -> DatasetMetadata: + async def get_metadata(self) -> DatasetMetadata: return self._metadata @property def path_to_dataset(self) -> Path: """The full path to the dataset directory.""" - if self.metadata.name is None: + if self._metadata.name is None: return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT - return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name + return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name @property def path_to_metadata(self) -> Path: diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 5f505e9fe6..bc94980bcc 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -70,18 +70,17 @@ def __init__( self._lock = lock """A lock to ensure that only one operation is performed at a time.""" - @property @override - def metadata(self) -> KeyValueStoreMetadata: + async def get_metadata(self) -> KeyValueStoreMetadata: return self._metadata @property def path_to_kvs(self) -> Path: """The full path to the key-value store directory.""" - if self.metadata.name is None: + if self._metadata.name is None: return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT - return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name + return self._storage_dir / self._STORAGE_SUBDIR / 
self._metadata.name @property def path_to_metadata(self) -> Path: diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index a76029d8fe..e574855e99 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -121,18 +121,17 @@ def __init__( ) """Recoverable state to maintain request ordering, in-progress status, and handled status.""" - @property @override - def metadata(self) -> RequestQueueMetadata: + async def get_metadata(self) -> RequestQueueMetadata: return self._metadata @property def path_to_rq(self) -> Path: """The full path to the request queue directory.""" - if self.metadata.name is None: + if self._metadata.name is None: return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT - return self._storage_dir / self._STORAGE_SUBDIR / self.metadata.name + return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name @property def path_to_metadata(self) -> Path: diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 1960cb8187..dd64a9d9ed 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -43,9 +43,8 @@ def __init__( self._records = list[dict[str, Any]]() """List to hold dataset items. Each item is a dictionary representing a record.""" - @property @override - def metadata(self) -> DatasetMetadata: + async def get_metadata(self) -> DatasetMetadata: return self._metadata @classmethod @@ -103,7 +102,8 @@ async def purge(self) -> None: @override async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: - new_item_count = self.metadata.item_count + metadata = await self.get_metadata() + new_item_count = metadata.item_count if isinstance(data, list): for item in data: diff --git a/src/crawlee/storage_clients/_memory/_key_value_store_client.py b/src/crawlee/storage_clients/_memory/_key_value_store_client.py index ea078eb07c..7dacf6d95d 100644 --- a/src/crawlee/storage_clients/_memory/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_memory/_key_value_store_client.py @@ -41,9 +41,8 @@ def __init__( self._records = dict[str, KeyValueStoreRecord]() """Dictionary to hold key-value records.""" - @property @override - def metadata(self) -> KeyValueStoreMetadata: + async def get_metadata(self) -> KeyValueStoreMetadata: return self._metadata @classmethod diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index b7ab1ce6c7..ad166e20bd 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -56,9 +56,8 @@ def __init__( self._requests_by_unique_key = dict[str, Request]() """Unique key -> Request mapping for fast lookup by unique key.""" - @property @override - def metadata(self) -> RequestQueueMetadata: + async def get_metadata(self) -> RequestQueueMetadata: return self._metadata @classmethod diff --git a/src/crawlee/storages/_base.py b/src/crawlee/storages/_base.py index fc0a04979c..073d27f77c 100644 --- a/src/crawlee/storages/_base.py +++ b/src/crawlee/storages/_base.py @@ -22,9 +22,8 @@ def id(self) -> str: def name(self) -> str | None: """Get the storage name.""" - @property @abstractmethod - def metadata(self) -> 
DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata: + async def get_metadata(self) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata: """Get the storage metadata.""" @classmethod diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index 50badbf246..7004e4cd2d 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -65,30 +65,33 @@ class Dataset(Storage): ``` """ - def __init__(self, client: DatasetClient) -> None: + def __init__(self, client: DatasetClient, id: str, name: str | None) -> None: """Initialize a new instance. Preferably use the `Dataset.open` constructor to create a new instance. Args: - client: An instance of a dataset client. + client: An instance of a storage client. + id: The unique identifier of the storage. + name: The name of the storage, if available. """ self._client = client + self._id = id + self._name = name @property @override def id(self) -> str: - return self._client.metadata.id + return self._id @property @override def name(self) -> str | None: - return self._client.metadata.name + return self._name - @property @override - def metadata(self) -> DatasetMetadata: - return self._client.metadata + async def get_metadata(self) -> DatasetMetadata: + return await self._client.get_metadata() @override @classmethod diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 8838f39c2b..f205011bfb 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -74,31 +74,36 @@ class KeyValueStore(Storage): ] = {} """Cache for recoverable (auto-saved) values.""" - def __init__(self, client: KeyValueStoreClient) -> None: + def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> None: """Initialize a new instance. Preferably use the `KeyValueStore.open` constructor to create a new instance. Args: - client: An instance of a key-value store client. + client: An instance of a storage client. + id: The unique identifier of the storage. + name: The name of the storage, if available. """ self._client = client + self._id = id + self._name = name + self._autosave_lock = asyncio.Lock() + """Lock for autosaving values to prevent concurrent modifications.""" @property @override def id(self) -> str: - return self._client.metadata.id + return self._id @property @override def name(self) -> str | None: - return self._client.metadata.name + return self._name - @property @override - def metadata(self) -> KeyValueStoreMetadata: - return self._client.metadata + async def get_metadata(self) -> KeyValueStoreMetadata: + return await self._client.get_metadata() @override @classmethod diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 36551a6e16..c1b0227bdf 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -70,15 +70,19 @@ class RequestQueue(Storage, RequestManager): ``` """ - def __init__(self, client: RequestQueueClient) -> None: + def __init__(self, client: RequestQueueClient, id: str, name: str | None) -> None: """Initialize a new instance. Preferably use the `RequestQueue.open` constructor to create a new instance. Args: - client: An instance of a request queue client. + client: An instance of a storage client. + id: The unique identifier of the storage. + name: The name of the storage, if available. 
""" self._client = client + self._id = id + self._name = name self._add_requests_tasks = list[asyncio.Task]() """A list of tasks for adding requests to the queue.""" @@ -86,25 +90,26 @@ def __init__(self, client: RequestQueueClient) -> None: @property @override def id(self) -> str: - return self._client.metadata.id + return self._id @property @override def name(self) -> str | None: - return self._client.metadata.name + return self._name - @property @override - def metadata(self) -> RequestQueueMetadata: - return self._client.metadata + async def get_metadata(self) -> RequestQueueMetadata: + return await self._client.get_metadata() @override async def get_handled_count(self) -> int: - return self._client.metadata.handled_request_count + metadata = await self._client.get_metadata() + return metadata.handled_request_count @override async def get_total_count(self) -> int: - return self._client.metadata.total_request_count + metadata = await self._client.get_metadata() + return metadata.total_request_count @override @classmethod diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index e4cdc5587f..7e51a7343b 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -75,7 +75,8 @@ async def open_storage_instance( # Create new instance client = await client_opener(id=id, name=name, configuration=configuration) - instance = cls(client) # type: ignore[call-arg] + metadata = await client.get_metadata() + instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg] instance_name = getattr(instance, 'name', None) # Cache the instance diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py index 0f0ca0fe19..c5f31f144e 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -49,7 +49,8 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None # Verify metadata file structure with client.path_to_metadata.open() as f: metadata = json.load(f) - assert metadata['id'] == client.metadata.id + client_metadata = await client.get_metadata() + assert metadata['id'] == client_metadata.id assert metadata['name'] == 'new_dataset' assert metadata['item_count'] == 0 @@ -99,9 +100,10 @@ async def test_drop_removes_files_from_disk(dataset_client: FileSystemDatasetCli async def test_metadata_file_updates(dataset_client: FileSystemDatasetClient) -> None: """Test that metadata file is updated correctly after operations.""" # Record initial timestamps - initial_created = dataset_client.metadata.created_at - initial_accessed = dataset_client.metadata.accessed_at - initial_modified = dataset_client.metadata.modified_at + metadata = await dataset_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -110,11 +112,12 @@ async def test_metadata_file_updates(dataset_client: FileSystemDatasetClient) -> await dataset_client.get_data() # Verify timestamps - assert dataset_client.metadata.created_at == initial_created - assert dataset_client.metadata.accessed_at > initial_accessed - assert dataset_client.metadata.modified_at == initial_modified + metadata = await dataset_client.get_metadata() + assert 
metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified - accessed_after_get = dataset_client.metadata.accessed_at + accessed_after_get = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -123,14 +126,15 @@ async def test_metadata_file_updates(dataset_client: FileSystemDatasetClient) -> await dataset_client.push_data({'new': 'item'}) # Verify timestamps again - assert dataset_client.metadata.created_at == initial_created - assert dataset_client.metadata.modified_at > initial_modified - assert dataset_client.metadata.accessed_at > accessed_after_get + metadata = await dataset_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_get # Verify metadata file is updated on disk with dataset_client.path_to_metadata.open() as f: - metadata = json.load(f) - assert metadata['item_count'] == 1 + metadata_json = json.load(f) + assert metadata_json['item_count'] == 1 async def test_data_persistence_across_reopens(configuration: Configuration) -> None: @@ -146,7 +150,7 @@ async def test_data_persistence_across_reopens(configuration: Configuration) -> test_data = {'test_item': 'test_value', 'id': 123} await original_client.push_data(test_data) - dataset_id = original_client.metadata.id + dataset_id = (await original_client.get_metadata()).id # Reopen by ID and verify data persists reopened_client = await storage_client.create_dataset_client( diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index 0c36258ccc..c5bfa96c47 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -49,7 +49,7 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None # Verify metadata file structure with client.path_to_metadata.open() as f: metadata = json.load(f) - assert metadata['id'] == client.metadata.id + assert metadata['id'] == (await client.get_metadata()).id assert metadata['name'] == 'new_kvs' await client.drop() @@ -150,9 +150,10 @@ async def test_drop_removes_directory(kvs_client: FileSystemKeyValueStoreClient) async def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) -> None: """Test that read/write operations properly update metadata file timestamps.""" # Record initial timestamps - initial_created = kvs_client.metadata.created_at - initial_accessed = kvs_client.metadata.accessed_at - initial_modified = kvs_client.metadata.modified_at + metadata = await kvs_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -161,11 +162,12 @@ async def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) await kvs_client.get_value(key='nonexistent') # Verify accessed timestamp was updated - assert kvs_client.metadata.created_at == initial_created - assert kvs_client.metadata.accessed_at > initial_accessed - assert kvs_client.metadata.modified_at == initial_modified + metadata = await kvs_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified - accessed_after_read = 
kvs_client.metadata.accessed_at + accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -174,9 +176,10 @@ async def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) await kvs_client.set_value(key='test', value='test-value') # Verify modified timestamp was updated - assert kvs_client.metadata.created_at == initial_created - assert kvs_client.metadata.modified_at > initial_modified - assert kvs_client.metadata.accessed_at > accessed_after_read + metadata = await kvs_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_read async def test_data_persistence_across_reopens(configuration: Configuration) -> None: @@ -193,7 +196,7 @@ async def test_data_persistence_across_reopens(configuration: Configuration) -> test_value = 'persistent-value' await original_client.set_value(key=test_key, value=test_value) - kvs_id = original_client.metadata.id + kvs_id = (await original_client.get_metadata()).id # Reopen by ID and verify data persists reopened_client = await storage_client.create_kvs_client( diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 1b9d329b3d..0be182fcd8 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -49,7 +49,7 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None # Verify metadata file structure with client.path_to_metadata.open() as f: metadata = json.load(f) - assert metadata['id'] == client.metadata.id + assert metadata['id'] == (await client.get_metadata()).id assert metadata['name'] == 'new_request_queue' await client.drop() @@ -98,9 +98,10 @@ async def test_drop_removes_directory(rq_client: FileSystemRequestQueueClient) - async def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> None: """Test that metadata file is updated correctly after operations.""" # Record initial timestamps - initial_created = rq_client.metadata.created_at - initial_accessed = rq_client.metadata.accessed_at - initial_modified = rq_client.metadata.modified_at + metadata = await rq_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -109,11 +110,12 @@ async def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> await rq_client.is_empty() # Verify accessed timestamp was updated - assert rq_client.metadata.created_at == initial_created - assert rq_client.metadata.accessed_at > initial_accessed - assert rq_client.metadata.modified_at == initial_modified + metadata = await rq_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified - accessed_after_read = rq_client.metadata.accessed_at + accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -122,14 +124,15 @@ async def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) # Verify modified timestamp was updated - assert rq_client.metadata.created_at == initial_created - 
assert rq_client.metadata.modified_at > initial_modified - assert rq_client.metadata.accessed_at > accessed_after_read + metadata = await rq_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_read # Verify metadata file is updated on disk with rq_client.path_to_metadata.open() as f: - metadata = json.load(f) - assert metadata['total_request_count'] == 1 + metadata_json = json.load(f) + assert metadata_json['total_request_count'] == 1 async def test_data_persistence_across_reopens(configuration: Configuration) -> None: @@ -148,7 +151,7 @@ async def test_data_persistence_across_reopens(configuration: Configuration) -> ] await original_client.add_batch_of_requests(test_requests) - rq_id = original_client.metadata.id + rq_id = (await original_client.get_metadata()).id # Reopen by ID and verify requests persist reopened_client = await storage_client.create_rq_client( @@ -156,7 +159,8 @@ async def test_data_persistence_across_reopens(configuration: Configuration) -> configuration=configuration, ) - assert reopened_client.metadata.total_request_count == 2 + metadata = await reopened_client.get_metadata() + assert metadata.total_request_count == 2 # Fetch requests to verify they're still there request1 = await reopened_client.fetch_next_request() diff --git a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py index cbea599bc2..8cc846b0f4 100644 --- a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py @@ -51,9 +51,10 @@ async def test_memory_specific_purge_behavior() -> None: async def test_memory_metadata_updates(dataset_client: MemoryDatasetClient) -> None: """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps - initial_created = dataset_client.metadata.created_at - initial_accessed = dataset_client.metadata.accessed_at - initial_modified = dataset_client.metadata.modified_at + metadata = await dataset_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -62,11 +63,12 @@ async def test_memory_metadata_updates(dataset_client: MemoryDatasetClient) -> N await dataset_client.get_data() # Verify timestamps (memory-specific behavior) - assert dataset_client.metadata.created_at == initial_created - assert dataset_client.metadata.accessed_at > initial_accessed - assert dataset_client.metadata.modified_at == initial_modified + metadata = await dataset_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified - accessed_after_read = dataset_client.metadata.accessed_at + accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -75,6 +77,7 @@ async def test_memory_metadata_updates(dataset_client: MemoryDatasetClient) -> N await dataset_client.push_data({'new': 'item'}) # Verify timestamps were updated - assert dataset_client.metadata.created_at == initial_created - assert dataset_client.metadata.modified_at > initial_modified - assert dataset_client.metadata.accessed_at > accessed_after_read + metadata = await dataset_client.get_metadata() + 
assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_read diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py index 6b4388984e..463fb2a14c 100644 --- a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -52,9 +52,10 @@ async def test_memory_specific_purge_behavior() -> None: async def test_memory_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> None: """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps - initial_created = kvs_client.metadata.created_at - initial_accessed = kvs_client.metadata.accessed_at - initial_modified = kvs_client.metadata.modified_at + metadata = await kvs_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -63,11 +64,12 @@ async def test_memory_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> await kvs_client.get_value(key='nonexistent') # Verify timestamps (memory-specific behavior) - assert kvs_client.metadata.created_at == initial_created - assert kvs_client.metadata.accessed_at > initial_accessed - assert kvs_client.metadata.modified_at == initial_modified + metadata = await kvs_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified - accessed_after_read = kvs_client.metadata.accessed_at + accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -76,6 +78,7 @@ async def test_memory_metadata_updates(kvs_client: MemoryKeyValueStoreClient) -> await kvs_client.set_value(key='test', value='test-value') # Verify timestamps were updated - assert kvs_client.metadata.created_at == initial_created - assert kvs_client.metadata.modified_at > initial_modified - assert kvs_client.metadata.accessed_at > accessed_after_read + metadata = await kvs_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_read diff --git a/tests/unit/storage_clients/_memory/test_memory_rq_client.py b/tests/unit/storage_clients/_memory/test_memory_rq_client.py index 68a838d4cc..7877d8af79 100644 --- a/tests/unit/storage_clients/_memory/test_memory_rq_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_rq_client.py @@ -51,9 +51,10 @@ async def test_memory_specific_purge_behavior() -> None: async def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> None: """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps - initial_created = rq_client.metadata.created_at - initial_accessed = rq_client.metadata.accessed_at - initial_modified = rq_client.metadata.modified_at + metadata = await rq_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -62,11 +63,12 @@ async def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> N await rq_client.is_empty() # Verify timestamps 
(memory-specific behavior) - assert rq_client.metadata.created_at == initial_created - assert rq_client.metadata.accessed_at > initial_accessed - assert rq_client.metadata.modified_at == initial_modified + metadata = await rq_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified - accessed_after_read = rq_client.metadata.accessed_at + accessed_after_read = metadata.accessed_at # Wait a moment to ensure timestamps can change await asyncio.sleep(0.01) @@ -75,6 +77,7 @@ async def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> N await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) # Verify timestamps were updated - assert rq_client.metadata.created_at == initial_created - assert rq_client.metadata.modified_at > initial_modified - assert rq_client.metadata.accessed_at > accessed_after_read + metadata = await rq_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_read diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index 093bfdbbfc..b4f75bc6b4 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -66,7 +66,9 @@ async def test_open_creates_new_dataset( # Verify dataset properties assert dataset.id is not None assert dataset.name == 'new_dataset' - assert dataset.metadata.item_count == 0 + + metadata = await dataset.get_metadata() + assert metadata.item_count == 0 await dataset.drop() @@ -84,11 +86,13 @@ async def test_reopen_default( # Verify default properties assert dataset_1.id is not None - assert dataset_1.metadata.item_count == 0 + metadata_1 = await dataset_1.get_metadata() + assert metadata_1.item_count == 0 # Add an item await dataset_1.push_data({'key': 'value'}) - assert dataset_1.metadata.item_count == 1 + metadata_1 = await dataset_1.get_metadata() + assert metadata_1.item_count == 1 # Reopen the same dataset dataset_2 = await Dataset.open( @@ -99,7 +103,9 @@ async def test_reopen_default( # Verify both instances reference the same dataset assert dataset_2.id == dataset_1.id assert dataset_2.name == dataset_1.name - assert dataset_2.metadata.item_count == dataset_1.metadata.item_count == 1 + metadata_1 = await dataset_1.get_metadata() + metadata_2 = await dataset_2.get_metadata() + assert metadata_2.item_count == metadata_1.item_count == 1 # Verify they are the same object (cached) assert id(dataset_1) == id(dataset_2) @@ -159,7 +165,9 @@ async def test_open_existing_dataset( # Verify dataset properties assert dataset.id == reopened_dataset.id assert dataset.name == reopened_dataset.name - assert dataset.metadata.item_count == reopened_dataset.metadata.item_count + metadata = await dataset.get_metadata() + reopened_metadata = await reopened_dataset.get_metadata() + assert metadata.item_count == reopened_metadata.item_count # Verify they are the same object (from cache) assert id(dataset) == id(reopened_dataset) @@ -544,7 +552,8 @@ async def test_purge( data = await dataset.get_data() assert data.count == 3 assert data.total == 3 - assert dataset.metadata.item_count == 3 + metadata = await dataset.get_metadata() + assert metadata.item_count == 3 # Record the dataset ID dataset_id = dataset.id @@ -560,7 +569,8 @@ async def test_purge( data = await dataset.get_data() assert data.count == 0 assert data.total == 0 - assert 
dataset.metadata.item_count == 0 + metadata = await dataset.get_metadata() + assert metadata.item_count == 0 # Verify we can add new data after purging new_item = {'id': 4, 'name': 'New Item After Purge'} diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 1f3d1936df..8df759a27f 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -67,9 +67,10 @@ async def test_open_creates_new_rq( # Verify request queue properties assert rq.id is not None assert rq.name == 'new_request_queue' - assert rq.metadata.pending_request_count == 0 - assert rq.metadata.handled_request_count == 0 - assert rq.metadata.total_request_count == 0 + metadata = await rq.get_metadata() + assert metadata.pending_request_count == 0 + assert metadata.handled_request_count == 0 + assert metadata.total_request_count == 0 await rq.drop() @@ -155,8 +156,9 @@ async def test_add_request_string_url(rq: RequestQueue) -> None: assert result.was_already_handled is False # Verify the queue stats were updated - assert rq.metadata.total_request_count == 1 - assert rq.metadata.pending_request_count == 1 + metadata = await rq.get_metadata() + assert metadata.total_request_count == 1 + assert metadata.pending_request_count == 1 async def test_add_request_object(rq: RequestQueue) -> None: @@ -172,8 +174,9 @@ async def test_add_request_object(rq: RequestQueue) -> None: assert result.was_already_handled is False # Verify the queue stats were updated - assert rq.metadata.total_request_count == 1 - assert rq.metadata.pending_request_count == 1 + metadata = await rq.get_metadata() + assert metadata.total_request_count == 1 + assert metadata.pending_request_count == 1 async def test_add_duplicate_request(rq: RequestQueue) -> None: @@ -190,8 +193,9 @@ async def test_add_duplicate_request(rq: RequestQueue) -> None: assert second_result.unique_key == first_result.unique_key # Verify the queue stats weren't incremented twice - assert rq.metadata.total_request_count == 1 - assert rq.metadata.pending_request_count == 1 + metadata = await rq.get_metadata() + assert metadata.total_request_count == 1 + assert metadata.pending_request_count == 1 async def test_add_requests_batch(rq: RequestQueue) -> None: @@ -210,8 +214,9 @@ async def test_add_requests_batch(rq: RequestQueue) -> None: await asyncio.sleep(0.1) # Verify the queue stats - assert rq.metadata.total_request_count == 3 - assert rq.metadata.pending_request_count == 3 + metadata = await rq.get_metadata() + assert metadata.total_request_count == 3 + assert metadata.pending_request_count == 3 async def test_add_requests_batch_with_forefront(rq: RequestQueue) -> None: @@ -352,9 +357,10 @@ async def test_fetch_next_request_and_mark_handled(rq: RequestQueue) -> None: await rq.mark_request_as_handled(request2) # Verify counts - assert rq.metadata.total_request_count == 2 - assert rq.metadata.handled_request_count == 2 - assert rq.metadata.pending_request_count == 0 + metadata = await rq.get_metadata() + assert metadata.total_request_count == 2 + assert metadata.handled_request_count == 2 + assert metadata.pending_request_count == 0 # Verify queue is empty empty_request = await rq.fetch_next_request() @@ -518,8 +524,9 @@ async def test_drop( # Verify the queue is empty assert await new_rq.is_empty() is True - assert new_rq.metadata.total_request_count == 0 - assert new_rq.metadata.pending_request_count == 0 + metadata = await new_rq.get_metadata() + assert metadata.total_request_count == 0 + assert 
metadata.pending_request_count == 0 await new_rq.drop() @@ -550,13 +557,15 @@ async def test_reopen_default( ) # Verify we're starting fresh - assert rq1.metadata.pending_request_count == 0 + metadata1 = await rq1.get_metadata() + assert metadata1.pending_request_count == 0 # Add a request await rq1.add_request('https://example.com/') # Verify the request was added - assert rq1.metadata.pending_request_count == 1 + metadata1 = await rq1.get_metadata() + assert metadata1.pending_request_count == 1 # Open the default request queue again rq2 = await RequestQueue.open( @@ -567,9 +576,11 @@ async def test_reopen_default( # Verify they are the same queue assert rq1.id == rq2.id assert rq1.name == rq2.name - assert rq1.metadata.total_request_count == rq2.metadata.total_request_count - assert rq1.metadata.pending_request_count == rq2.metadata.pending_request_count - assert rq1.metadata.handled_request_count == rq2.metadata.handled_request_count + metadata1 = await rq1.get_metadata() + metadata2 = await rq2.get_metadata() + assert metadata1.total_request_count == metadata2.total_request_count + assert metadata1.pending_request_count == metadata2.pending_request_count + assert metadata1.handled_request_count == metadata2.handled_request_count # Verify the request is accessible request = await rq2.fetch_next_request() @@ -602,9 +613,10 @@ async def test_purge( ) # Verify requests were added - assert rq.metadata.total_request_count == 3 - assert rq.metadata.pending_request_count == 3 - assert rq.metadata.handled_request_count == 0 + metadata = await rq.get_metadata() + assert metadata.total_request_count == 3 + assert metadata.pending_request_count == 3 + assert metadata.handled_request_count == 0 # Record the queue ID queue_id = rq.id @@ -617,9 +629,10 @@ async def test_purge( assert rq.name == 'purge_test_queue' # Same name preserved # Queue should be empty now - assert rq.metadata.total_request_count == 3 - assert rq.metadata.pending_request_count == 0 - assert rq.metadata.handled_request_count == 0 + metadata = await rq.get_metadata() + assert metadata.total_request_count == 3 + assert metadata.pending_request_count == 0 + assert metadata.handled_request_count == 0 assert await rq.is_empty() is True # Verify we can add new requests after purging From b628fbb3e0adb1a593c0ebb40c454942b239c292 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 1 Jul 2025 13:53:57 +0200 Subject: [PATCH 42/43] better typing in storage instance manager --- .../storages/_storage_instance_manager.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 7e51a7343b..9bd52f1219 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -1,13 +1,24 @@ from __future__ import annotations -from typing import Any, Callable, TypeVar, cast +from collections.abc import Awaitable +from typing import TYPE_CHECKING, Callable, TypeVar, Union, cast from crawlee._utils.docs import docs_group +from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient from ._base import Storage +if TYPE_CHECKING: + from crawlee.configuration import Configuration + T = TypeVar('T', bound='Storage') +StorageClientType = Union[DatasetClient, KeyValueStoreClient, RequestQueueClient] +"""Type alias for the storage client types.""" + +ClientOpener = Callable[..., Awaitable[StorageClientType]] +"""Type alias for the client 
opener function.""" + @docs_group('Classes') class StorageInstanceManager: @@ -33,8 +44,8 @@ async def open_storage_instance( *, id: str | None, name: str | None, - configuration: Any, - client_opener: Callable[..., Any], + configuration: Configuration, + client_opener: ClientOpener, ) -> T: """Open a storage instance with caching support. @@ -76,6 +87,7 @@ async def open_storage_instance( # Create new instance client = await client_opener(id=id, name=name, configuration=configuration) metadata = await client.get_metadata() + instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg] instance_name = getattr(instance, 'name', None) From 9dfac4b8afb8027979d85947f0db303f384b7158 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 1 Jul 2025 14:24:21 +0200 Subject: [PATCH 43/43] update upgrading guide --- .../registering_storage_client_example.py | 2 +- docs/upgrading/upgrading_to_v1.md | 29 ++++++++++++++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/docs/guides/code_examples/storage_clients/registering_storage_client_example.py b/docs/guides/code_examples/storage_clients/registering_storage_client_example.py index f91cb3574d..995278e7f6 100644 --- a/docs/guides/code_examples/storage_clients/registering_storage_client_example.py +++ b/docs/guides/code_examples/storage_clients/registering_storage_client_example.py @@ -1,6 +1,6 @@ import asyncio -from crawlee._service_locator import service_locator +from crawlee import service_locator from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import MemoryStorageClient from crawlee.storages import Dataset diff --git a/docs/upgrading/upgrading_to_v1.md b/docs/upgrading/upgrading_to_v1.md index 894104d85e..1d7219dbb4 100644 --- a/docs/upgrading/upgrading_to_v1.md +++ b/docs/upgrading/upgrading_to_v1.md @@ -69,14 +69,24 @@ The way you register storage clients remains the same: from crawlee import service_locator from crawlee.crawlers import ParselCrawler from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset +# Create custom storage client, MemoryStorageClient for example. storage_client = MemoryStorageClient() -# Either via the service locator: +# Register it globally via the service locator. service_locator.set_storage_client(storage_client) -# Or provide it directly to the crawler: +# Or pass it directly to the crawler, it will be registered globally +# to the service locator under the hood. crawler = ParselCrawler(storage_client=storage_client) + +# Or just provide it when opening a storage (e.g. dataset), it will be used +# for this storage only, not globally. +dataset = await Dataset.open( + name='my_dataset', + storage_client=storage_client, +) ``` ### Breaking changes @@ -105,31 +115,34 @@ destination you choose. ## Dataset -- There are two new methods: +- There are a few new methods: + - `get_metadata` - `purge` - `list_items` - The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. -- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. +- The `get_info` and `storage_object` properties have been replaced by the new `get_metadata` method. - The `set_metadata` method has been removed. - The `write_to_json` and `write_to_csv` methods have been removed - use `export_to` instead. 
## Key-value store -- There are three new methods: +- There are a few new methods: + - `get_metadata` - `purge` - `delete_value` - `list_keys` - The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. -- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. +- The `get_info` and `storage_object` properties have been replaced by the new `get_metadata` method. - The `set_metadata` method has been removed. ## Request queue -- There are two new methods: +- There are a few new methods: + - `get_metadata` - `purge` - `add_requests` (renamed from `add_requests_batched`) - The `from_storage_object` method has been removed - use the `open` method with `name` or `id` instead. -- The `get_info` and `storage_object` properties have been replaced by the new `metadata` property. +- The `get_info` and `storage_object` properties have been replaced by the new `get_metadata` method. - The `set_metadata` method has been removed. - `resource_directory` from `RequestQueueMetadata` removed – use `path_to_...` property. - `RequestQueueHead` model replaced with `RequestQueueHeadWithLocks`.
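
The metadata pattern is the same for all three storages. Below is a minimal migration sketch, not part of the patch itself; the storage names are illustrative, and it only uses calls shown elsewhere in this changeset (`Dataset.open`, `push_data`, `RequestQueue.open`, `add_request`, `get_metadata`):

```python
import asyncio

from crawlee.storages import Dataset, RequestQueue


async def main() -> None:
    # Open (or create) a named dataset and a named request queue.
    dataset = await Dataset.open(name='my_dataset')
    rq = await RequestQueue.open(name='my_request_queue')

    await dataset.push_data({'example': 'item'})
    await rq.add_request('https://example.com/')

    # In v1, metadata is returned by the async `get_metadata` method instead of
    # the removed `get_info` / `storage_object` properties.
    dataset_metadata = await dataset.get_metadata()
    rq_metadata = await rq.get_metadata()

    print(dataset_metadata.item_count)        # 1
    print(rq_metadata.total_request_count)    # 1
    print(rq_metadata.pending_request_count)  # 1


asyncio.run(main())
```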