From 49d8db13ae1b71f66d7bb46071727f57538cc49f Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sat, 13 Sep 2025 03:34:42 +0000 Subject: [PATCH 01/12] core implementation --- pyproject.toml | 3 +- src/crawlee/storage_clients/__init__.py | 2 + .../storage_clients/_redis/__init__.py | 6 + .../storage_clients/_redis/_client_mixin.py | 90 +++ .../storage_clients/_redis/_dataset_client.py | 348 ++++++++++++ .../_redis/_key_value_store_client.py | 302 ++++++++++ .../_redis/_request_queue_client.py | 515 ++++++++++++++++++ .../storage_clients/_redis/_storage_client.py | 141 +++++ src/crawlee/storage_clients/_redis/_utils.py | 22 + .../lua_scripts/atomic_add_requests.lua | 36 ++ .../lua_scripts/atomic_fetch_request.lua | 49 ++ .../lua_scripts/reclaim_stale_requests.lua | 38 ++ src/crawlee/storage_clients/_redis/py.typed | 0 tests/unit/storages/test_dataset.py | 7 +- tests/unit/storages/test_key_value_store.py | 7 +- tests/unit/storages/test_request_queue.py | 7 +- uv.lock | 109 +++- 17 files changed, 1673 insertions(+), 9 deletions(-) create mode 100644 src/crawlee/storage_clients/_redis/__init__.py create mode 100644 src/crawlee/storage_clients/_redis/_client_mixin.py create mode 100644 src/crawlee/storage_clients/_redis/_dataset_client.py create mode 100644 src/crawlee/storage_clients/_redis/_key_value_store_client.py create mode 100644 src/crawlee/storage_clients/_redis/_request_queue_client.py create mode 100644 src/crawlee/storage_clients/_redis/_storage_client.py create mode 100644 src/crawlee/storage_clients/_redis/_utils.py create mode 100644 src/crawlee/storage_clients/_redis/lua_scripts/atomic_add_requests.lua create mode 100644 src/crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua create mode 100644 src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua create mode 100644 src/crawlee/storage_clients/_redis/py.typed diff --git a/pyproject.toml b/pyproject.toml index fe0abe67ae..8e00e1ad96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ dependencies = [ ] [project.optional-dependencies] -all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel]"] +all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,redis]"] adaptive-crawler = [ "jaro-winkler>=2.0.3", "playwright>=1.27.0", @@ -71,6 +71,7 @@ otel = [ "opentelemetry-semantic-conventions>=0.54", "wrapt>=1.17.0", ] +redis = ["redis[hiredis] >= 6.4.0"] [project.scripts] crawlee = "crawlee._cli:cli" diff --git a/src/crawlee/storage_clients/__init__.py b/src/crawlee/storage_clients/__init__.py index ce8c713ca9..38fb8f60f9 100644 --- a/src/crawlee/storage_clients/__init__.py +++ b/src/crawlee/storage_clients/__init__.py @@ -1,9 +1,11 @@ from ._base import StorageClient from ._file_system import FileSystemStorageClient from ._memory import MemoryStorageClient +from ._redis import RedisStorageClient __all__ = [ 'FileSystemStorageClient', 'MemoryStorageClient', + 'RedisStorageClient', 'StorageClient', ] diff --git a/src/crawlee/storage_clients/_redis/__init__.py b/src/crawlee/storage_clients/_redis/__init__.py new file mode 100644 index 0000000000..889023c137 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/__init__.py @@ -0,0 +1,6 @@ +from ._dataset_client import RedisDatasetClient +from ._key_value_store_client import RedisKeyValueStoreClient +from ._request_queue_client import RedisRequestQueueClient +from ._storage_client import RedisStorageClient + +__all__ = ['RedisDatasetClient', 
'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient'] diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py new file mode 100644 index 0000000000..021aa4da34 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/_client_mixin.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import asyncio +from abc import ABC +from contextlib import asynccontextmanager +from logging import getLogger +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar, TypedDict + +from ._utils import await_redis_response, read_lua_script + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from redis.asyncio import Redis + from redis.asyncio.client import Pipeline + from redis.commands.core import AsyncScript + from typing_extensions import NotRequired + + +logger = getLogger(__name__) + + +class MetadataUpdateParams(TypedDict, total=False): + """Parameters for updating metadata.""" + + update_accessed_at: NotRequired[bool] + update_modified_at: NotRequired[bool] + force: NotRequired[bool] + + +class RedisClientMixin(ABC): + """Mixin class for Redis storage clients. + + This mixin provides common Redis operations and basic methods shared by the Redis storage clients. + """ + + _DEFAULT_NAME = 'default' + + _MAIN_KEY: ClassVar[str] + + def __init__(self, *, storage_name: str, redis: Redis) -> None: + self._storage_name = storage_name + self._redis = redis + + @classmethod + async def _get_metadata_by_name(cls, name: str, redis: Redis) -> dict | None: + response = await await_redis_response(redis.json().get(f'{cls._MAIN_KEY}:{name}:metadata')) + data = response[0] if response is not None and isinstance(response, list) else response + if data is not None and not isinstance(data, dict): + raise TypeError('The metadata was received in an incorrect format.') + return data + + @classmethod + async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None: + return await await_redis_response(redis.get(f'{cls._MAIN_KEY}:id_to_name:{id}')) + + @asynccontextmanager + async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pipeline]: + """Create a new Redis pipeline for this storage.""" + async with self._redis.pipeline() as pipe: + try: + pipe.multi() # type: ignore[no-untyped-call] + yield pipe + finally: + if with_execute: + await pipe.execute() + + async def _create_storage(self, pipeline: Pipeline) -> None: + _pipeline = pipeline # To avoid unused variable mypy error + + async def _create_script(self, script_name: str) -> AsyncScript: + """Load a Lua script from a file and return a Script object.""" + script_path = Path(__file__).parent / 'lua_scripts' / script_name + script_content = await asyncio.to_thread(read_lua_script, script_path) + + return self._redis.register_script(script_content) + + async def _create_metadata_and_storage(self, metadata: dict) -> None: + metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' + index_id_to_name = f'{self._MAIN_KEY}:id_to_name:{metadata["id"]}' + metadata['created_at'] = metadata['created_at'].isoformat() + metadata['accessed_at'] = metadata['accessed_at'].isoformat() + metadata['modified_at'] = metadata['modified_at'].isoformat() + name = metadata['name'] if metadata['name'] is not None else self._DEFAULT_NAME + # Use a transaction to ensure atomicity + async with self._get_pipeline() as pipe: + await await_redis_response(pipe.json().set(metadata_key, '$', metadata, nx=True)) + await await_redis_response(pipe.set(index_id_to_name,
name, nx=True)) + await self._create_storage(pipe) diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py new file mode 100644 index 0000000000..8b42fe257a --- /dev/null +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -0,0 +1,348 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Any, cast + +from typing_extensions import override + +from crawlee._utils.crypto import crypto_random_object_id +from crawlee.storage_clients._base import DatasetClient +from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata + +from ._client_mixin import RedisClientMixin +from ._utils import await_redis_response + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from redis.asyncio import Redis + from redis.asyncio.client import Pipeline + +logger = getLogger(__name__) + + +class RedisDatasetClient(DatasetClient, RedisClientMixin): + """Redis implementation of the dataset client. + + This client stores dataset items in a Redis database using the RedisJSON module. Data is persisted in + the Redis instance, so it survives process restarts and can be shared between processes that connect to + the same database, which makes this implementation suitable for distributed crawler operations. + + Items are kept in a single JSON array per dataset. The client supports all dataset operations including + sorting, filtering, and pagination; range selection is pushed down to Redis via JSONPath where possible. + """ + + _DEFAULT_NAME = 'default' + + _MAIN_KEY = 'dataset' + + def __init__( + self, + dataset_name: str, + redis: Redis, + ) -> None: + """Initialize a new instance. + + Preferably use the `RedisDatasetClient.open` class method to create a new instance. + """ + super().__init__(storage_name=dataset_name, redis=redis) + + @override + async def get_metadata(self) -> DatasetMetadata: + metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) + if metadata_dict is None: + raise ValueError(f'Dataset with name "{self._storage_name}" does not exist.') + return DatasetMetadata.model_validate(metadata_dict) + + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + redis: Redis, + ) -> RedisDatasetClient: + """Open or create a new Redis dataset client. + + If a dataset with the given ID or name already exists in Redis, it is reopened and its `accessed_at` + timestamp is updated; otherwise a new dataset with fresh metadata is created. + + Args: + id: The ID of the dataset. If not provided, a random ID will be generated. + name: The name of the dataset. If not provided, the dataset will be unnamed. + redis: Redis client instance. + + Returns: + An instance for the opened or created storage client.
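+
+        A minimal usage sketch (assumes a reachable Redis instance with the RedisJSON module; the URL
+        is a placeholder):
+
+        ```python
+        from redis.asyncio import Redis
+
+        redis = Redis.from_url('redis://localhost:6379/0')
+        dataset = await RedisDatasetClient.open(id=None, name='products', redis=redis)
+        await dataset.push_data({'url': 'https://example.com'})
+        ```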
+ """ + if id: + dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) + if dataset_name is None: + raise ValueError(f'Dataset with ID "{id}" does not exist.') + else: + search_name = name or cls._DEFAULT_NAME + metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) + dataset_name = search_name if metadata_data is not None else None + if dataset_name: + client = cls(dataset_name=dataset_name, redis=redis) + async with client._get_pipeline() as pipe: + await client._update_metadata(pipe, update_accessed_at=True) + else: + now = datetime.now(timezone.utc) + metadata = DatasetMetadata( + id=crypto_random_object_id(), + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + item_count=0, + ) + dataset_name = name or cls._DEFAULT_NAME + client = cls(dataset_name=dataset_name, redis=redis) + await client._create_metadata_and_storage(metadata.model_dump()) + return client + + @override + async def _create_storage(self, pipeline: Pipeline) -> None: + items_key = f'{self._MAIN_KEY}:{self._storage_name}:items' + await await_redis_response(pipeline.json().set(items_key, '$', [])) + + @override + async def drop(self) -> None: + storage_id = (await self.get_metadata()).id + async with self._get_pipeline() as pipe: + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:items') + await pipe.delete(f'{self._MAIN_KEY}:id_to_name:{storage_id}') + + @override + async def purge(self) -> None: + async with self._get_pipeline() as pipe: + await self._create_storage(pipe) + + await self._update_metadata( + pipe, + update_accessed_at=True, + update_modified_at=True, + new_item_count=0, + ) + + @override + async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: + if isinstance(data, dict): + data = [data] + + async with self._get_pipeline() as pipe: + # Incorrect signature for args type in redis-py + pipe.json().arrappend(f'{self._MAIN_KEY}:{self._storage_name}:items', '$', *data) # type: ignore[arg-type] + delta_item_count = len(data) + await self._update_metadata( + pipe, update_accessed_at=True, update_modified_at=True, delta_item_count=delta_item_count + ) + + @override + async def get_data( + self, + *, + offset: int = 0, + limit: int | None = 999_999_999_999, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: list[str] | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + flatten: list[str] | None = None, + view: str | None = None, + ) -> DatasetItemsListPage: + # Check for unsupported arguments and log a warning if found + # When implementing, explore the capabilities of jsonpath to determine what can be done at the Redis level. + unsupported_args: dict[str, Any] = { + 'clean': clean, + 'fields': fields, + 'omit': omit, + 'unwind': unwind, + 'skip_hidden': skip_hidden, + 'flatten': flatten, + 'view': view, + } + unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} + + if unsupported: + logger.warning( + f'The arguments {list(unsupported.keys())} of get_data are not supported ' + f'by the {self.__class__.__name__} client.' 
+ ) + + metadata = await self.get_metadata() + + total = metadata.item_count + items_key = f'{self._MAIN_KEY}:{self._storage_name}:items' + json_path = '$' + + # Apply sorting and pagination + if desc: + if offset and limit is not None: + json_path += f'[-{offset + limit}:-{offset}]' + elif limit is not None: + json_path += f'[-{limit}:]' + elif offset: + json_path += f'[:-{offset}]' + else: # noqa: PLR5501 # not a mistake, just to please the linter + if offset and limit is not None: + json_path += f'[{offset}:{offset + limit}]' + elif limit is not None: + json_path += f'[:{limit}]' + elif offset: + json_path += f'[{offset}:]' + + if json_path == '$': + json_path = '$[*]' + + data = await await_redis_response(self._redis.json().get(items_key, json_path)) + + if data is None: + data = [] + + if skip_empty: + data = [item for item in data if item] + + if desc: + data = list(reversed(data)) + + async with self._get_pipeline() as pipe: + await self._update_metadata(pipe, update_accessed_at=True) + + return DatasetItemsListPage( + count=len(data), + offset=offset, + limit=limit or (total - offset), + total=total, + desc=desc, + items=data, + ) + + @override + async def iterate_items( + self, + *, + offset: int = 0, + limit: int | None = None, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: list[str] | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + ) -> AsyncIterator[dict[str, Any]]: + """Iterate over dataset items one by one. + + This method yields items individually instead of loading all items at once, + which is more memory efficient for large datasets. + """ + # Log warnings for unsupported arguments + unsupported_args: dict[str, Any] = { + 'clean': clean, + 'fields': fields, + 'omit': omit, + 'unwind': unwind, + 'skip_hidden': skip_hidden, + } + unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)} + + if unsupported: + logger.warning( + f'The arguments {list(unsupported.keys())} of iterate_items are not supported ' + f'by the {self.__class__.__name__} client.' 
+ ) + + metadata = await self.get_metadata() + total_items = metadata.item_count + items_key = f'{self._MAIN_KEY}:{self._storage_name}:items' + + # Calculate actual range based on parameters + start_idx = offset + end_idx = min(total_items, offset + limit) if limit is not None else total_items + + # Update accessed_at timestamp + async with self._get_pipeline() as pipe: + await self._update_metadata(pipe, update_accessed_at=True) + + # Process items in batches for better network efficiency + batch_size = 100 + + for batch_start in range(start_idx, end_idx, batch_size): + batch_end = min(batch_start + batch_size, end_idx) + + # Build JsonPath for batch slice + if desc: + # For descending order, we need to reverse the slice calculation + desc_batch_start = total_items - batch_end + desc_batch_end = total_items - batch_start + json_path = f'$[{desc_batch_start}:{desc_batch_end}]' + else: + json_path = f'$[{batch_start}:{batch_end}]' + + # Get batch of items + batch_items = await await_redis_response(self._redis.json().get(items_key, json_path)) + + # Handle case where batch_items might be None or not a list + if batch_items is None: + continue + + # Reverse batch if desc order (since we got items in normal order but need desc) + if desc: + batch_items = list(reversed(batch_items)) + + # Yield items from batch + for item in batch_items: + # Apply skip_empty filter + if skip_empty and not item: + continue + + yield cast('dict[str, Any]', item) + + async with self._get_pipeline() as pipe: + await self._update_metadata(pipe, update_accessed_at=True) + + async def _update_metadata( + self, + pipeline: Pipeline, + *, + new_item_count: int | None = None, + delta_item_count: int | None = None, + update_accessed_at: bool = False, + update_modified_at: bool = False, + ) -> None: + """Update the dataset metadata with current information. + + Args: + pipeline: The Redis pipeline to use for the update. + new_item_count: If provided, update the item count to this value. + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. + delta_item_count: If provided, increment the item count by this value. 
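+                Note: if both `new_item_count` and `delta_item_count` are provided, `new_item_count`
+                takes precedence and the delta is ignored.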
+ """ + metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' + now = datetime.now(timezone.utc) + + if update_accessed_at: + await await_redis_response( + pipeline.json().set(metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True) + ) + if update_modified_at: + await await_redis_response( + pipeline.json().set(metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True) + ) + if new_item_count is not None: + await await_redis_response( + pipeline.json().set(metadata_key, '$.item_count', new_item_count, nx=False, xx=True) + ) + elif delta_item_count is not None: + await await_redis_response(pipeline.json().numincrby(metadata_key, '$.item_count', delta_item_count)) diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py new file mode 100644 index 0000000000..3bcdd59df3 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -0,0 +1,302 @@ +from __future__ import annotations + +import json +from datetime import datetime, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from typing_extensions import override + +from crawlee._utils.crypto import crypto_random_object_id +from crawlee._utils.file import infer_mime_type +from crawlee.storage_clients._base import KeyValueStoreClient +from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata + +from ._client_mixin import RedisClientMixin +from ._utils import await_redis_response + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from redis.asyncio import Redis + from redis.asyncio.client import Pipeline + +logger = getLogger(__name__) + + +class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin): + """Memory implementation of the key-value store client. + + This client stores data in memory as Python dictionaries. No data is persisted between + process runs, meaning all stored data is lost when the program terminates. This implementation + is primarily useful for testing, development, and short-lived crawler operations where + persistence is not required. + + The memory implementation provides fast access to data but is limited by available memory and + does not support data sharing across different processes. + """ + + _DEFAULT_NAME = 'default' + + _MAIN_KEY = 'key-value-store' + + def __init__( + self, + dataset_name: str, + redis: Redis, + ) -> None: + """Initialize a new instance. + + Preferably use the `MemoryDatasetClient.open` class method to create a new instance. + """ + super().__init__(storage_name=dataset_name, redis=redis) + + @override + async def get_metadata(self) -> KeyValueStoreMetadata: + metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) + if metadata_dict is None: + raise ValueError(f'Dataset with name "{self._storage_name}" does not exist.') + return KeyValueStoreMetadata.model_validate(metadata_dict) + + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + redis: Redis, + ) -> RedisKeyValueStoreClient: + """Open or create a new Redis dataset client. + + This method creates a new Redis dataset instance. Unlike persistent storage implementations, Redis + datasets don't check for existing datasets with the same name or ID since all data exists only in memory + and is lost when the process terminates. + + Args: + id: The ID of the dataset. If not provided, a random ID will be generated. 
name: The name of the key-value store. If not provided, the store will be unnamed. + redis: Redis client instance. + + Returns: + An instance for the opened or created storage client. + """ + if id: + kvs_name = await cls._get_metadata_name_by_id(id=id, redis=redis) + if kvs_name is None: + raise ValueError(f'Key-value store with ID "{id}" does not exist.') + else: + search_name = name or cls._DEFAULT_NAME + metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) + kvs_name = search_name if metadata_data is not None else None + if kvs_name: + client = cls(kvs_name=kvs_name, redis=redis) + async with client._get_pipeline() as pipe: + await client._update_metadata(pipe, update_accessed_at=True) + else: + now = datetime.now(timezone.utc) + metadata = KeyValueStoreMetadata( + id=crypto_random_object_id(), + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + ) + kvs_name = name or cls._DEFAULT_NAME + client = cls(kvs_name=kvs_name, redis=redis) + await client._create_metadata_and_storage(metadata.model_dump()) + return client + + @override + async def drop(self) -> None: + storage_id = (await self.get_metadata()).id + async with self._get_pipeline() as pipe: + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:items') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items') + await pipe.delete(f'{self._MAIN_KEY}:id_to_name:{storage_id}') + + @override + async def purge(self) -> None: + async with self._get_pipeline() as pipe: + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:items') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items') + await self._update_metadata( + pipe, + update_accessed_at=True, + update_modified_at=True, + ) + + @override + async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: + # Special handling for None values + if value is None: + content_type = 'application/x-none' # Special content type to identify None values + value_bytes = b'' + else: + content_type = content_type or infer_mime_type(value) + + # Serialize the value to bytes. + if 'application/json' in content_type: + value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8') + elif isinstance(value, str): + value_bytes = value.encode('utf-8') + elif isinstance(value, (bytes, bytearray)): + value_bytes = value + else: + # Fallback: attempt to convert to string and encode.
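+            # Note: such values do not round-trip as their original Python type; get_value returns
+            # them as text or bytes according to the stored content type.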
+ value_bytes = str(value).encode('utf-8') + + size = len(value_bytes) + item_metadata = KeyValueStoreRecordMetadata( + key=key, + content_type=content_type, + size=size, + ) + + async with self._get_pipeline() as pipe: + # redis-py typing issue + await await_redis_response(pipe.hset(f'{self._MAIN_KEY}:{self._storage_name}:items', key, value_bytes)) # type: ignore[arg-type] + + await await_redis_response( + pipe.hset( + f'{self._MAIN_KEY}:{self._storage_name}:metadata_items', + key, + item_metadata.model_dump_json(), + ) + ) + await self._update_metadata(pipe, update_accessed_at=True, update_modified_at=True) + + @override + async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: + serialized_metadata_item = await await_redis_response( + self._redis.hget(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items', key) + ) + + if not isinstance(serialized_metadata_item, (str, bytes, bytearray)): + logger.warning(f'Metadata for key "{key}" is missing or invalid.') + return None + + metadata_item = KeyValueStoreRecordMetadata.model_validate_json(serialized_metadata_item) + + # Handle None values + if metadata_item.content_type == 'application/x-none': + return KeyValueStoreRecord(value=None, **metadata_item.model_dump()) + + # Query the record by key + # redis-py typing issue + value_bytes: bytes | None = await await_redis_response( + self._redis.hget(f'{self._MAIN_KEY}:{self._storage_name}:items', key) # type: ignore[arg-type] + ) + + if value_bytes is None: + logger.warning(f'Value for key "{key}" is missing.') + return None + + # Handle JSON values + if 'application/json' in metadata_item.content_type: + try: + value = json.loads(value_bytes.decode('utf-8')) + except (json.JSONDecodeError, UnicodeDecodeError): + logger.warning(f'Failed to decode JSON value for key "{key}"') + return None + # Handle text values + elif metadata_item.content_type.startswith('text/'): + try: + value = value_bytes.decode('utf-8') + except UnicodeDecodeError: + logger.warning(f'Failed to decode text value for key "{key}"') + return None + # Handle binary values + else: + value = value_bytes + + return KeyValueStoreRecord(value=value, **metadata_item.model_dump()) + + @override + async def delete_value(self, *, key: str) -> None: + async with self._get_pipeline() as pipe: + await await_redis_response(pipe.hdel(f'{self._MAIN_KEY}:{self._storage_name}:items', key)) + await await_redis_response(pipe.hdel(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items', key)) + await self._update_metadata(pipe, update_accessed_at=True, update_modified_at=True) + + @override + async def iterate_keys( + self, + *, + exclusive_start_key: str | None = None, + limit: int | None = None, + ) -> AsyncIterator[KeyValueStoreRecordMetadata]: + items_data = await await_redis_response( + self._redis.hgetall(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items') + ) + + if not items_data: + return # No items to iterate over + + if not isinstance(items_data, dict): + raise TypeError('The items data was received in an incorrect format.') + + # Get all keys, sorted alphabetically + keys = sorted(items_data.keys()) + + # Apply exclusive_start_key filter if provided + if exclusive_start_key is not None: + bytes_exclusive_start_key = exclusive_start_key.encode() + keys = [k for k in keys if k > bytes_exclusive_start_key] + + # Apply limit if provided + if limit is not None: + keys = keys[:limit] + + # Yield metadata for each key + for key in keys: + record = items_data[key] + yield 
KeyValueStoreRecordMetadata.model_validate_json(record) + + async with self._get_pipeline() as pipe: + await self._update_metadata( + pipe, + update_accessed_at=True, + ) + + @override + async def get_public_url(self, *, key: str) -> str: + raise NotImplementedError('Public URLs are not supported for Redis key-value stores.') + + @override + async def record_exists(self, *, key: str) -> bool: + async with self._get_pipeline(with_execute=False) as pipe: + await await_redis_response(pipe.hexists(f'{self._MAIN_KEY}:{self._storage_name}:items', key)) + await self._update_metadata( + pipe, + update_accessed_at=True, + ) + results = await pipe.execute() + + return bool(results[0]) + + async def _update_metadata( + self, + pipeline: Pipeline, + *, + update_accessed_at: bool = False, + update_modified_at: bool = False, + ) -> None: + """Update the key-value store metadata with current information. + + Args: + pipeline: The Redis pipeline to use for the update. + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. + """ + metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' + now = datetime.now(timezone.utc) + + if update_accessed_at: + await await_redis_response( + pipeline.json().set(metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True) + ) + if update_modified_at: + await await_redis_response( + pipeline.json().set(metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True) + ) diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py new file mode 100644 index 0000000000..ee1db38f16 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py @@ -0,0 +1,515 @@ +from __future__ import annotations + +import json +from collections import deque +from contextlib import suppress +from datetime import datetime, timezone +from logging import getLogger +from typing import TYPE_CHECKING + +from redis.exceptions import ResponseError +from typing_extensions import override + +from crawlee import Request +from crawlee._utils.crypto import crypto_random_object_id +from crawlee.storage_clients._base import RequestQueueClient +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata + +from ._client_mixin import RedisClientMixin +from ._utils import await_redis_response + +if TYPE_CHECKING: + from collections.abc import Sequence + + from redis.asyncio import Redis + from redis.asyncio.client import Pipeline + from redis.commands.core import AsyncScript + +logger = getLogger(__name__) + + +class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin): + """Redis implementation of the request queue client. + + This client persists requests in a Redis database: a list holds the pending queue, hashes hold the + request payloads and the in-progress leases, and Bloom filters deduplicate added and handled requests. + Batched fetch, add, and reclaim operations are performed atomically via Lua scripts, so a queue can be + shared safely between multiple clients connected to the same database. + """ + + _MAX_BATCH_FETCH_SIZE = 10 + + _BLOCK_REQUEST_TIME = 300_000  # milliseconds + + _DEFAULT_NAME = 'default' + + _MAIN_KEY = 'request_queue' + + def __init__( + self, + queue_name: str, + redis: Redis, + ) -> None: + """Initialize a new instance.
+ + Preferably use the `RedisRequestQueueClient.open` class method to create a new instance. + """ + super().__init__(storage_name=queue_name, redis=redis) + + self._pending_fetch_cache: deque[Request] = deque() + """Local FIFO cache of requests prefetched from the queue.""" + + self.client_key = crypto_random_object_id(length=32)[:32] + """Unique identifier for this client instance.""" + + self._fetch_script: AsyncScript | None = None + + self._reclaim_stale_script: AsyncScript | None = None + + self._add_requests_script: AsyncScript | None = None + + self._scripts_loaded = False + + async def _ensure_scripts_loaded(self) -> None: + """Ensure Lua scripts are loaded in Redis.""" + if not self._scripts_loaded: + self._fetch_script = await self._create_script('atomic_fetch_request.lua') + self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua') + self._add_requests_script = await self._create_script('atomic_add_requests.lua') + + self._scripts_loaded = True + + @override + async def get_metadata(self) -> RequestQueueMetadata: + metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) + if metadata_dict is None: + raise ValueError(f'Request queue with name "{self._storage_name}" does not exist.') + return RequestQueueMetadata.model_validate(metadata_dict) + + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + redis: Redis, + ) -> RedisRequestQueueClient: + """Open or create a new Redis request queue client. + + If a request queue with the given ID or name already exists in Redis, it is reopened and its + `accessed_at` timestamp is updated; otherwise a new queue with fresh metadata is created. + + Args: + id: The ID of the request queue. If not provided, a random ID will be generated. + name: The name of the request queue. If not provided, the queue will be unnamed. + redis: Redis client instance. + + Returns: + An instance for the opened or created storage client.
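+
+        A minimal usage sketch (assumes a reachable Redis instance with the RedisJSON and RedisBloom
+        modules; the URL is a placeholder):
+
+        ```python
+        from redis.asyncio import Redis
+
+        from crawlee import Request
+
+        redis = Redis.from_url('redis://localhost:6379/0')
+        queue = await RedisRequestQueueClient.open(id=None, name='crawl-queue', redis=redis)
+        await queue.add_batch_of_requests([Request.from_url('https://example.com')])
+        ```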
+ """ + # Otherwise create a new queue + if id: + dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) + if dataset_name is None: + raise ValueError(f'Dataset with ID "{id}" does not exist.') + else: + search_name = name or cls._DEFAULT_NAME + metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) + dataset_name = search_name if metadata_data is not None else None + if dataset_name: + client = cls(dataset_name=dataset_name, redis=redis) + async with client._get_pipeline() as pipe: + await client._update_metadata(pipe, update_accessed_at=True) + else: + now = datetime.now(timezone.utc) + metadata = RequestQueueMetadata( + id=crypto_random_object_id(), + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + had_multiple_clients=False, + handled_request_count=0, + pending_request_count=0, + total_request_count=0, + ) + dataset_name = name or cls._DEFAULT_NAME + client = cls(dataset_name=dataset_name, redis=redis) + with suppress(ResponseError): + await client._create_metadata_and_storage(metadata.model_dump()) + + await client._ensure_scripts_loaded() + return client + + @override + async def _create_storage(self, pipeline: Pipeline) -> None: + added_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' + handled_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' + await await_redis_response(pipeline.bf().create(added_bloom_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] + await await_redis_response(pipeline.bf().create(handled_bloom_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] + + @override + async def drop(self) -> None: + storage_id = (await self.get_metadata()).id + async with self._get_pipeline() as pipe: + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:queue') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:data') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:in_progress') + await pipe.delete(f'{self._MAIN_KEY}:id_to_name:{storage_id}') + + @override + async def purge(self) -> None: + async with self._get_pipeline() as pipe: + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:queue') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:data') + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:in_progress') + + await self._create_storage(pipe) + + await self._update_metadata( + pipe, + update_accessed_at=True, + update_modified_at=True, + new_pending_request_count=0, + ) + + @override + async def add_batch_of_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> AddRequestsResponse: + # Mypy workaround + if self._add_requests_script is None: + raise RuntimeError('Scripts not loaded. 
Call _ensure_scripts_loaded() before using the client.') + + processed_requests = [] + + delta_pending = 0 + delta_total = 0 + + added_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' + handled_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' + queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' + data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' + + requests_by_unique_key = {req.unique_key: req for req in requests} + unique_keys = list(requests_by_unique_key.keys()) + async with self._get_pipeline(with_execute=False) as pipe: + await await_redis_response(pipe.bf().mexists(added_bloom_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().mexists(handled_bloom_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + + results = await pipe.execute() + + added_flags = results[0] + handled_flags = results[1] + + new_unique_keys = [] + new_request_data = {} + delta_pending = 0 + delta_total = 0 + + for i, unique_key in enumerate(unique_keys): + # Already handled - skip + if handled_flags[i]: + processed_requests.append( + ProcessedRequest( + unique_key=unique_key, + was_already_present=True, + was_already_handled=True, + ) + ) + continue + + # Already in queue - skip + if added_flags[i]: + processed_requests.append( + ProcessedRequest( + unique_key=unique_key, + was_already_present=True, + was_already_handled=False, + ) + ) + continue + + # New request - will add to queue + request = requests_by_unique_key[unique_key] + + new_unique_keys.append(unique_key) + new_request_data[unique_key] = request.model_dump_json() + + if new_unique_keys: + script_results = await self._add_requests_script( + keys=[added_bloom_filter_key, queue_key, data_key], + args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)], + ) + actually_added = set(json.loads(script_results)) + + delta_pending = len(actually_added) + delta_total = len(actually_added) + + for unique_key in new_unique_keys: + if unique_key in actually_added: + processed_requests.append( + ProcessedRequest( + unique_key=unique_key, + was_already_present=False, + was_already_handled=False, + ) + ) + else: + processed_requests.append( + ProcessedRequest( + unique_key=unique_key, + was_already_present=True, + was_already_handled=False, + ) + ) + + async with self._get_pipeline() as pipe: + await self._update_metadata( + pipe, + update_accessed_at=True, + update_modified_at=True, + delta_pending_request_count=delta_pending, + delta_total_request_count=delta_total, + ) + + return AddRequestsResponse( + processed_requests=processed_requests, + unprocessed_requests=[], + ) + + @override + async def fetch_next_request(self) -> Request | None: + if self._pending_fetch_cache: + return self._pending_fetch_cache.popleft() + + # Mypy workaround + if self._fetch_script is None: + raise RuntimeError('Scripts not loaded. 
Call _ensure_scripts_loaded() before using the client.') + + queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' + in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' + data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' + + blocked_until_timestamp = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME + + requests_json = await self._fetch_script( + keys=[queue_key, in_progress_key, data_key], + args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE], + ) + + async with self._get_pipeline() as pipe: + await self._update_metadata(pipe, update_accessed_at=True) + + if not requests_json: + return None + + requests = [Request.model_validate_json(req_json) for req_json in requests_json] + + self._pending_fetch_cache.extend(requests[1:]) + + return requests[0] + + async def _reclaim_stale_requests(self) -> None: + # Mypy workaround + if self._reclaim_stale_script is None: + raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.') + + in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' + queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' + data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' + + current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + + await self._reclaim_stale_script(keys=[in_progress_key, queue_key, data_key], args=[current_time]) + + @override + async def get_request(self, unique_key: str) -> Request | None: + data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' + + request_data = await await_redis_response(self._redis.hget(data_key, unique_key)) + + if isinstance(request_data, (str, bytes, bytearray)): + return Request.model_validate_json(request_data) + + return None + + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + # Check if the request is in progress. 
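+        # Only a request currently leased (present in the in-progress hash) may be marked as handled;
+        # its unique key is then added to the handled Bloom filter, which permanently prevents the
+        # same key from being enqueued again.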
+ in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' + handled_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' + data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' + + check_in_progress = await await_redis_response(self._redis.hexists(in_progress_key, request.unique_key)) + if not check_in_progress: + logger.warning(f'Attempted to mark request {request.unique_key} as handled, but it is not in progress.') + return None + + async with self._get_pipeline() as pipe: + await await_redis_response(pipe.bf().add(handled_bloom_filter_key, request.unique_key))  # type: ignore[no-untyped-call] + + await await_redis_response(pipe.hdel(in_progress_key, request.unique_key)) + await await_redis_response(pipe.hdel(data_key, request.unique_key)) + + await self._update_metadata( + pipe, + update_accessed_at=True, + update_modified_at=True, + delta_handled_request_count=1, + delta_pending_request_count=-1, + ) + + return ProcessedRequest( + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=True, + ) + + @override + async def reclaim_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest | None: + in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' + queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' + + check_in_progress = await await_redis_response(self._redis.hexists(in_progress_key, request.unique_key)) + if not check_in_progress: + logger.info(f'Attempted to reclaim request {request.unique_key}, but it is not in progress.') + return None + + async with self._get_pipeline() as pipe: + if forefront: + blocked_until_timestamp = ( + int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME + ) + + await await_redis_response( + pipe.hset( + in_progress_key, + request.unique_key, + f'{{"client_id":"{self.client_key}","blocked_until_timestamp":{blocked_until_timestamp}}}', + ) + ) + self._pending_fetch_cache.appendleft(request) + else: + await await_redis_response(pipe.rpush(queue_key, request.unique_key)) + await await_redis_response(pipe.hdel(in_progress_key, request.unique_key)) + await self._update_metadata( + pipe, + update_modified_at=True, + update_accessed_at=True, + ) + + return ProcessedRequest( + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ) + + @override + async def is_empty(self) -> bool: + """Check if the queue is empty. + + Returns: + True if the queue is empty, False otherwise. + """ + if self._pending_fetch_cache: + return False + + metadata = await self.get_metadata() + + return metadata.pending_request_count == 0 + + async def _update_metadata( + self, + pipeline: Pipeline, + *, + update_accessed_at: bool = False, + update_modified_at: bool = False, + delta_handled_request_count: int | None = None, + new_handled_request_count: int | None = None, + delta_pending_request_count: int | None = None, + new_pending_request_count: int | None = None, + delta_total_request_count: int | None = None, + new_total_request_count: int | None = None, + update_had_multiple_clients: bool = False, + ) -> None: + """Update the request queue metadata with current information. + + Args: + pipeline: The Redis pipeline to use for the update. + update_accessed_at: If True, update the `accessed_at` timestamp to the current time. + update_modified_at: If True, update the `modified_at` timestamp to the current time. + new_handled_request_count: If provided, update the handled_request_count to this value.
+ new_pending_request_count: If provided, update the pending_request_count to this value. + new_total_request_count: If provided, update the total_request_count to this value. + delta_handled_request_count: If provided, add this value to the handled_request_count. + delta_pending_request_count: If provided, add this value to the pending_request_count. + delta_total_request_count: If provided, add this value to the total_request_count. + update_had_multiple_clients: If True, set had_multiple_clients to True. + """ + metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' + now = datetime.now(timezone.utc) + + if update_accessed_at: + await await_redis_response( + pipeline.json().set(metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True) + ) + if update_modified_at: + await await_redis_response( + pipeline.json().set(metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True) + ) + if new_pending_request_count is not None: + await await_redis_response( + pipeline.json().set( + metadata_key, '$.pending_request_count', new_pending_request_count, nx=False, xx=True + ) + ) + elif delta_pending_request_count is not None: + await await_redis_response( + pipeline.json().numincrby(metadata_key, '$.pending_request_count', delta_pending_request_count) + ) + + if new_handled_request_count is not None: + await await_redis_response( + pipeline.json().set( + metadata_key, '$.handled_request_count', new_handled_request_count, nx=False, xx=True + ) + ) + elif delta_handled_request_count is not None: + await await_redis_response( + pipeline.json().numincrby(metadata_key, '$.handled_request_count', delta_handled_request_count) + ) + + if new_total_request_count is not None: + await await_redis_response( + pipeline.json().set(metadata_key, '$.total_request_count', new_total_request_count, nx=False, xx=True) + ) + elif delta_total_request_count is not None: + await await_redis_response( + pipeline.json().numincrby(metadata_key, '$.total_request_count', delta_total_request_count) + ) + + if update_had_multiple_clients: + await await_redis_response( + pipeline.json().set( + metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True + ) + ) diff --git a/src/crawlee/storage_clients/_redis/_storage_client.py b/src/crawlee/storage_clients/_redis/_storage_client.py new file mode 100644 index 0000000000..860738b3da --- /dev/null +++ b/src/crawlee/storage_clients/_redis/_storage_client.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +from redis.asyncio import Redis +from typing_extensions import override + +from crawlee._utils.docs import docs_group +from crawlee.configuration import Configuration +from crawlee.storage_clients._base import StorageClient + +from ._dataset_client import RedisDatasetClient +from ._key_value_store_client import RedisKeyValueStoreClient +from ._request_queue_client import RedisRequestQueueClient + + +@docs_group('Storage clients') +class RedisStorageClient(StorageClient): + """Redis implementation of the storage client. + + This storage client provides access to datasets, key-value stores, and request queues that persist data + to a Redis database. Each storage type uses a different key pattern to store and retrieve data. + + The client accepts either a Redis connection string or a pre-configured `Redis` client instance; exactly + one of the two must be provided. + + Storage keys and metadata are created automatically when a storage is first opened. All storage types + rely on the RedisJSON module, and request queues additionally use RedisBloom; both ship with Redis Stack. + + Warning: + This is an experimental feature. The behavior and interface may change in future versions. + """ + + def __init__( + self, + *, + connection_string: str | None = None, + redis: Redis | None = None, + ) -> None: + """Initialize the Redis storage client. + + Args: + connection_string: Redis connection string used to create a client via `Redis.from_url`. + redis: Pre-configured Redis client instance. + """ + if redis is not None and connection_string is not None: + raise ValueError('Either redis or connection_string must be provided, not both.') + + if redis is None and connection_string is None: + raise ValueError('Either redis or connection_string must be provided.') + + if redis is not None: + self._redis = redis + + elif connection_string is not None: + self._redis = Redis.from_url(connection_string) + + @override + async def create_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> RedisDatasetClient: + """Create or open a Redis dataset client. + + Args: + id: Specific dataset ID to open. If provided, name is ignored. + name: Dataset name to open or create. Uses 'default' if not specified. + configuration: Configuration object. Uses global config if not provided. + + Returns: + Configured dataset client ready for use. + """ + configuration = configuration or Configuration.get_global_configuration() + + client = await RedisDatasetClient.open( + id=id, + name=name, + redis=self._redis, + ) + + await self._purge_if_needed(client, configuration) + return client + + @override + async def create_kvs_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> RedisKeyValueStoreClient: + """Create or open a Redis key-value store client. + + Args: + id: Specific store ID to open. If provided, name is ignored. + name: Store name to open or create. Uses 'default' if not specified. + configuration: Configuration object. Uses global config if not provided. + + Returns: + Configured key-value store client ready for use. + """ + configuration = configuration or Configuration.get_global_configuration() + + client = await RedisKeyValueStoreClient.open( + id=id, + name=name, + redis=self._redis, + ) + + await self._purge_if_needed(client, configuration) + return client + + @override + async def create_rq_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> RedisRequestQueueClient: + """Create or open a Redis request queue client. + + Args: + id: Specific queue ID to open. If provided, name is ignored. + name: Queue name to open or create. Uses 'default' if not specified. + configuration: Configuration object. Uses global config if not provided. + + Returns: + Configured request queue client ready for use.
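+
+        A minimal usage sketch (the connection string is a placeholder for a Redis Stack instance):
+
+        ```python
+        from crawlee.storage_clients import RedisStorageClient
+
+        storage_client = RedisStorageClient(connection_string='redis://localhost:6379/0')
+        rq_client = await storage_client.create_rq_client(name='my-queue')
+        ```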
+ """ + configuration = configuration or Configuration.get_global_configuration() + + client = await RedisRequestQueueClient.open( + id=id, + name=name, + redis=self._redis, + ) + + await self._purge_if_needed(client, configuration) + return client diff --git a/src/crawlee/storage_clients/_redis/_utils.py b/src/crawlee/storage_clients/_redis/_utils.py new file mode 100644 index 0000000000..364b103877 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/_utils.py @@ -0,0 +1,22 @@ +from collections.abc import Awaitable +from pathlib import Path +from typing import TypeVar, overload + +T = TypeVar('T') + + +@overload +async def await_redis_response(response: Awaitable[T]) -> T: ... +@overload +async def await_redis_response(response: T) -> T: ... + + +async def await_redis_response(response: Awaitable[T] | T) -> T: + """Solve the problem of ambiguous typing for redis.""" + return await response if isinstance(response, Awaitable) else response + + +def read_lua_script(file_path: Path) -> str: + """Read a Lua script from a file.""" + with file_path.open('r', encoding='utf-8') as file: + return file.read() diff --git a/src/crawlee/storage_clients/_redis/lua_scripts/atomic_add_requests.lua b/src/crawlee/storage_clients/_redis/lua_scripts/atomic_add_requests.lua new file mode 100644 index 0000000000..096bec9414 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/lua_scripts/atomic_add_requests.lua @@ -0,0 +1,36 @@ +local added_filter_key = KEYS[1] +local queue_key = KEYS[2] +local data_key = KEYS[3] + +local forefront = ARGV[1] == '1' +local unique_keys = cjson.decode(ARGV[2]) +local requests_data = cjson.decode(ARGV[3]) + +-- Add and check which unique keys are actually new using Bloom filter +local bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys)) + +local actually_added = {} +local hset_args = {} + +-- Process the results +for i, unique_key in ipairs(unique_keys) do + if bf_results[i] == 1 then + -- This key was added by us (did not exist before) + table.insert(hset_args, unique_key) + table.insert(hset_args, requests_data[unique_key]) + table.insert(actually_added, unique_key) + end +end + +-- Add only those that are actually new +if #actually_added > 0 then + redis.call('hset', data_key, unpack(hset_args)) + + if forefront then + redis.call('lpush', queue_key, unpack(actually_added)) + else + redis.call('rpush', queue_key, unpack(actually_added)) + end +end + +return cjson.encode(actually_added) diff --git a/src/crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua b/src/crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua new file mode 100644 index 0000000000..234116dc04 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua @@ -0,0 +1,49 @@ +local queue_key = KEYS[1] +local in_progress_key = KEYS[2] +local data_key = KEYS[3] +local client_id = ARGV[1] +local blocked_until_timestamp = ARGV[2] +local batch_size = tonumber(ARGV[3]) + +-- Pop batch unique_key from queue +local batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size) +if not batch_result then + return nil +end +local unique_keys = batch_result[2] + +-- Get requests data +local requests_data = redis.call('HMGET', data_key, unpack(unique_keys)) +if not requests_data then + -- Data missing, skip this request + return nil +end + +-- Prepare results and update in_progress +local final_result = {} +local in_progress_hmset = {} +local pending_decrement = 0 +local in_progress_data = cjson.encode({ + client_id = client_id, 
+ blocked_until_timestamp = tonumber(blocked_until_timestamp) +}) +for i = 1, #unique_keys do + local unique_key = unique_keys[i] + local request_data = requests_data[i] + + if request_data then + -- Add to in_progress hash + table.insert(in_progress_hmset, unique_key) + table.insert(in_progress_hmset, in_progress_data) + + table.insert(final_result, request_data) + end +end + +-- Update in_progress hash +if #in_progress_hmset > 0 then + redis.call('HMSET', in_progress_key, unpack(in_progress_hmset)) +end + +-- Return result with requests data +return final_result diff --git a/src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua b/src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua new file mode 100644 index 0000000000..a6d9434e00 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua @@ -0,0 +1,38 @@ +local in_progress_key = KEYS[1] +local queue_key = KEYS[2] +local data_key = KEYS[3] +local current_time = tonumber(ARGV[1]) + +local max_reclaim = 1000 + +local cursor = "0" +local count = 0 + +repeat + local result = redis.call('hscan', in_progress_key, cursor, 'COUNT', 100) + cursor = result[1] + local entries = result[2] + + for i = 1, #entries, 2 do + if count >= max_reclaim then + break + end + + local unique_key = entries[i] + local data = cjson.decode(entries[i + 1]) + + -- Check if timed out + if current_time > data.blocked_until_timestamp then + -- Atomically remove from in_progress and add back to queue + local req_obj = cjson.decode(redis.call('hget', data_key, unique_key) or '{}') + redis.call('hdel', in_progress_key, unique_key) + if req_obj.forefront then + redis.call('lpush', queue_key, unique_key) + else + redis.call('rpush', queue_key, unique_key) + end + count = count + 1 + end + end +until cursor == "0" or count >= max_reclaim + +return count diff --git a/src/crawlee/storage_clients/_redis/py.typed b/src/crawlee/storage_clients/_redis/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index b4f75bc6b4..2af5dbfa2b 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -8,7 +8,7 @@ import pytest from crawlee.configuration import Configuration -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, RedisStorageClient from crawlee.storages import Dataset, KeyValueStore if TYPE_CHECKING: @@ -19,12 +19,15 @@ from crawlee.storage_clients import StorageClient -@pytest.fixture(params=['memory', 'file_system']) +@pytest.fixture(params=['memory', 'file_system', 'redis']) def storage_client(request: pytest.FixtureRequest) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() + if request.param == 'redis': + return RedisStorageClient(connection_string='redis://localhost:6379/0') + return FileSystemStorageClient() diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 25bbcb4fc0..60250b32e3 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -9,7 +9,7 @@ import pytest from crawlee.configuration import Configuration -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient,
RedisStorageClient from crawlee.storages import KeyValueStore if TYPE_CHECKING: @@ -19,12 +19,15 @@ from crawlee.storage_clients import StorageClient -@pytest.fixture(params=['memory', 'file_system']) +@pytest.fixture(params=['memory', 'file_system', 'redis']) def storage_client(request: pytest.FixtureRequest) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() + if request.param == 'redis': + return RedisStorageClient(connection_string='redis://localhost:6379/0') + return FileSystemStorageClient() diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 7227504a95..0a1cb743ef 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -10,7 +10,7 @@ from crawlee import Request, service_locator from crawlee.configuration import Configuration -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, RedisStorageClient, StorageClient from crawlee.storages import RequestQueue if TYPE_CHECKING: @@ -20,12 +20,15 @@ from crawlee.storage_clients import StorageClient -@pytest.fixture(params=['memory', 'file_system']) +@pytest.fixture(params=['memory', 'file_system', 'redis']) def storage_client(request: pytest.FixtureRequest) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() + if request.param == 'redis': + return RedisStorageClient(connection_string='redis://localhost:6379/0') + return FileSystemStorageClient() diff --git a/uv.lock b/uv.lock index 3223fc9229..8f75f08754 100644 --- a/uv.lock +++ b/uv.lock @@ -86,6 +86,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/ed/e97229a566617f2ae958a6b13e7cc0f585470eac730a73e9e82c32a3cdd2/arrow-1.3.0-py3-none-any.whl", hash = "sha256:c728b120ebc00eb84e01882a6f5e7927a53960aa990ce7dd2b10f39005a67f80", size = 66419, upload-time = "2023-09-30T22:11:16.072Z" }, ] +[[package]] +name = "async-timeout" +version = "5.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274, upload-time = "2024-11-06T16:41:39.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" }, +] + [[package]] name = "backports-asyncio-runner" version = "1.2.0" @@ -620,6 +629,7 @@ all = [ { name = "opentelemetry-semantic-conventions" }, { name = "parsel" }, { name = "playwright" }, + { name = "redis", extra = ["hiredis"] }, { name = "rich" }, { name = "scikit-learn" }, { name = "typer" }, @@ -660,6 +670,9 @@ playwright = [ { name = "browserforge" }, { name = "playwright" }, ] +redis = [ + { name = "redis", extra = ["hiredis"] }, +] [package.dev-dependencies] dev = [ @@ -697,7 +710,7 @@ requires-dist = [ { name = "cachetools", specifier = ">=5.5.0" }, { name = "colorama", specifier = ">=0.4.0" }, { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" }, - { name = 
"crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel"], marker = "extra == 'all'" }, + { name = "crawlee", extras = ["adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "redis"], marker = "extra == 'all'" }, { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, { name = "httpx", extras = ["brotli", "http2", "zstd"], marker = "extra == 'httpx'", specifier = ">=0.27.0" }, @@ -719,6 +732,7 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2" }, { name = "pydantic-settings", specifier = ">=2.2.0,!=2.7.0,!=2.7.1,!=2.8.0" }, { name = "pyee", specifier = ">=9.0.0" }, + { name = "redis", extras = ["hiredis"], marker = "extra == 'redis'", specifier = ">=6.4.0" }, { name = "rich", marker = "extra == 'cli'", specifier = ">=13.9.0" }, { name = "scikit-learn", marker = "extra == 'adaptive-crawler'", specifier = ">=1.6.0" }, { name = "tldextract", specifier = ">=5.1.0" }, @@ -727,7 +741,7 @@ requires-dist = [ { name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" }, { name = "yarl", specifier = ">=1.18.0" }, ] -provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel"] +provides-extras = ["all", "adaptive-crawler", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "redis"] [package.metadata.requires-dev] dev = [ @@ -1066,6 +1080,80 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, ] +[[package]] +name = "hiredis" +version = "3.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f7/08/24b72f425b75e1de7442fb1740f69ca66d5820b9f9c0e2511ff9aadab3b7/hiredis-3.2.1.tar.gz", hash = "sha256:5a5f64479bf04dd829fe7029fad0ea043eac4023abc6e946668cbbec3493a78d", size = 89096, upload-time = "2025-05-23T11:41:57.227Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/12/e797b676d65b86d9ad56f434cb4548b1bd0ebf531cd2e36ef74c5cd46dcd/hiredis-3.2.1-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:add17efcbae46c5a6a13b244ff0b4a8fa079602ceb62290095c941b42e9d5dec", size = 82441, upload-time = "2025-05-23T11:39:36.142Z" }, + { url = "https://files.pythonhosted.org/packages/d3/04/45783d5cf6e7430b1c67d64a7919ee45381e8b98d6d4578516579c5a4420/hiredis-3.2.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:5fe955cc4f66c57df1ae8e5caf4de2925d43b5efab4e40859662311d1bcc5f54", size = 45235, upload-time = "2025-05-23T11:39:37.49Z" }, + { url = "https://files.pythonhosted.org/packages/d5/97/7f50bad0b8213a3ee7780e295cd3d5e3db2839de2a6342b3c0ceeaf8e0af/hiredis-3.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f9ad63cd9065820a43fb1efb8ed5ae85bb78f03ef5eb53f6bde47914708f5718", size = 43250, upload-time = "2025-05-23T11:39:38.518Z" }, + { url = "https://files.pythonhosted.org/packages/51/d0/38d4b5bf36bfd010fdfd460c53efc0aaef7c81d6c20f4041ca35e26a1e12/hiredis-3.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e7f9e5fdba08841d78d4e1450cae03a4dbed2eda8a4084673cafa5615ce24a", size = 
168996, upload-time = "2025-05-23T11:39:39.563Z" }, + { url = "https://files.pythonhosted.org/packages/99/22/4e2e9fde2b2efcf9847a2442a21f404c4112c57cccd6a09e564524dd70f3/hiredis-3.2.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1dce2508eca5d4e47ef38bc7c0724cb45abcdb0089f95a2ef49baf52882979a8", size = 165508, upload-time = "2025-05-23T11:39:40.723Z" }, + { url = "https://files.pythonhosted.org/packages/98/d0/b05bc8d4f339abaa455a9e677fc5223e25cd97630e66a2da0ad25e67b131/hiredis-3.2.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186428bf353e4819abae15aa2ad64c3f40499d596ede280fe328abb9e98e72ce", size = 180109, upload-time = "2025-05-23T11:39:41.865Z" }, + { url = "https://files.pythonhosted.org/packages/e3/ca/6df2cf488792ace30ee525a5444e12f432cc1da4acb47756ea5de265ea80/hiredis-3.2.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:74f2500d90a0494843aba7abcdc3e77f859c502e0892112d708c02e1dcae8f90", size = 169161, upload-time = "2025-05-23T11:39:43.432Z" }, + { url = "https://files.pythonhosted.org/packages/15/8b/afcef7a30bf5b94936264edb7daaf12a165f2b57007e384a57ac48411886/hiredis-3.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32822a94d2fdd1da96c05b22fdeef6d145d8fdbd865ba2f273f45eb949e4a805", size = 169485, upload-time = "2025-05-23T11:39:45.008Z" }, + { url = "https://files.pythonhosted.org/packages/43/14/3443dee27bd20f2ac88a759b67b29e7f3756a9a38bbe8084de049dfc5cac/hiredis-3.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ead809fb08dd4fdb5b4b6e2999c834e78c3b0c450a07c3ed88983964432d0c64", size = 163644, upload-time = "2025-05-23T11:39:46.755Z" }, + { url = "https://files.pythonhosted.org/packages/3f/24/8a3cee0f08071af0a9632ca81a057fe2b638e7b6956c9b5704a2049c1305/hiredis-3.2.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b90fada20301c3a257e868dd6a4694febc089b2b6d893fa96a3fc6c1f9ab4340", size = 162180, upload-time = "2025-05-23T11:39:47.939Z" }, + { url = "https://files.pythonhosted.org/packages/bd/2c/34cb6e665535dce1cbb7077cb9cc608198f254050241b5e232d62393f6a7/hiredis-3.2.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:6d8bff53f526da3d9db86c8668011e4f7ca2958ee3a46c648edab6fe2cd1e709", size = 174369, upload-time = "2025-05-23T11:39:49.13Z" }, + { url = "https://files.pythonhosted.org/packages/f8/24/96702f71991d884412d7ac89577ad9caa28875e2e309f53751b8c5f969be/hiredis-3.2.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:043d929ae262d03e1db0f08616e14504a9119c1ff3de13d66f857d85cd45caff", size = 166511, upload-time = "2025-05-23T11:39:50.232Z" }, + { url = "https://files.pythonhosted.org/packages/de/d0/8d3753244bdea37ab1700db8eec220df8361d0e3f72b9b5314ce4a0471ac/hiredis-3.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8d470fef39d02dbe5c541ec345cc4ffd7d2baec7d6e59c92bd9d9545dc221829", size = 164329, upload-time = "2025-05-23T11:39:51.365Z" }, + { url = "https://files.pythonhosted.org/packages/44/2e/28b5fffd2872e51182aec94992ff34641b6aab00c135e21da1d2f6c8c99b/hiredis-3.2.1-cp310-cp310-win32.whl", hash = "sha256:efa4c76c45cc8c42228c7989b279fa974580e053b5e6a4a834098b5324b9eafa", size = 20401, upload-time = "2025-05-23T11:39:52.4Z" }, + { url = "https://files.pythonhosted.org/packages/62/14/cbad8202ca7996686d51a779a552fb9d16a59c4fe60b68b076907a8a44f0/hiredis-3.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:cbac5ec3a620b095c46ef3a8f1f06da9c86c1cdc411d44a5f538876c39a2b321", size = 22076, upload-time = "2025-05-23T11:39:53.229Z" }, + { url = 
"https://files.pythonhosted.org/packages/48/84/2ea9636f2ba0811d9eb3bebbbfa84f488238180ddab70c9cb7fa13419d78/hiredis-3.2.1-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:e4ae0be44cab5e74e6e4c4a93d04784629a45e781ff483b136cc9e1b9c23975c", size = 82425, upload-time = "2025-05-23T11:39:54.135Z" }, + { url = "https://files.pythonhosted.org/packages/fc/24/b9ebf766a99998fda3975937afa4912e98de9d7f8d0b83f48096bdd961c1/hiredis-3.2.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:24647e84c9f552934eb60b7f3d2116f8b64a7020361da9369e558935ca45914d", size = 45231, upload-time = "2025-05-23T11:39:55.455Z" }, + { url = "https://files.pythonhosted.org/packages/68/4c/c009b4d9abeb964d607f0987561892d1589907f770b9e5617552b34a4a4d/hiredis-3.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6fb3e92d1172da8decc5f836bf8b528c0fc9b6d449f1353e79ceeb9dc1801132", size = 43240, upload-time = "2025-05-23T11:39:57.8Z" }, + { url = "https://files.pythonhosted.org/packages/e9/83/d53f3ae9e4ac51b8a35afb7ccd68db871396ed1d7c8ba02ce2c30de0cf17/hiredis-3.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38ba7a32e51e518b6b3e470142e52ed2674558e04d7d73d86eb19ebcb37d7d40", size = 169624, upload-time = "2025-05-23T11:40:00.055Z" }, + { url = "https://files.pythonhosted.org/packages/91/2f/f9f091526e22a45385d45f3870204dc78aee365b6fe32e679e65674da6a7/hiredis-3.2.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4fc632be73174891d6bb71480247e57b2fd8f572059f0a1153e4d0339e919779", size = 165799, upload-time = "2025-05-23T11:40:01.194Z" }, + { url = "https://files.pythonhosted.org/packages/1c/cc/e561274438cdb19794f0638136a5a99a9ca19affcb42679b12a78016b8ad/hiredis-3.2.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f03e6839ff21379ad3c195e0700fc9c209e7f344946dea0f8a6d7b5137a2a141", size = 180612, upload-time = "2025-05-23T11:40:02.385Z" }, + { url = "https://files.pythonhosted.org/packages/83/ba/a8a989f465191d55672e57aea2a331bfa3a74b5cbc6f590031c9e11f7491/hiredis-3.2.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99983873e37c71bb71deb544670ff4f9d6920dab272aaf52365606d87a4d6c73", size = 169934, upload-time = "2025-05-23T11:40:03.524Z" }, + { url = "https://files.pythonhosted.org/packages/52/5f/1148e965df1c67b17bdcaef199f54aec3def0955d19660a39c6ee10a6f55/hiredis-3.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffd982c419f48e3a57f592678c72474429465bb4bfc96472ec805f5d836523f0", size = 170074, upload-time = "2025-05-23T11:40:04.618Z" }, + { url = "https://files.pythonhosted.org/packages/43/5e/e6846ad159a938b539fb8d472e2e68cb6758d7c9454ea0520211f335ea72/hiredis-3.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bc993f4aa4abc029347f309e722f122e05a3b8a0c279ae612849b5cc9dc69f2d", size = 164158, upload-time = "2025-05-23T11:40:05.653Z" }, + { url = "https://files.pythonhosted.org/packages/0a/a1/5891e0615f0993f194c1b51a65aaac063b0db318a70df001b28e49f0579d/hiredis-3.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:dde790d420081f18b5949227649ccb3ed991459df33279419a25fcae7f97cd92", size = 162591, upload-time = "2025-05-23T11:40:07.041Z" }, + { url = "https://files.pythonhosted.org/packages/d4/da/8bce52ca81716f53c1014f689aea4c170ba6411e6848f81a1bed1fc375eb/hiredis-3.2.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:b0c8cae7edbef860afcf3177b705aef43e10b5628f14d5baf0ec69668247d08d", size = 174808, upload-time = "2025-05-23T11:40:09.146Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/91/fc1ef444ed4dc432b5da9b48e9bd23266c703528db7be19e2b608d67ba06/hiredis-3.2.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e8a90eaca7e1ce7f175584f07a2cdbbcab13f4863f9f355d7895c4d28805f65b", size = 167060, upload-time = "2025-05-23T11:40:10.757Z" }, + { url = "https://files.pythonhosted.org/packages/66/ad/beebf73a5455f232b97e00564d1e8ad095d4c6e18858c60c6cfdd893ac1e/hiredis-3.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:476031958fa44e245e803827e0787d49740daa4de708fe514370293ce519893a", size = 164833, upload-time = "2025-05-23T11:40:12.001Z" }, + { url = "https://files.pythonhosted.org/packages/75/79/a9591bdc0148c0fbdf54cf6f3d449932d3b3b8779e87f33fa100a5a8088f/hiredis-3.2.1-cp311-cp311-win32.whl", hash = "sha256:eb3f5df2a9593b4b4b676dce3cea53b9c6969fc372875188589ddf2bafc7f624", size = 20402, upload-time = "2025-05-23T11:40:13.216Z" }, + { url = "https://files.pythonhosted.org/packages/9f/05/c93cc6fab31e3c01b671126c82f44372fb211facb8bd4571fd372f50898d/hiredis-3.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:1402e763d8a9fdfcc103bbf8b2913971c0a3f7b8a73deacbda3dfe5f3a9d1e0b", size = 22085, upload-time = "2025-05-23T11:40:14.19Z" }, + { url = "https://files.pythonhosted.org/packages/60/a1/6da1578a22df1926497f7a3f6a3d2408fe1d1559f762c1640af5762a8eb6/hiredis-3.2.1-cp312-cp312-macosx_10_15_universal2.whl", hash = "sha256:3742d8b17e73c198cabeab11da35f2e2a81999d406f52c6275234592256bf8e8", size = 82627, upload-time = "2025-05-23T11:40:15.362Z" }, + { url = "https://files.pythonhosted.org/packages/6c/b1/1056558ca8dc330be5bb25162fe5f268fee71571c9a535153df9f871a073/hiredis-3.2.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9c2f3176fb617a79f6cccf22cb7d2715e590acb534af6a82b41f8196ad59375d", size = 45404, upload-time = "2025-05-23T11:40:16.72Z" }, + { url = "https://files.pythonhosted.org/packages/58/4f/13d1fa1a6b02a99e9fed8f546396f2d598c3613c98e6c399a3284fa65361/hiredis-3.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a8bd46189c7fa46174e02670dc44dfecb60f5bd4b67ed88cb050d8f1fd842f09", size = 43299, upload-time = "2025-05-23T11:40:17.697Z" }, + { url = "https://files.pythonhosted.org/packages/c0/25/ddfac123ba5a32eb1f0b40ba1b2ec98a599287f7439def8856c3c7e5dd0d/hiredis-3.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f86ee4488c8575b58139cdfdddeae17f91e9a893ffee20260822add443592e2f", size = 172194, upload-time = "2025-05-23T11:40:19.143Z" }, + { url = "https://files.pythonhosted.org/packages/2c/1e/443a3703ce570b631ca43494094fbaeb051578a0ebe4bfcefde351e1ba25/hiredis-3.2.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3717832f4a557b2fe7060b9d4a7900e5de287a15595e398c3f04df69019ca69d", size = 168429, upload-time = "2025-05-23T11:40:20.329Z" }, + { url = "https://files.pythonhosted.org/packages/3b/d6/0d8c6c706ed79b2298c001b5458c055615e3166533dcee3900e821a18a3e/hiredis-3.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e5cb12c21fb9e2403d28c4e6a38120164973342d34d08120f2d7009b66785644", size = 182967, upload-time = "2025-05-23T11:40:21.921Z" }, + { url = "https://files.pythonhosted.org/packages/da/68/da8dd231fbce858b5a20ab7d7bf558912cd125f08bac4c778865ef5fe2c2/hiredis-3.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:080fda1510bbd389af91f919c11a4f2aa4d92f0684afa4709236faa084a42cac", size = 172495, upload-time = "2025-05-23T11:40:23.105Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/25/83a31420535e2778662caa95533d5c997011fa6a88331f0cdb22afea9ec3/hiredis-3.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1252e10a1f3273d1c6bf2021e461652c2e11b05b83e0915d6eb540ec7539afe2", size = 173142, upload-time = "2025-05-23T11:40:24.24Z" }, + { url = "https://files.pythonhosted.org/packages/41/d7/cb907348889eb75e2aa2e6b63e065b611459e0f21fe1e371a968e13f0d55/hiredis-3.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d9e320e99ab7d2a30dc91ff6f745ba38d39b23f43d345cdee9881329d7b511d6", size = 166433, upload-time = "2025-05-23T11:40:25.287Z" }, + { url = "https://files.pythonhosted.org/packages/01/5d/7cbc69d82af7b29a95723d50f5261555ba3d024bfbdc414bdc3d23c0defb/hiredis-3.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:641668f385f16550fdd6fdc109b0af6988b94ba2acc06770a5e06a16e88f320c", size = 164883, upload-time = "2025-05-23T11:40:26.454Z" }, + { url = "https://files.pythonhosted.org/packages/f9/00/f995b1296b1d7e0247651347aa230f3225a9800e504fdf553cf7cd001cf7/hiredis-3.2.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1e1f44208c39d6c345ff451f82f21e9eeda6fe9af4ac65972cc3eeb58d41f7cb", size = 177262, upload-time = "2025-05-23T11:40:27.576Z" }, + { url = "https://files.pythonhosted.org/packages/c5/f3/723a67d729e94764ce9e0d73fa5f72a0f87d3ce3c98c9a0b27cbf001cc79/hiredis-3.2.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f882a0d6415fffe1ffcb09e6281d0ba8b1ece470e866612bbb24425bf76cf397", size = 169619, upload-time = "2025-05-23T11:40:29.671Z" }, + { url = "https://files.pythonhosted.org/packages/45/58/f69028df00fb1b223e221403f3be2059ae86031e7885f955d26236bdfc17/hiredis-3.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b4e78719a0730ebffe335528531d154bc8867a246418f74ecd88adbc4d938c49", size = 167303, upload-time = "2025-05-23T11:40:30.902Z" }, + { url = "https://files.pythonhosted.org/packages/2b/7d/567411e65cce76cf265a9a4f837fd2ebc564bef6368dd42ac03f7a517c0a/hiredis-3.2.1-cp312-cp312-win32.whl", hash = "sha256:33c4604d9f79a13b84da79950a8255433fca7edaf292bbd3364fd620864ed7b2", size = 20551, upload-time = "2025-05-23T11:40:32.69Z" }, + { url = "https://files.pythonhosted.org/packages/90/74/b4c291eb4a4a874b3690ff9fc311a65d5292072556421b11b1d786e3e1d0/hiredis-3.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7b9749375bf9d171aab8813694f379f2cff0330d7424000f5e92890ad4932dc9", size = 22128, upload-time = "2025-05-23T11:40:33.686Z" }, + { url = "https://files.pythonhosted.org/packages/47/91/c07e737288e891c974277b9fa090f0a43c72ab6ccb5182117588f1c01269/hiredis-3.2.1-cp313-cp313-macosx_10_15_universal2.whl", hash = "sha256:7cabf7f1f06be221e1cbed1f34f00891a7bdfad05b23e4d315007dd42148f3d4", size = 82636, upload-time = "2025-05-23T11:40:35.035Z" }, + { url = "https://files.pythonhosted.org/packages/92/20/02cb1820360eda419bc17eb835eca976079e2b3e48aecc5de0666b79a54c/hiredis-3.2.1-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:db85cb86f8114c314d0ec6d8de25b060a2590b4713135240d568da4f7dea97ac", size = 45404, upload-time = "2025-05-23T11:40:36.113Z" }, + { url = "https://files.pythonhosted.org/packages/87/51/d30a4aadab8670ed9d40df4982bc06c891ee1da5cdd88d16a74e1ecbd520/hiredis-3.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c9a592a49b7b8497e4e62c3ff40700d0c7f1a42d145b71e3e23c385df573c964", size = 43301, upload-time = "2025-05-23T11:40:37.557Z" }, + { url = 
"https://files.pythonhosted.org/packages/f7/7b/2c613e1bb5c2e2bac36e8befeefdd58b42816befb17e26ab600adfe337fb/hiredis-3.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0079ef1e03930b364556b78548e67236ab3def4e07e674f6adfc52944aa972dd", size = 172486, upload-time = "2025-05-23T11:40:38.659Z" }, + { url = "https://files.pythonhosted.org/packages/1e/df/8f2c4fcc28d6f5178b25ee1ba2157cc473f9908c16ce4b8e0bdd79e38b05/hiredis-3.2.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d6a290ed45d9c14f4c50b6bda07afb60f270c69b5cb626fd23a4c2fde9e3da1", size = 168532, upload-time = "2025-05-23T11:40:39.843Z" }, + { url = "https://files.pythonhosted.org/packages/88/ae/d0864ffaa0461e29a6940a11c858daf78c99476c06ed531b41ad2255ec25/hiredis-3.2.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79dd5fe8c0892769f82949adeb021342ca46871af26e26945eb55d044fcdf0d0", size = 183216, upload-time = "2025-05-23T11:40:41.005Z" }, + { url = "https://files.pythonhosted.org/packages/75/17/558e831b77692d73f5bcf8b493ab3eace9f11b0aa08839cdbb87995152c7/hiredis-3.2.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998a82281a159f4aebbfd4fb45cfe24eb111145206df2951d95bc75327983b58", size = 172689, upload-time = "2025-05-23T11:40:42.153Z" }, + { url = "https://files.pythonhosted.org/packages/35/b9/4fccda21f930f08c5072ad51e825d85d457748138443d7b510afe77b8264/hiredis-3.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41fc3cd52368ffe7c8e489fb83af5e99f86008ed7f9d9ba33b35fec54f215c0a", size = 173319, upload-time = "2025-05-23T11:40:43.328Z" }, + { url = "https://files.pythonhosted.org/packages/3d/8b/596d613588b0a3c58dfcf9a17edc6a886c4de6a3096e27c7142a94e2304d/hiredis-3.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8d10df3575ce09b0fa54b8582f57039dcbdafde5de698923a33f601d2e2a246c", size = 166695, upload-time = "2025-05-23T11:40:44.453Z" }, + { url = "https://files.pythonhosted.org/packages/e7/5b/6a1c266e9f6627a8be1fa0d8622e35e35c76ae40cce6d1c78a7e6021184a/hiredis-3.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1ab010d04be33735ad8e643a40af0d68a21d70a57b1d0bff9b6a66b28cca9dbf", size = 165181, upload-time = "2025-05-23T11:40:45.697Z" }, + { url = "https://files.pythonhosted.org/packages/6c/70/a9b91fa70d21763d9dfd1c27ddd378f130749a0ae4a0645552f754b3d1fc/hiredis-3.2.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:ec3b5f9ea34f70aaba3e061cbe1fa3556fea401d41f5af321b13e326792f3017", size = 177589, upload-time = "2025-05-23T11:40:46.903Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c7/31bbb015156dc4441f6e19daa9598266a61445bf3f6e14c44292764638f6/hiredis-3.2.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:158dfb505fff6bffd17f823a56effc0c2a7a8bc4fb659d79a52782f22eefc697", size = 169883, upload-time = "2025-05-23T11:40:48.111Z" }, + { url = "https://files.pythonhosted.org/packages/89/44/cddc23379e0ce20ad7514b2adb2aa2c9b470ffb1ca0a2d8c020748962a22/hiredis-3.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9d632cd0ddd7895081be76748e6fb9286f81d2a51c371b516541c6324f2fdac9", size = 167585, upload-time = "2025-05-23T11:40:49.208Z" }, + { url = "https://files.pythonhosted.org/packages/48/92/8fc9b981ed01fc2bbac463a203455cd493482b749801bb555ebac72923f1/hiredis-3.2.1-cp313-cp313-win32.whl", hash = "sha256:e9726d03e7df068bf755f6d1ecc61f7fc35c6b20363c7b1b96f39a14083df940", size = 20554, upload-time = "2025-05-23T11:40:50.314Z" }, + { url = 
"https://files.pythonhosted.org/packages/e1/6e/e76341d68aa717a705a2ee3be6da9f4122a0d1e3f3ad93a7104ed7a81bea/hiredis-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:b5b1653ad7263a001f2e907e81a957d6087625f9700fa404f1a2268c0a4f9059", size = 22136, upload-time = "2025-05-23T11:40:51.497Z" }, + { url = "https://files.pythonhosted.org/packages/ed/f9/04a0a6c760d28e0b7d536646edacd6f5b4c979dd4c848621287bff5be9d0/hiredis-3.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:73913d2fa379e722d17ba52f21ce12dd578140941a08efd73e73b6fab1dea4d8", size = 40382, upload-time = "2025-05-23T11:41:34.425Z" }, + { url = "https://files.pythonhosted.org/packages/cd/1c/50fbce19cc5e393cf97a187462377d1c9441337684b3da1ed13ed0f20873/hiredis-3.2.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:15a3dff3eca31ecbf3d7d6d104cf1b318dc2b013bad3f4bdb2839cb9ea2e1584", size = 37760, upload-time = "2025-05-23T11:41:35.432Z" }, + { url = "https://files.pythonhosted.org/packages/5c/e6/d147636edf44e5267f9e4c3483cd8d6b027fd6cf008a003c932f5ff888f7/hiredis-3.2.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c78258032c2f9fc6f39fee7b07882ce26de281e09178266ce535992572132d95", size = 48738, upload-time = "2025-05-23T11:41:36.452Z" }, + { url = "https://files.pythonhosted.org/packages/97/b0/53c33900139149a9b85878c04748984987b62ee2583d452b4e4d578067a9/hiredis-3.2.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578d6a881e64e46db065256355594e680202c3bacf3270be3140057171d2c23e", size = 56254, upload-time = "2025-05-23T11:41:38.395Z" }, + { url = "https://files.pythonhosted.org/packages/9d/af/b49debecac06674a9ccb51353f497300199d6122a7612f56930872076147/hiredis-3.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b7f34b170093c077c972b8cc0ceb15d8ff88ad0079751a8ae9733e94d77e733", size = 48905, upload-time = "2025-05-23T11:41:39.92Z" }, + { url = "https://files.pythonhosted.org/packages/c6/a2/5aacf68320bfaf531afac73f62f4fc55140742a4725bf04929671ca5d1cc/hiredis-3.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:291a18b228fc90f6720d178de2fac46522082c96330b4cc2d3dd8cb2c1cb2815", size = 22184, upload-time = "2025-05-23T11:41:41.196Z" }, +] + [[package]] name = "hpack" version = "4.1.0" @@ -2584,6 +2672,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/10/e4b1e0e5b6b6745c8098c275b69bc9d73e9542d5c7da4f137542b499ed44/readchar-4.2.1-py3-none-any.whl", hash = "sha256:a769305cd3994bb5fa2764aa4073452dc105a4ec39068ffe6efd3c20c60acc77", size = 9350, upload-time = "2024-11-04T18:28:02.859Z" }, ] +[[package]] +name = "redis" +version = "6.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "async-timeout", marker = "python_full_version < '3.11.3'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0d/d6/e8b92798a5bd67d659d51a18170e91c16ac3b59738d91894651ee255ed49/redis-6.4.0.tar.gz", hash = "sha256:b01bc7282b8444e28ec36b261df5375183bb47a07eb9c603f284e89cbc5ef010", size = 4647399, upload-time = "2025-08-07T08:10:11.441Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl", hash = "sha256:f0544fa9604264e9464cdf4814e7d4830f74b165d52f2a330a760a88dd248b7f", size = 279847, upload-time = "2025-08-07T08:10:09.84Z" }, +] + +[package.optional-dependencies] +hiredis = [ + { name = "hiredis" }, +] + [[package]] name = "requests" version = "2.32.5" From 
fe3eee116880839f878b86649c33b3faacd23fc6 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sat, 13 Sep 2025 03:52:23 +0000 Subject: [PATCH 02/12] add fakeredis --- pyproject.toml | 1 + tests/unit/conftest.py | 6 + tests/unit/storages/test_dataset.py | 9 +- tests/unit/storages/test_key_value_store.py | 9 +- tests/unit/storages/test_request_queue.py | 9 +- uv.lock | 118 ++++++++++++++++++++ 6 files changed, 143 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8e00e1ad96..1a24ab1863 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,7 @@ dev = [ "apify_client", # For e2e tests. "build~=1.3.0", # For e2e tests. "dycw-pytest-only~=2.1.0", + "fakeredis[probabilistic,json,lua]>=2.31.0", "mypy~=1.17.0", "pre-commit~=4.3.0", "proxy-py~=2.4.0", diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index e57f190bc3..7d758e5ee5 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -9,6 +9,7 @@ import pytest from curl_cffi import CurlHttpVersion +from fakeredis import FakeAsyncRedis from proxy import Proxy from uvicorn.config import Config @@ -208,3 +209,8 @@ async def http_client(request: pytest.FixtureRequest) -> HttpClient: if request.param == 'impit': return ImpitHttpClient(http3=False) return HttpxHttpClient(http2=False) + + +@pytest.fixture +def redis_client() -> FakeAsyncRedis: + return FakeAsyncRedis() diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index 2af5dbfa2b..9726f60b20 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest @@ -16,17 +16,20 @@ from pathlib import Path from typing import Any + from fakeredis import FakeAsyncRedis + from redis.asyncio import Redis + from crawlee.storage_clients import StorageClient @pytest.fixture(params=['memory', 'file_system', 'redis']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: +def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() if request.param == 'redis': - return RedisStorageClient(connection_string='redis://localhost:6379/0') + return RedisStorageClient(redis=cast('Redis', redis_client)) return FileSystemStorageClient() diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 60250b32e3..ff7b2f61f6 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -4,7 +4,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest @@ -16,17 +16,20 @@ from collections.abc import AsyncGenerator from pathlib import Path + from fakeredis import FakeAsyncRedis + from redis.asyncio import Redis + from crawlee.storage_clients import StorageClient @pytest.fixture(params=['memory', 'file_system', 'redis']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: +def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() if request.param == 'redis': - return RedisStorageClient(connection_string='redis://localhost:6379/0') + return 
RedisStorageClient(redis=cast('Redis', redis_client)) return FileSystemStorageClient() diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 0a1cb743ef..76541afd7f 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -4,7 +4,7 @@ from __future__ import annotations import asyncio -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest @@ -17,17 +17,20 @@ from collections.abc import AsyncGenerator from pathlib import Path + from fakeredis import FakeAsyncRedis + from redis.asyncio import Redis + from crawlee.storage_clients import StorageClient @pytest.fixture(params=['memory', 'file_system', 'redis']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: +def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() if request.param == 'redis': - return RedisStorageClient(connection_string='redis://localhost:6379/0') + return RedisStorageClient(redis=cast('Redis', redis_client)) return FileSystemStorageClient() diff --git a/uv.lock b/uv.lock index 8f75f08754..154df0be34 100644 --- a/uv.lock +++ b/uv.lock @@ -679,6 +679,7 @@ dev = [ { name = "apify-client" }, { name = "build" }, { name = "dycw-pytest-only" }, + { name = "fakeredis", extra = ["json", "lua", "probabilistic"] }, { name = "mypy" }, { name = "pre-commit" }, { name = "proxy-py" }, @@ -748,6 +749,7 @@ dev = [ { name = "apify-client" }, { name = "build", specifier = "~=1.3.0" }, { name = "dycw-pytest-only", specifier = "~=2.1.0" }, + { name = "fakeredis", extras = ["probabilistic", "json", "lua"], specifier = ">=2.31.0" }, { name = "mypy", specifier = "~=1.17.0" }, { name = "pre-commit", specifier = "~=4.3.0" }, { name = "proxy-py", specifier = "~=2.4.0" }, @@ -938,6 +940,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/09/2aea36ff60d16dd8879bdb2f5b3ee0ba8d08cbbdcdfe870e695ce3784385/execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc", size = 40612, upload-time = "2024-04-08T09:04:17.414Z" }, ] +[[package]] +name = "fakeredis" +version = "2.31.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "redis" }, + { name = "sortedcontainers" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/54/65/433bf2dfa3d5c72d7339bffcd3a48d3d5ce4449af51804f55aa78e17149e/fakeredis-2.31.1.tar.gz", hash = "sha256:bba58475d6ba3846752d242921c5d3f6dc948066e0ddd054f3a448cd9a1aacad", size = 170681, upload-time = "2025-08-31T18:49:09.163Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b4/98/1637792209ec01bb115b4d9c58bcf9e6bc536f6d724fdc9c541f5b12cea7/fakeredis-2.31.1-py3-none-any.whl", hash = "sha256:1c0403dedc42bb0038649f016e1a8b56b4b1c69dfb13cf11f870dc51e5c5b4df", size = 118329, upload-time = "2025-08-31T18:49:07.829Z" }, +] + +[package.optional-dependencies] +json = [ + { name = "jsonpath-ng" }, +] +lua = [ + { name = "lupa" }, +] +probabilistic = [ + { name = "pyprobables" }, +] + [[package]] name = "filelock" version = "3.19.1" @@ -1405,6 +1432,70 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/e8/685f47e0d754320684db4425a0967f7d3fa70126bffd76110b7009a0090f/joblib-1.5.2-py3-none-any.whl", hash = 
"sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241", size = 308396, upload-time = "2025-08-27T12:15:45.188Z" }, ] +[[package]] +name = "jsonpath-ng" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ply" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/86/08646239a313f895186ff0a4573452038eed8c86f54380b3ebac34d32fb2/jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c", size = 37838, upload-time = "2024-10-11T15:41:42.404Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/5a/73ecb3d82f8615f32ccdadeb9356726d6cae3a4bbc840b437ceb95708063/jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6", size = 30105, upload-time = "2024-11-20T17:58:30.418Z" }, +] + +[[package]] +name = "lupa" +version = "2.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d5/51/4e20b597795b58f840289042d87d65f0e33cdc73653a4c1c7026346e1725/lupa-2.5.tar.gz", hash = "sha256:69c6a89f2b7b08a3040d7ed2a1eeccba37a31ddc92fa199339c53a2ae3c48c34", size = 7235982, upload-time = "2025-06-15T15:31:51.745Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/cb/3c82773eb581b8a95fae98eb5f0b7aa535c9fab04ac91da6ccab2bf2b6eb/lupa-2.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:102538780e8a6164944fff6bf93737d7cb8bf9e6f7146baa56184755fadb96d5", size = 909182, upload-time = "2025-06-15T15:28:59.668Z" }, + { url = "https://files.pythonhosted.org/packages/54/e3/3ad6a3aae96dfd8fe15d696c824772fc14740ccd677907ac07872a9f5e72/lupa-2.5-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d5235bf6880544f6b357513ba508f70a2d0363ae8bd94d696ba564b458435dbf", size = 1852437, upload-time = "2025-06-15T15:29:02.681Z" }, + { url = "https://files.pythonhosted.org/packages/f7/cf/2628b11289fafb8856f3b5a29f186e415978cf9a26b676931f6cb037ce32/lupa-2.5-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:05f7c091d59ef267e2572a7580c23093ce89894ed2755b68159a5a271b0b48eb", size = 964263, upload-time = "2025-06-15T15:29:04.697Z" }, + { url = "https://files.pythonhosted.org/packages/86/e1/18459bafaea54384305f1c0633d922197de7a349628fef5f5c83bad0a332/lupa-2.5-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:62a810cb5270dd3f983db49f65518c1c060e7575beb464b80feafbb6b54baba1", size = 1170422, upload-time = "2025-06-15T15:29:07.141Z" }, + { url = "https://files.pythonhosted.org/packages/bb/36/1e62c7f6da39b765742d3f104c0a0cb78fb562914d9a59c061194caf8f6b/lupa-2.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7d28842fcd98ef1f0b825ae1e0b9568710eb4c522fb5dffa53255024c7816b84", size = 1062304, upload-time = "2025-06-15T15:29:09.405Z" }, + { url = "https://files.pythonhosted.org/packages/cc/8c/a13bcec44aed28b0c05daddaa713e85a16386300f62fd10312e4e51baac2/lupa-2.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:82a845a5d93766fde05fc4094a27382f5253af97b699a36d496ca3cdf6afe769", size = 2105449, upload-time = "2025-06-15T15:29:12.093Z" }, + { url = "https://files.pythonhosted.org/packages/7d/2d/6c42afdf3eb2f4d8467cb70912df0db83d490584772db51f176475f831ca/lupa-2.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:366e98069d20164632d66cd8c0820fcb4d8fea1214364e1614d19bf69086e29f", size = 1085431, upload-time = 
"2025-06-15T15:29:14.489Z" }, + { url = "https://files.pythonhosted.org/packages/d8/fe/697f8846209230ca2948a9a78d65870e45e51772862313ad61dbfea23bbd/lupa-2.5-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c6133fa7d193e590f52266afedbeb55ae6dbb8c6def6f3a2074b10edfdb44727", size = 1196719, upload-time = "2025-06-15T15:29:16.761Z" }, + { url = "https://files.pythonhosted.org/packages/1d/cb/b77966f2229c29cbbdfa06f7b7cf59865cfe68fff8c4aa24ecea6cb5e0a1/lupa-2.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4b4a4a030e6a075e940a36311d44b4e2609778253ea10f933cf61786182cffed", size = 2201540, upload-time = "2025-06-15T15:29:19.459Z" }, + { url = "https://files.pythonhosted.org/packages/14/c3/e3e686f7b55292ad192c9cf7307fe1370c8969e348ecb987b0b314d9af75/lupa-2.5-cp310-cp310-win32.whl", hash = "sha256:9f6b2d6e2b909e8ca1a746286881e718864d44d862596e4aae769dd9f63efcda", size = 1414886, upload-time = "2025-06-15T15:29:21.659Z" }, + { url = "https://files.pythonhosted.org/packages/b1/6e/f44ba7d13c53bfd99af3b03f22ac01620e3ec3eb81ac059edd56ae314641/lupa-2.5-cp310-cp310-win_amd64.whl", hash = "sha256:58bb044c788ad72b6b017e1f1513f7c2ef2a92f0d76f1b11bb0344f6bc82c623", size = 1668093, upload-time = "2025-06-15T15:29:24.234Z" }, + { url = "https://files.pythonhosted.org/packages/d8/d8/6f648335b66adde88651ff65afa96fcf5db26b957e30886b9860ca680799/lupa-2.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0014935935862d3acf0f96dfa9ce6657720ece6918feaf76eb7c63948ba03a58", size = 919573, upload-time = "2025-06-15T15:29:26.136Z" }, + { url = "https://files.pythonhosted.org/packages/d7/d8/7510b877d2bd02e3b0aae70b9d771ac9ed6a9ac842b6d1b2dcebaff3557c/lupa-2.5-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:5249ac3eb11aeba0e25dbc20d53352cd04d2e48299a412d187200d492fd0fd63", size = 1871844, upload-time = "2025-06-15T15:29:28.217Z" }, + { url = "https://files.pythonhosted.org/packages/57/6b/ec6ae84ba9fb6c9ac8768e1eab9a403fffb5b348ed0eab6a7a0a8cc694c3/lupa-2.5-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:789acf7f98e1328a2744e19dd7cb92c08e2d6397d5b9e8810954d5192499d2ae", size = 973418, upload-time = "2025-06-15T15:29:30.676Z" }, + { url = "https://files.pythonhosted.org/packages/28/5b/1a9ffcc53ff3bb59d096eae95db462056a22ab253ede8678119d3f72eb76/lupa-2.5-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:858e40d67292e524416da37993ec08290f29ad804608457803d8c570970623a7", size = 1154453, upload-time = "2025-06-15T15:29:33.204Z" }, + { url = "https://files.pythonhosted.org/packages/41/81/13aac83263bd62860db8bb3db313beb008a8bc33eb0c498b2bb5dce6d827/lupa-2.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d126bea5b69778eeb15277b0f3c362604a5509bdab1fc768d8d4e4f97ec5a922", size = 1052241, upload-time = "2025-06-15T15:29:35.599Z" }, + { url = "https://files.pythonhosted.org/packages/a2/0f/77d1908a099c19c8aec23c095c398bb1f23f1fe3ef03801a7769066b6cf0/lupa-2.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:29688640ebb88c317a76e41111de360b0dd38e833949928d76792dba2ba5cb0a", size = 2085299, upload-time = "2025-06-15T15:29:38.156Z" }, + { url = "https://files.pythonhosted.org/packages/07/43/590d17a39f78e2da4a51734af21c16bd237eff674392acce51a010a702c0/lupa-2.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3e7a9ae6f39015b07c44c32fe76fe8c3997451c7fd30a6fc12da3a78de502480", size = 1075624, upload-time = "2025-06-15T15:29:40.706Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/86/65472ff813e46afa40d36533f6ea478cd6a8f88132ae66148338b79f54de/lupa-2.5-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fa8cd11211c965d4fd1a79897d407d2614e60910936b2c2522303488331a712e", size = 1184649, upload-time = "2025-06-15T15:29:42.905Z" }, + { url = "https://files.pythonhosted.org/packages/b9/5c/2d9594b603ba08e52a2eaf25e051157430b6e7dcd7d7f65811406d8c176e/lupa-2.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e6bc86dd2cc2e4751e7f44fd925a6a88da822136dc9d20b37a2aac858850acf0", size = 2180943, upload-time = "2025-06-15T15:29:46.27Z" }, + { url = "https://files.pythonhosted.org/packages/e1/92/44d959d1d097fb58e87a2d4a069aa90a4ef496b2be6be6a13cea6b8b80d4/lupa-2.5-cp311-cp311-win32.whl", hash = "sha256:a35c8fce1e71dd9b57486407f783de32fba938a62b806d1ebe747a5e0475958a", size = 1415064, upload-time = "2025-06-15T15:29:48.016Z" }, + { url = "https://files.pythonhosted.org/packages/22/06/5e19b6e323339bb5092f782aaadaf6f5e1c876939756fec5900a99e77a1f/lupa-2.5-cp311-cp311-win_amd64.whl", hash = "sha256:0148bd1a1391d9fa3d14e0e4a07118f4091bdde7eb49cad98d417b4e5368ab77", size = 1679502, upload-time = "2025-06-15T15:29:50.547Z" }, + { url = "https://files.pythonhosted.org/packages/a5/44/db0fa42b126ea416efd9ea031b5490c37d3ce1575c7253f5b91d47faede4/lupa-2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:85635865409320efa9e6b95f64176317a2409a3f4e261e032094c48f783eb5f5", size = 901972, upload-time = "2025-06-15T15:29:52.908Z" }, + { url = "https://files.pythonhosted.org/packages/81/96/2ff21f67532c3a2b4e15e6c108a589af26a111d9f58bee192b5cead3ad22/lupa-2.5-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:ec223da758c920f2e2b20e934a7761e233ad24121e6bba4708b7d3aafff9a513", size = 1851779, upload-time = "2025-06-15T15:29:55.105Z" }, + { url = "https://files.pythonhosted.org/packages/7e/da/815b6d7986d07f4ea1d992a31ba6bba8732dd5c1142311f24933aacc685b/lupa-2.5-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:7b3e258528a89f973a5e4a1b7d268a84bb1ae6e39912cfe5373c5a81ac8b82b6", size = 970599, upload-time = "2025-06-15T15:29:57.312Z" }, + { url = "https://files.pythonhosted.org/packages/75/ed/a62788e22fc2896f68bc3df9939ba505d94096acf337336aee8ce0fea3f4/lupa-2.5-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:07d91df9994c8a17e16d9923684ea810dfc2ecd290503e100a1525ed3aa48bc8", size = 1127171, upload-time = "2025-06-15T15:29:59.673Z" }, + { url = "https://files.pythonhosted.org/packages/a5/57/c88b8c99c552d025a28484cd91e3bd8e2132af64f62f543b98b3107fe35b/lupa-2.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d969ee4eed04788e7a5fa24613dcbc2fff6ef4e978a8ced4746f752092d70a9", size = 1041794, upload-time = "2025-06-15T15:30:01.944Z" }, + { url = "https://files.pythonhosted.org/packages/e6/6b/338c68c7df3b5ee03f79da32dc33bdd75c25ccc57e9204d31bd62515b271/lupa-2.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2478053c6d30b6c46a969a5ffb02181f35a5b299fc5481e99ba5ae63f0f0a63f", size = 2078068, upload-time = "2025-06-15T15:30:04.414Z" }, + { url = "https://files.pythonhosted.org/packages/53/35/2455e8fa0ad4f5a65003eb7e1efbdeada5c16ac13ce8fb31d95a2c1f4f1e/lupa-2.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e8bfdc69ebbc12271d9dfdb821bf6d5943887f8ec48acc4b19516c0a21bf98cf", size = 1060907, upload-time = "2025-06-15T15:30:06.65Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/44/38970fd1ff2069e3f3b87fa4817d5ce99272c3be49d31ca089603250cd79/lupa-2.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:953aa2bb0649ed08df911a001d845626d72b31e108da38701ed1116c22b3768f", size = 1175819, upload-time = "2025-06-15T15:30:08.784Z" }, + { url = "https://files.pythonhosted.org/packages/2e/0e/2cd499ac942aed285fc5be3ec7e2372a314cb83e7d17b596a9f855db98f2/lupa-2.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b4ed0d6dfb7246bc5c85f998c3507b0bd1b16553924eaf0834c4d896a60ee0cd", size = 2170558, upload-time = "2025-06-15T15:30:11.242Z" }, + { url = "https://files.pythonhosted.org/packages/7e/19/cb4d6bb92cf10a0f3824883a1382128198d7fd92b61665b9833af8d59fab/lupa-2.5-cp312-cp312-win32.whl", hash = "sha256:56e7e29980635a34422b9bc074015c3fc0a1ed354df6866ed27092b385b06839", size = 1423324, upload-time = "2025-06-15T15:30:13.485Z" }, + { url = "https://files.pythonhosted.org/packages/89/d7/98c42bf67692ce59fb8e5b32d6479042a8b4fd031579ea2969aeddf30621/lupa-2.5-cp312-cp312-win_amd64.whl", hash = "sha256:e51b0d1dee87a95f54b35f376a6eaa1143147ce3c5d89ba027772fb327555db6", size = 1698790, upload-time = "2025-06-15T15:30:15.885Z" }, + { url = "https://files.pythonhosted.org/packages/62/82/e535e181d751cbd0cc8465c0c3f8f162d8cd69ba075c9916912b6a7c5830/lupa-2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:61f09dbb8af779d78f90d71811798710a29b455c6948ea51365eefc0ab142a0d", size = 897899, upload-time = "2025-06-15T15:30:17.572Z" }, + { url = "https://files.pythonhosted.org/packages/17/9f/2b16dfd4bea0effa256e51b446a543c1d95b059bd7217a671c8b3c0adec5/lupa-2.5-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:e372577ac3b54a4d13d43e43de2111ad48b6fabb8f7545f40bcd989e6c13b128", size = 1841869, upload-time = "2025-06-15T15:30:19.635Z" }, + { url = "https://files.pythonhosted.org/packages/19/34/c0219eaf75e60777973a1bc317483f6569b7e662e8c0295cf667dfa39ef1/lupa-2.5-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:671c7c38197a2d11040bb0e05593063ee62b29a67c982dda688bb2ef30b81670", size = 965235, upload-time = "2025-06-15T15:30:21.93Z" }, + { url = "https://files.pythonhosted.org/packages/fe/a7/54df82f921088ba125a48d1cb74778200013ec3814f5d45a63e92832405c/lupa-2.5-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:5cfbbec4cb74ad70b5d1747de1537095e21cb57ca70d2a6186738322d82cf917", size = 1120634, upload-time = "2025-06-15T15:30:24.454Z" }, + { url = "https://files.pythonhosted.org/packages/94/72/31e52086abcc34e1787ecbf7b913c039551dd42f8fa9610ae0f64bc36fb0/lupa-2.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:867430dde43c8cf463cd72d03b07a9158f2bee79bbdae08b0fb1e6e32982853e", size = 1037562, upload-time = "2025-06-15T15:30:26.535Z" }, + { url = "https://files.pythonhosted.org/packages/7f/31/96305ed6e702f9b22859c4422aa258d33f005546b8198b954bcb6c2294c9/lupa-2.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4326c120ae18de3fed09ea37be792a568a8433c7f3e01e0c7e32f883d98fc5a5", size = 2072701, upload-time = "2025-06-15T15:30:28.925Z" }, + { url = "https://files.pythonhosted.org/packages/37/9b/6440cde2f09b83c42ed3c6f121d0b8b9cf39f539960226600c8f72e2556f/lupa-2.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:78308d2ea41e2fae47659fe671725d5346d61d9d894d22b36e427f259b5a0cf1", size = 1057425, upload-time = "2025-06-15T15:30:30.988Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/f4/9f5fb25df939408bac4b088c1ca42c96fb1606ac40aa1c6c28cc9f43e585/lupa-2.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5a9c12e74faaea60ae50a6d2670eb7e7cfc0b036045912bb37a15753a702fc28", size = 1172440, upload-time = "2025-06-15T15:30:33.147Z" }, + { url = "https://files.pythonhosted.org/packages/d3/3a/adebc756f088416c276359978a982a00b755aa3f23def7f7f6ec1d023105/lupa-2.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2a848ed378fbfcf735780116265bd2e68600691efefb4f7ff326a4ac089189d5", size = 2166142, upload-time = "2025-06-15T15:30:35.232Z" }, + { url = "https://files.pythonhosted.org/packages/6c/8a/fedac88be2af4bdf7e981d9eecfb2defe612004a22fb745978f68d9ddbee/lupa-2.5-cp313-cp313-win32.whl", hash = "sha256:1ea65fb8046bf2c7cf39dfb3677ce5e25d5ea1330e7f9bce9b274fcdf55db29b", size = 1422328, upload-time = "2025-06-15T15:30:37.057Z" }, + { url = "https://files.pythonhosted.org/packages/5d/38/9572b19463ee5aebcb4a503dee7d9f908179ee1b80e6dfc21300b156ee04/lupa-2.5-cp313-cp313-win_amd64.whl", hash = "sha256:e8d52999947d3d09c1dd2cf572cfb90a0ced3185f702e75f4b1a3ba4276b3c97", size = 1694135, upload-time = "2025-06-15T15:30:38.944Z" }, +] + [[package]] name = "lxml" version = "6.0.1" @@ -2184,6 +2275,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "ply" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e5/69/882ee5c9d017149285cab114ebeab373308ef0f874fcdac9beb90e0ac4da/ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3", size = 159130, upload-time = "2018-02-15T19:01:31.097Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/58/35da89ee790598a0700ea49b2a66594140f44dec458c07e8e3d4979137fc/ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce", size = 49567, upload-time = "2018-02-15T19:01:27.172Z" }, +] + [[package]] name = "pre-commit" version = "4.3.0" @@ -2507,6 +2607,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] +[[package]] +name = "pyprobables" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/dd/f2ed41047a0745f42c03da6c1730c11b17d7739f4c218df0e26ce506c52e/pyprobables-0.6.1.tar.gz", hash = "sha256:64b4d165d51beff05e716c01231c8a5503297844e58adee8771e5e7af130321d", size = 36043, upload-time = "2024-12-20T22:06:17.437Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/f0/1c11cc56aeedfa9e61f3b34aeba18628a9b956153aed3df015aff5520ea7/pyprobables-0.6.1-py3-none-any.whl", hash = "sha256:090d0c973f9e160f15927e8eb911dabf126285a7a1ecd478b7a9e04149e28392", size = 42421, upload-time = "2024-12-20T22:06:15.948Z" }, +] + [[package]] name = "pyproject-hooks" version = "1.2.0" @@ -2972,6 +3081,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "soupsieve" version = "2.8" From 997fca361bd01c060b7e39515b2755da081b08e7 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sat, 13 Sep 2025 11:52:29 +0000 Subject: [PATCH 03/12] add support for NDU storages --- .../storage_clients/_redis/_dataset_client.py | 8 +++-- .../_redis/_key_value_store_client.py | 8 +++-- .../_redis/_request_queue_client.py | 9 ++--- .../storage_clients/_redis/_storage_client.py | 36 ++++--------------- 4 files changed, 21 insertions(+), 40 deletions(-) diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 8b42fe257a..e246a4bb8a 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -63,6 +63,7 @@ async def open( *, id: str | None, name: str | None, + alias: str | None, redis: Redis, ) -> RedisDatasetClient: """Open or create a new Redis dataset client. @@ -73,18 +74,19 @@ async def open( Args: id: The ID of the dataset. If not provided, a random ID will be generated. - name: The name of the dataset. If not provided, the dataset will be unnamed. + name: The name of the dataset for named (global scope) storages. + alias: The alias of the dataset for unnamed (run scope) storages. redis: Redis client instance. Returns: An instance for the opened or created storage client. 
""" + search_name = name or alias or cls._DEFAULT_NAME if id: dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) if dataset_name is None: raise ValueError(f'Dataset with ID "{id}" does not exist.') else: - search_name = name or cls._DEFAULT_NAME metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) dataset_name = search_name if metadata_data is not None else None if dataset_name: @@ -101,7 +103,7 @@ async def open( modified_at=now, item_count=0, ) - dataset_name = name or cls._DEFAULT_NAME + dataset_name = name or alias or cls._DEFAULT_NAME client = cls(dataset_name=dataset_name, redis=redis) await client._create_metadata_and_storage(metadata.model_dump()) return client diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index 3bcdd59df3..d23578202a 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -64,6 +64,7 @@ async def open( *, id: str | None, name: str | None, + alias: str | None, redis: Redis, ) -> RedisKeyValueStoreClient: """Open or create a new Redis dataset client. @@ -74,18 +75,19 @@ async def open( Args: id: The ID of the dataset. If not provided, a random ID will be generated. - name: The name of the dataset. If not provided, the dataset will be unnamed. + name: The name of the dataset for named (global scope) storages. + alias: The alias of the dataset for unnamed (run scope) storages. redis: Redis client instance. Returns: An instance for the opened or created storage client. """ + search_name = name or alias or cls._DEFAULT_NAME if id: dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) if dataset_name is None: raise ValueError(f'Dataset with ID "{id}" does not exist.') else: - search_name = name or cls._DEFAULT_NAME metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) dataset_name = search_name if metadata_data is not None else None if dataset_name: @@ -101,7 +103,7 @@ async def open( accessed_at=now, modified_at=now, ) - dataset_name = name or cls._DEFAULT_NAME + dataset_name = name or alias or cls._DEFAULT_NAME client = cls(dataset_name=dataset_name, redis=redis) await client._create_metadata_and_storage(metadata.model_dump()) return client diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py index ee1db38f16..11c68bc9c7 100644 --- a/src/crawlee/storage_clients/_redis/_request_queue_client.py +++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py @@ -94,6 +94,7 @@ async def open( *, id: str | None, name: str | None, + alias: str | None, redis: Redis, ) -> RedisRequestQueueClient: """Open or create a new memory request queue client. @@ -104,19 +105,19 @@ async def open( Args: id: The ID of the request queue. If not provided, a random ID will be generated. - name: The name of the request queue. If not provided, the queue will be unnamed. + name: The name of the dataset for named (global scope) storages. + alias: The alias of the dataset for unnamed (run scope) storages. redis: Redis client instance. Returns: An instance for the opened or created storage client. 
""" - # Otherwise create a new queue + search_name = name or alias or cls._DEFAULT_NAME if id: dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) if dataset_name is None: raise ValueError(f'Dataset with ID "{id}" does not exist.') else: - search_name = name or cls._DEFAULT_NAME metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) dataset_name = search_name if metadata_data is not None else None if dataset_name: @@ -136,7 +137,7 @@ async def open( pending_request_count=0, total_request_count=0, ) - dataset_name = name or cls._DEFAULT_NAME + dataset_name = name or alias or cls._DEFAULT_NAME client = cls(dataset_name=dataset_name, redis=redis) with suppress(ResponseError): await client._create_metadata_and_storage(metadata.model_dump()) diff --git a/src/crawlee/storage_clients/_redis/_storage_client.py b/src/crawlee/storage_clients/_redis/_storage_client.py index 860738b3da..9022699b3f 100644 --- a/src/crawlee/storage_clients/_redis/_storage_client.py +++ b/src/crawlee/storage_clients/_redis/_storage_client.py @@ -59,23 +59,15 @@ async def create_dataset_client( *, id: str | None = None, name: str | None = None, + alias: str | None = None, configuration: Configuration | None = None, ) -> RedisDatasetClient: - """Create or open a Redis dataset client. - - Args: - id: Specific dataset ID to open. If provided, name is ignored. - name: Dataset name to open or create. Uses 'default' if not specified. - configuration: Configuration object. Uses global config if not provided. - - Returns: - Configured dataset client ready for use. - """ configuration = configuration or Configuration.get_global_configuration() client = await RedisDatasetClient.open( id=id, name=name, + alias=alias, redis=self._redis, ) @@ -88,23 +80,15 @@ async def create_kvs_client( *, id: str | None = None, name: str | None = None, + alias: str | None = None, configuration: Configuration | None = None, ) -> RedisKeyValueStoreClient: - """Create or open a SQL key-value store client. - - Args: - id: Specific store ID to open. If provided, name is ignored. - name: Store name to open or create. Uses 'default' if not specified. - configuration: Configuration object. Uses global config if not provided. - - Returns: - Configured key-value store client ready for use. - """ configuration = configuration or Configuration.get_global_configuration() client = await RedisKeyValueStoreClient.open( id=id, name=name, + alias=alias, redis=self._redis, ) @@ -117,23 +101,15 @@ async def create_rq_client( *, id: str | None = None, name: str | None = None, + alias: str | None = None, configuration: Configuration | None = None, ) -> RedisRequestQueueClient: - """Create or open a SQL request queue client. - - Args: - id: Specific queue ID to open. If provided, name is ignored. - name: Queue name to open or create. Uses 'default' if not specified. - configuration: Configuration object. Uses global config if not provided. - - Returns: - Configured request queue client ready for use. 
- """ configuration = configuration or Configuration.get_global_configuration() client = await RedisRequestQueueClient.open( id=id, name=name, + alias=alias, redis=self._redis, ) From 3c1aeedaa59fee7052d5f87a72a51a0251e3c724 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sat, 13 Sep 2025 23:46:37 +0000 Subject: [PATCH 04/12] clean code --- .../storage_clients/_redis/_client_mixin.py | 200 ++++++++++++- .../storage_clients/_redis/_dataset_client.py | 113 +++----- .../_redis/_key_value_store_client.py | 130 +++------ .../_redis/_request_queue_client.py | 274 ++++++++---------- .../lua_scripts/reclaim_stale_requests.lua | 6 +- 5 files changed, 379 insertions(+), 344 deletions(-) diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py index 021aa4da34..5d0764d908 100644 --- a/src/crawlee/storage_clients/_redis/_client_mixin.py +++ b/src/crawlee/storage_clients/_redis/_client_mixin.py @@ -1,11 +1,14 @@ from __future__ import annotations import asyncio -from abc import ABC +from abc import ABC, abstractmethod from contextlib import asynccontextmanager +from datetime import datetime, timezone from logging import getLogger from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, TypedDict +from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, overload + +from crawlee._utils.crypto import crypto_random_object_id from ._utils import await_redis_response, read_lua_script @@ -15,7 +18,9 @@ from redis.asyncio import Redis from redis.asyncio.client import Pipeline from redis.commands.core import AsyncScript - from typing_extensions import NotRequired + from typing_extensions import NotRequired, Self + + from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata logger = getLogger(__name__) @@ -39,12 +44,32 @@ class RedisClientMixin(ABC): _MAIN_KEY: ClassVar[str] - def __init__(self, *, storage_name: str, redis: Redis) -> None: + _CLIENT_TYPE: ClassVar[str] + """Human-readable client type for error messages.""" + + def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: self._storage_name = storage_name + self._storage_id = storage_id self._redis = redis + self._scripts_loaded = False + + @property + def redis(self) -> Redis: + """Return the Redis client instance.""" + return self._redis + + @property + def metadata_key(self) -> str: + """Return the Redis key for the metadata of this storage.""" + return f'{self._MAIN_KEY}:{self._storage_name}:metadata' + @classmethod - async def _get_metadata_by_name(cls, name: str, redis: Redis) -> dict | None: + async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool = False) -> dict | None: + if with_wait: + await await_redis_response(redis.blpop([f'{cls._MAIN_KEY}:{name}:created_signal'], timeout=30)) + await await_redis_response(redis.lpush(f'{cls._MAIN_KEY}:{name}:created_signal', 1)) + response = await await_redis_response(redis.json().get(f'{cls._MAIN_KEY}:{name}:metadata')) data = response[0] if response is not None and isinstance(response, list) else response if data is not None and not isinstance(data, dict): @@ -55,6 +80,78 @@ async def _get_metadata_by_name(cls, name: str, redis: Redis) -> dict | None: async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None: return await await_redis_response(redis.get(f'{cls._MAIN_KEY}:id_to_name:{id}')) + @classmethod + async def _open( + cls, + *, + id: str | None, + name: str | None, + alias: str | None, + metadata_model: 
type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata], + redis: Redis, + extra_metadata_fields: dict[str, Any], + ) -> Self: + """Open or create a new Redis dataset client. + + This method creates a new Redis dataset instance. Unlike persistent storage implementations, Redis + datasets don't check for existing datasets with the same name or ID since all data exists only in memory + and is lost when the process terminates. + + Args: + id: The ID of the dataset. If not provided, a random ID will be generated. + name: The name of the dataset for named (global scope) storages. + alias: The alias of the dataset for unnamed (run scope) storages. + redis: Redis client instance. + metadata_model: Pydantic model for metadata validation. + extra_metadata_fields: Storage-specific metadata fields. + + Returns: + An instance for the opened or created storage client. + """ + internal_name = name or alias or cls._DEFAULT_NAME + storage_id: str | None = None + if id: + storage_name = await cls._get_metadata_name_by_id(id=id, redis=redis) + storage_id = id + if storage_name is None: + raise ValueError(f'Dataset with ID "{id}" does not exist.') + else: + metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis) + storage_name = internal_name if metadata_data is not None else None + storage_id = metadata_data['id'] if metadata_data is not None else None + if storage_name and storage_id: + client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis) + async with client._get_pipeline() as pipe: + await client._update_metadata(pipe, update_accessed_at=True) + else: + now = datetime.now(timezone.utc) + metadata = metadata_model( + id=crypto_random_object_id(), + name=name, + created_at=now, + accessed_at=now, + modified_at=now, + **extra_metadata_fields, + ) + client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis) + created = await client._create_metadata_and_storage(internal_name, metadata.model_dump()) + if not created: + metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True) + client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis) + + await client._ensure_scripts_loaded() + return client + + async def _load_scripts(self) -> None: + """Load Lua scripts in Redis.""" + return + + async def _ensure_scripts_loaded(self) -> None: + """Ensure Lua scripts are loaded in Redis.""" + if not self._scripts_loaded: + await self._load_scripts() + self._scripts_loaded = True + @asynccontextmanager async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pipeline]: """Create a new Redis pipeline for this storage.""" @@ -76,15 +173,94 @@ async def _create_script(self, script_name: str) -> AsyncScript: return self._redis.register_script(script_content) - async def _create_metadata_and_storage(self, metadata: dict) -> None: - metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' - index_id_to_name = f'{self._MAIN_KEY}:id_to_name:{metadata["id"]}' + async def _create_metadata_and_storage(self, storage_name: str, metadata: dict) -> bool: + index_id_to_name = f'{self._MAIN_KEY}:id_to_name' + index_name_to_id = f'{self._MAIN_KEY}:name_to_id' metadata['created_at'] = metadata['created_at'].isoformat() metadata['accessed_at'] = metadata['accessed_at'].isoformat() metadata['modified_at'] = metadata['modified_at'].isoformat() - name = metadata['name'] if metadata['name'] is not None else self._DEFAULT_NAME - # Use a transaction to ensure atomicity + + name_to_id = 
await await_redis_response(self._redis.hsetnx(index_id_to_name, storage_name, metadata['id'])) + if not name_to_id: + return False + async with self._get_pipeline() as pipe: - await await_redis_response(pipe.json().set(metadata_key, '$', metadata, nx=True)) - await await_redis_response(pipe.set(index_id_to_name, name, nx=True)) + await await_redis_response(pipe.hsetnx(index_name_to_id, metadata['id'], storage_name)) + await await_redis_response(pipe.json().set(self.metadata_key, '$', metadata)) + await await_redis_response(pipe.lpush(f'{self._MAIN_KEY}:{storage_name}:created_signal', 1)) + await self._create_storage(pipe) + + return True + + async def _drop(self, extra_keys: list[str]) -> None: + async with self._get_pipeline() as pipe: + await pipe.delete(self.metadata_key) + await pipe.delete(f'{self._MAIN_KEY}:id_to_name', self._storage_id) + await pipe.delete(f'{self._MAIN_KEY}:name_to_id', self._storage_name) + for key in extra_keys: + await pipe.delete(key) + + async def _purge(self, extra_keys: list[str], metadata_kwargs: dict) -> None: + async with self._get_pipeline() as pipe: + for key in extra_keys: + await pipe.delete(key) + await self._update_metadata(pipe, **metadata_kwargs) + await self._create_storage(pipe) + + @overload + async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ... + @overload + async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ... + @overload + async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ... + + async def _get_metadata( + self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata] + ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata: + """Retrieve client metadata.""" + metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) + if metadata_dict is None: + raise ValueError(f'{self._CLIENT_TYPE} with name "{self._storage_name}" does not exist.') + + return metadata_model.model_validate(metadata_dict) + + @abstractmethod + async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None: + """Pipeline operations storage-specific metadata updates. + + Must be implemented by concrete classes. + + Args: + pipeline: The Redis pipeline to use for the update. + **kwargs: Storage-specific update parameters. + """ + + async def _update_metadata( + self, + pipeline: Pipeline, + *, + update_accessed_at: bool = False, + update_modified_at: bool = False, + **kwargs: Any, + ) -> None: + """Update storage metadata combining common and specific fields. + + Args: + pipeline: The Redis pipeline to use for the update. + update_accessed_at: Whether to update accessed_at timestamp. + update_modified_at: Whether to update modified_at timestamp. + **kwargs: Additional arguments for _specific_update_metadata. 
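The HSETNX call above is the arbitration point for concurrent creation: exactly one caller registers the name, builds the metadata and storage structures, and pushes a token onto the `created_signal` list; every loser blocks on that list and re-arms it for the next waiter. The same pattern in isolation, sketched against a plain `redis.asyncio` client with the `dataset:` key prefix used in this patch:

    from redis.asyncio import Redis


    async def create_or_wait(redis: Redis, name: str, storage_id: str) -> bool:
        # HSETNX returns 1 only for the first caller; everyone else lost the race.
        won = await redis.hsetnx('dataset:id_to_name', name, storage_id)
        if won:
            # ... write metadata and storage structures in a pipeline here ...
            await redis.lpush(f'dataset:{name}:created_signal', 1)
            return True
        # Lost the race: block until the winner signals (up to 30 s), then push
        # the token back so any other waiter is released as well.
        await redis.blpop([f'dataset:{name}:created_signal'], timeout=30)
        await redis.lpush(f'dataset:{name}:created_signal', 1)
        return False

Note that the reverse index is written as a hash as well (`hsetnx(index_name_to_id, metadata['id'], storage_name)`), so the consistent lookup by ID under this layout is an HGET on that hash.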
+ """ + now = datetime.now(timezone.utc) + + if update_accessed_at: + await await_redis_response( + pipeline.json().set(self.metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True) + ) + if update_modified_at: + await await_redis_response( + pipeline.json().set(self.metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True) + ) + + await self._specific_update_metadata(pipeline, **kwargs) diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index e246a4bb8a..46250e1c1b 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -1,12 +1,10 @@ from __future__ import annotations -from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any, cast from typing_extensions import override -from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -39,23 +37,20 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin): _MAIN_KEY = 'dataset' - def __init__( - self, - dataset_name: str, - redis: Redis, - ) -> None: + _CLIENT_TYPE = 'Dataset' + """Human-readable client type for error messages.""" + + def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. Preferably use the `MemoryDatasetClient.open` class method to create a new instance. """ - super().__init__(storage_name=dataset_name, redis=redis) + super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) - @override - async def get_metadata(self) -> DatasetMetadata: - metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) - if metadata_dict is None: - raise ValueError(f'Dataset with name "{self._storage_name}" does not exist.') - return DatasetMetadata.model_validate(metadata_dict) + @property + def items_key(self) -> str: + """Return the Redis key for the items of this dataset.""" + return f'{self._MAIN_KEY}:{self._storage_name}:items' @classmethod async def open( @@ -81,57 +76,26 @@ async def open( Returns: An instance for the opened or created storage client. 
""" - search_name = name or alias or cls._DEFAULT_NAME - if id: - dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) - if dataset_name is None: - raise ValueError(f'Dataset with ID "{id}" does not exist.') - else: - metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) - dataset_name = search_name if metadata_data is not None else None - if dataset_name: - client = cls(dataset_name=dataset_name, redis=redis) - async with client._get_pipeline() as pipe: - await client._update_metadata(pipe, update_accessed_at=True) - else: - now = datetime.now(timezone.utc) - metadata = DatasetMetadata( - id=crypto_random_object_id(), - name=name, - created_at=now, - accessed_at=now, - modified_at=now, - item_count=0, - ) - dataset_name = name or alias or cls._DEFAULT_NAME - client = cls(dataset_name=dataset_name, redis=redis) - await client._create_metadata_and_storage(metadata.model_dump()) - return client + return await cls._open( + id=id, + name=name, + alias=alias, + redis=redis, + metadata_model=DatasetMetadata, + extra_metadata_fields={'item_count': 0}, + ) @override - async def _create_storage(self, pipeline: Pipeline) -> None: - items_key = f'{self._MAIN_KEY}:{self._storage_name}:items' - await await_redis_response(pipeline.json().set(items_key, '$', [])) + async def get_metadata(self) -> DatasetMetadata: + return await self._get_metadata(DatasetMetadata) @override async def drop(self) -> None: - storage_id = (await self.get_metadata()).id - async with self._get_pipeline() as pipe: - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:items') - await pipe.delete(f'{self._MAIN_KEY}:id_to_name:{storage_id}') + await self._drop(extra_keys=[self.items_key]) @override async def purge(self) -> None: - async with self._get_pipeline() as pipe: - await self._create_storage(pipe) - - await self._update_metadata( - pipe, - update_accessed_at=True, - update_modified_at=True, - new_item_count=0, - ) + await self._purge(extra_keys=[self.items_key], metadata_kwargs={'new_item_count': 0}) @override async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: @@ -140,7 +104,7 @@ async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: async with self._get_pipeline() as pipe: # Incorrect signature for args type in redis-py - pipe.json().arrappend(f'{self._MAIN_KEY}:{self._storage_name}:items', '$', *data) # type: ignore[arg-type] + pipe.json().arrappend(self.items_key, '$', *data) # type: ignore[arg-type] delta_item_count = len(data) await self._update_metadata( pipe, update_accessed_at=True, update_modified_at=True, delta_item_count=delta_item_count @@ -184,7 +148,6 @@ async def get_data( metadata = await self.get_metadata() total = metadata.item_count - items_key = f'{self._MAIN_KEY}:{self._storage_name}:items' json_path = '$' # Apply sorting and pagination @@ -206,7 +169,7 @@ async def get_data( if json_path == '$': json_path = '$[*]' - data = await await_redis_response(self._redis.json().get(items_key, json_path)) + data = await await_redis_response(self._redis.json().get(self.items_key, json_path)) if data is None: data = [] @@ -266,7 +229,6 @@ async def iterate_items( metadata = await self.get_metadata() total_items = metadata.item_count - items_key = f'{self._MAIN_KEY}:{self._storage_name}:items' # Calculate actual range based on parameters start_idx = offset @@ -292,7 +254,7 @@ async def iterate_items( json_path = f'$[{batch_start}:{batch_end}]' # 
Get batch of items - batch_items = await await_redis_response(self._redis.json().get(items_key, json_path)) + batch_items = await await_redis_response(self._redis.json().get(self.items_key, json_path)) # Handle case where batch_items might be None or not a list if batch_items is None: @@ -313,38 +275,29 @@ async def iterate_items( async with self._get_pipeline() as pipe: await self._update_metadata(pipe, update_accessed_at=True) - async def _update_metadata( + @override + async def _create_storage(self, pipeline: Pipeline) -> None: + await await_redis_response(pipeline.json().set(self.items_key, '$', [])) + + @override + async def _specific_update_metadata( self, pipeline: Pipeline, *, new_item_count: int | None = None, delta_item_count: int | None = None, - update_accessed_at: bool = False, - update_modified_at: bool = False, + **_kwargs: Any, ) -> None: """Update the dataset metadata with current information. Args: pipeline: The Redis pipeline to use for the update. new_item_count: If provided, update the item count to this value. - update_accessed_at: If True, update the `accessed_at` timestamp to the current time. - update_modified_at: If True, update the `modified_at` timestamp to the current time. delta_item_count: If provided, increment the item count by this value. """ - metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' - now = datetime.now(timezone.utc) - - if update_accessed_at: - await await_redis_response( - pipeline.json().set(metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True) - ) - if update_modified_at: - await await_redis_response( - pipeline.json().set(metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True) - ) if new_item_count is not None: await await_redis_response( - pipeline.json().set(metadata_key, '$.item_count', new_item_count, nx=False, xx=True) + pipeline.json().set(self.metadata_key, '$.item_count', new_item_count, nx=False, xx=True) ) elif delta_item_count is not None: - await await_redis_response(pipeline.json().numincrby(metadata_key, '$.item_count', delta_item_count)) + await await_redis_response(pipeline.json().numincrby(self.metadata_key, '$.item_count', delta_item_count)) diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index d23578202a..e9bbf69e23 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -1,13 +1,11 @@ from __future__ import annotations import json -from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any from typing_extensions import override -from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.file import infer_mime_type from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata @@ -40,23 +38,25 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin): _MAIN_KEY = 'key-value-store' - def __init__( - self, - dataset_name: str, - redis: Redis, - ) -> None: + _CLIENT_TYPE = 'Key-value store' + """Human-readable client type for error messages.""" + + def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. Preferably use the `MemoryDatasetClient.open` class method to create a new instance. 
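The reads above never pull the whole array: RedisJSON JSONPath slices select a window server-side, so `$[2:7]` returns items 2 through 6, `$[-5:]` the last five, and `$[*]` everything. A sketch of the batched window read that `iterate_items` performs:

    async def read_window(redis, items_key: str, start: int, end: int) -> list[dict]:
        # JSON.GET with a slice path returns only the requested window,
        # keeping payload sizes bounded for large datasets.
        batch = await redis.json().get(items_key, f'$[{start}:{end}]')
        return batch or []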
""" - super().__init__(storage_name=dataset_name, redis=redis) + super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) - @override - async def get_metadata(self) -> KeyValueStoreMetadata: - metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) - if metadata_dict is None: - raise ValueError(f'Dataset with name "{self._storage_name}" does not exist.') - return KeyValueStoreMetadata.model_validate(metadata_dict) + @property + def items_key(self) -> str: + """Return the Redis key for the items of this storage.""" + return f'{self._MAIN_KEY}:{self._storage_name}:items' + + @property + def metadata_items_key(self) -> str: + """Return the Redis key for the items metadata of this storage.""" + return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items' @classmethod async def open( @@ -82,51 +82,26 @@ async def open( Returns: An instance for the opened or created storage client. """ - search_name = name or alias or cls._DEFAULT_NAME - if id: - dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) - if dataset_name is None: - raise ValueError(f'Dataset with ID "{id}" does not exist.') - else: - metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) - dataset_name = search_name if metadata_data is not None else None - if dataset_name: - client = cls(dataset_name=dataset_name, redis=redis) - async with client._get_pipeline() as pipe: - await client._update_metadata(pipe, update_accessed_at=True) - else: - now = datetime.now(timezone.utc) - metadata = KeyValueStoreMetadata( - id=crypto_random_object_id(), - name=name, - created_at=now, - accessed_at=now, - modified_at=now, - ) - dataset_name = name or alias or cls._DEFAULT_NAME - client = cls(dataset_name=dataset_name, redis=redis) - await client._create_metadata_and_storage(metadata.model_dump()) - return client + return await cls._open( + id=id, + name=name, + alias=alias, + redis=redis, + metadata_model=KeyValueStoreMetadata, + extra_metadata_fields={}, + ) + + @override + async def get_metadata(self) -> KeyValueStoreMetadata: + return await self._get_metadata(KeyValueStoreMetadata) @override async def drop(self) -> None: - storage_id = (await self.get_metadata()).id - async with self._get_pipeline() as pipe: - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:items') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items') - await pipe.delete(f'{self._MAIN_KEY}:id_to_name:{storage_id}') + await self._drop(extra_keys=[self.items_key, self.metadata_items_key]) @override async def purge(self) -> None: - async with self._get_pipeline() as pipe: - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:items') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items') - await self._update_metadata( - pipe, - update_accessed_at=True, - update_modified_at=True, - ) + await self._purge(extra_keys=[self.items_key, self.metadata_items_key], metadata_kwargs={}) @override async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: @@ -157,11 +132,11 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No async with self._get_pipeline() as pipe: # redis-py typing issue - await await_redis_response(pipe.hset(f'{self._MAIN_KEY}:{self._storage_name}:items', key, value_bytes)) # type: ignore[arg-type] + await await_redis_response(pipe.hset(self.items_key, key, value_bytes)) # type: 
ignore[arg-type] await await_redis_response( pipe.hset( - f'{self._MAIN_KEY}:{self._storage_name}:metadata_items', + self.metadata_items_key, key, item_metadata.model_dump_json(), ) @@ -170,9 +145,7 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: - serialized_metadata_item = await await_redis_response( - self._redis.hget(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items', key) - ) + serialized_metadata_item = await await_redis_response(self._redis.hget(self.metadata_items_key, key)) if not isinstance(serialized_metadata_item, (str, bytes, bytearray)): logger.warning(f'Metadata for key "{key}" is missing or invalid.') @@ -187,7 +160,7 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: # Query the record by key # redis-py typing issue value_bytes: bytes | None = await await_redis_response( - self._redis.hget(f'{self._MAIN_KEY}:{self._storage_name}:items', key) # type: ignore[arg-type] + self._redis.hget(self.items_key, key) # type: ignore[arg-type] ) if value_bytes is None: @@ -217,8 +190,8 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: @override async def delete_value(self, *, key: str) -> None: async with self._get_pipeline() as pipe: - await await_redis_response(pipe.hdel(f'{self._MAIN_KEY}:{self._storage_name}:items', key)) - await await_redis_response(pipe.hdel(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items', key)) + await await_redis_response(pipe.hdel(self.items_key, key)) + await await_redis_response(pipe.hdel(self.metadata_items_key, key)) await self._update_metadata(pipe, update_accessed_at=True, update_modified_at=True) @override @@ -228,9 +201,7 @@ async def iterate_keys( exclusive_start_key: str | None = None, limit: int | None = None, ) -> AsyncIterator[KeyValueStoreRecordMetadata]: - items_data = await await_redis_response( - self._redis.hgetall(f'{self._MAIN_KEY}:{self._storage_name}:metadata_items') - ) + items_data = await await_redis_response(self._redis.hgetall(self.metadata_items_key)) if not items_data: return # No items to iterate over @@ -268,7 +239,7 @@ async def get_public_url(self, *, key: str) -> str: @override async def record_exists(self, *, key: str) -> bool: async with self._get_pipeline(with_execute=False) as pipe: - await await_redis_response(pipe.hexists(f'{self._MAIN_KEY}:{self._storage_name}:items', key)) + await await_redis_response(pipe.hexists(self.items_key, key)) await self._update_metadata( pipe, update_accessed_at=True, @@ -277,28 +248,7 @@ async def record_exists(self, *, key: str) -> bool: return bool(results[0]) - async def _update_metadata( - self, - pipeline: Pipeline, - *, - update_accessed_at: bool = False, - update_modified_at: bool = False, - ) -> None: - """Update the dataset metadata with current information. - - Args: - pipeline: The Redis pipeline to use for the update. - update_accessed_at: If True, update the `accessed_at` timestamp to the current time. - update_modified_at: If True, update the `modified_at` timestamp to the current time. 
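Each record is therefore split across two hashes keyed by the record key: the raw bytes in `items`, and a JSON document describing them (content type and so on) in `metadata_items`. A sketch of reading one record back under that layout; `read_record` is a hypothetical helper:

    import json


    async def read_record(redis, store: str, key: str) -> tuple[bytes, dict] | None:
        raw = await redis.hget(f'key-value-store:{store}:items', key)
        meta = await redis.hget(f'key-value-store:{store}:metadata_items', key)
        if raw is None or meta is None:
            return None
        # The metadata document mirrors KeyValueStoreRecordMetadata,
        # e.g. {"key": ..., "content_type": ...}.
        return raw, json.loads(meta)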
- """ - metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' - now = datetime.now(timezone.utc) - - if update_accessed_at: - await await_redis_response( - pipeline.json().set(metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True) - ) - if update_modified_at: - await await_redis_response( - pipeline.json().set(metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True) - ) + @override + async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None: + # No specific fields to update for Redis key-value stores. + return diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py index 11c68bc9c7..55099c7938 100644 --- a/src/crawlee/storage_clients/_redis/_request_queue_client.py +++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py @@ -2,12 +2,10 @@ import json from collections import deque -from contextlib import suppress from datetime import datetime, timezone from logging import getLogger -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any -from redis.exceptions import ResponseError from typing_extensions import override from crawlee import Request @@ -39,24 +37,23 @@ class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin): data sharing across different processes. """ - _MAX_BATCH_FETCH_SIZE = 10 - - _BLOCK_REQUEST_TIME = 300_000 # milliseconds - _DEFAULT_NAME = 'default' _MAIN_KEY = 'request_queue' - def __init__( - self, - dataset_name: str, - redis: Redis, - ) -> None: + _CLIENT_TYPE = 'Request queue' + """Human-readable client type for error messages.""" + + _MAX_BATCH_FETCH_SIZE = 10 + + _BLOCK_REQUEST_TIME = 300_000 # milliseconds + + def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. Preferably use the `MemoryDatasetClient.open` class method to create a new instance. 
""" - super().__init__(storage_name=dataset_name, redis=redis) + super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) self._pending_fetch_cache: deque[Request] = deque() """Cache for requests: ordered by sequence number.""" @@ -70,23 +67,30 @@ def __init__( self._add_requests_script: AsyncScript | None = None - self._scripts_loaded = False + @property + def added_filter_key(self) -> str: + """Return the Redis key for the added requests Bloom filter.""" + return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' - async def _ensure_scripts_loaded(self) -> None: - """Ensure Lua scripts are loaded in Redis.""" - if not self._scripts_loaded: - self._fetch_script = await self._create_script('atomic_fetch_request.lua') - self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua') - self._add_requests_script = await self._create_script('atomic_add_requests.lua') + @property + def handled_filter_key(self) -> str: + """Return the Redis key for the handled requests Bloom filter.""" + return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' - self._scripts_loaded = True + @property + def queue_key(self) -> str: + """Return the Redis key for the request queue.""" + return f'{self._MAIN_KEY}:{self._storage_name}:queue' - @override - async def get_metadata(self) -> RequestQueueMetadata: - metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) - if metadata_dict is None: - raise ValueError(f'Dataset with name "{self._storage_name}" does not exist.') - return RequestQueueMetadata.model_validate(metadata_dict) + @property + def data_key(self) -> str: + """Return the Redis key for the request data hash.""" + return f'{self._MAIN_KEY}:{self._storage_name}:data' + + @property + def in_progress_key(self) -> str: + """Return the Redis key for the in-progress requests hash.""" + return f'{self._MAIN_KEY}:{self._storage_name}:in_progress' @classmethod async def open( @@ -112,75 +116,52 @@ async def open( Returns: An instance for the opened or created storage client. 
""" - search_name = name or alias or cls._DEFAULT_NAME - if id: - dataset_name = await cls._get_metadata_name_by_id(id=id, redis=redis) - if dataset_name is None: - raise ValueError(f'Dataset with ID "{id}" does not exist.') - else: - metadata_data = await cls._get_metadata_by_name(name=search_name, redis=redis) - dataset_name = search_name if metadata_data is not None else None - if dataset_name: - client = cls(dataset_name=dataset_name, redis=redis) - async with client._get_pipeline() as pipe: - await client._update_metadata(pipe, update_accessed_at=True) - else: - now = datetime.now(timezone.utc) - metadata = RequestQueueMetadata( - id=crypto_random_object_id(), - name=name, - created_at=now, - accessed_at=now, - modified_at=now, - had_multiple_clients=False, - handled_request_count=0, - pending_request_count=0, - total_request_count=0, - ) - dataset_name = name or alias or cls._DEFAULT_NAME - client = cls(dataset_name=dataset_name, redis=redis) - with suppress(ResponseError): - await client._create_metadata_and_storage(metadata.model_dump()) - - await client._ensure_scripts_loaded() - return client + return await cls._open( + id=id, + name=name, + alias=alias, + redis=redis, + metadata_model=RequestQueueMetadata, + extra_metadata_fields={ + 'had_multiple_clients': False, + 'handled_request_count': 0, + 'pending_request_count': 0, + 'total_request_count': 0, + }, + ) @override - async def _create_storage(self, pipeline: Pipeline) -> None: - added_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' - handled_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' - await await_redis_response(pipeline.bf().create(added_bloom_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] - await await_redis_response(pipeline.bf().create(handled_bloom_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] + async def get_metadata(self) -> RequestQueueMetadata: + return await self._get_metadata(RequestQueueMetadata) @override async def drop(self) -> None: - storage_id = (await self.get_metadata()).id - async with self._get_pipeline() as pipe: - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:metadata') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:queue') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:data') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:in_progress') - await pipe.delete(f'{self._MAIN_KEY}:id_to_name:{storage_id}') + await self._drop( + extra_keys=[ + self.added_filter_key, + self.handled_filter_key, + self.queue_key, + self.data_key, + self.in_progress_key, + ] + ) @override async def purge(self) -> None: - async with self._get_pipeline() as pipe: - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:queue') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:data') - await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:in_progress') - - await self._create_storage(pipe) - - await self._update_metadata( - pipe, - update_accessed_at=True, - update_modified_at=True, - new_pending_request_count=0, - ) + await self._purge( + extra_keys=[ + self.added_filter_key, + self.handled_filter_key, + self.queue_key, 
+ self.data_key, + self.in_progress_key, + ], + metadata_kwargs={ + 'update_accessed_at': True, + 'update_modified_at': True, + 'new_pending_request_count': 0, + }, + ) @override async def add_batch_of_requests( @@ -198,16 +179,11 @@ async def add_batch_of_requests( delta_pending = 0 delta_total = 0 - added_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' - handled_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' - queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' - data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' - requests_by_unique_key = {req.unique_key: req for req in requests} unique_keys = list(requests_by_unique_key.keys()) async with self._get_pipeline(with_execute=False) as pipe: - await await_redis_response(pipe.bf().mexists(added_bloom_filter_key, *unique_keys)) # type: ignore[no-untyped-call] - await await_redis_response(pipe.bf().mexists(handled_bloom_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().mexists(self.added_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().mexists(self.handled_filter_key, *unique_keys)) # type: ignore[no-untyped-call] results = await pipe.execute() @@ -250,7 +226,7 @@ async def add_batch_of_requests( if new_unique_keys: script_results = await self._add_requests_script( - keys=[added_bloom_filter_key, queue_key, data_key], + keys=[self.added_filter_key, self.queue_key, self.data_key], args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)], ) actually_added = set(json.loads(script_results)) @@ -299,14 +275,10 @@ async def fetch_next_request(self) -> Request | None: if self._fetch_script is None: raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.') - queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' - in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' - data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' - blocked_until_timestamp = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME requests_json = await self._fetch_script( - keys=[queue_key, in_progress_key, data_key], + keys=[self.queue_key, self.in_progress_key, self.data_key], args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE], ) @@ -322,24 +294,9 @@ async def fetch_next_request(self) -> Request | None: return requests[0] - async def _reclaim_stale_requests(self) -> None: - # Mypy workaround - if self._reclaim_stale_script is None: - raise RuntimeError('Scripts not loaded. 
Call _ensure_scripts_loaded() before using the client.') - - in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' - queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' - data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' - - current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000) - - await self._reclaim_stale_script(keys=[in_progress_key, queue_key, data_key], args=[current_time]) - @override async def get_request(self, unique_key: str) -> Request | None: - data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' - - request_data = await await_redis_response(self._redis.hget(data_key, unique_key)) + request_data = await await_redis_response(self._redis.hget(self.data_key, unique_key)) if isinstance(request_data, (str, bytes, bytearray)): return Request.model_validate_json(request_data) @@ -349,20 +306,17 @@ async def get_request(self, unique_key: str) -> Request | None: @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: # Check if the request is in progress. - in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' - handled_bloom_filter_key = f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' - data_key = f'{self._MAIN_KEY}:{self._storage_name}:data' - check_in_progress = await await_redis_response(self._redis.hexists(in_progress_key, request.unique_key)) + check_in_progress = await await_redis_response(self._redis.hexists(self.in_progress_key, request.unique_key)) if not check_in_progress: logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.') return None async with self._get_pipeline() as pipe: - await await_redis_response(pipe.bf().add(handled_bloom_filter_key, request.unique_key)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().add(self.handled_filter_key, request.unique_key)) # type: ignore[no-untyped-call] - await await_redis_response(pipe.hdel(in_progress_key, request.unique_key)) - await await_redis_response(pipe.hdel(data_key, request.unique_key)) + await await_redis_response(pipe.hdel(self.in_progress_key, request.unique_key)) + await await_redis_response(pipe.hdel(self.data_key, request.unique_key)) await self._update_metadata( pipe, @@ -385,10 +339,7 @@ async def reclaim_request( *, forefront: bool = False, ) -> ProcessedRequest | None: - in_progress_key = f'{self._MAIN_KEY}:{self._storage_name}:in_progress' - queue_key = f'{self._MAIN_KEY}:{self._storage_name}:queue' - - check_in_progress = await await_redis_response(self._redis.hexists(in_progress_key, request.unique_key)) + check_in_progress = await await_redis_response(self._redis.hexists(self.in_progress_key, request.unique_key)) if not check_in_progress: logger.info(f'Reclaiming request {request.unique_key} that is not in progress.') return None @@ -401,15 +352,15 @@ async def reclaim_request( await await_redis_response( pipe.hset( - in_progress_key, + self.in_progress_key, request.unique_key, f'{{"client_id":"{self.client_key}","blocked_until_timestamp":{blocked_until_timestamp}}}', ) ) self._pending_fetch_cache.appendleft(request) else: - await await_redis_response(pipe.rpush(queue_key, request.unique_key)) - await await_redis_response(pipe.hdel(in_progress_key, request.unique_key)) + await await_redis_response(pipe.rpush(self.queue_key, request.unique_key)) + await await_redis_response(pipe.hdel(self.in_progress_key, request.unique_key)) await self._update_metadata( pipe, update_modified_at=True, @@ -436,12 +387,33 @@ async def is_empty(self) -> 
bool:
         return metadata.pending_request_count == 0
 
-    async def _update_metadata(
+    async def _load_scripts(self) -> None:
+        """Ensure Lua scripts are loaded in Redis."""
+        self._fetch_script = await self._create_script('atomic_fetch_request.lua')
+        self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua')
+        self._add_requests_script = await self._create_script('atomic_add_requests.lua')
+
+    @override
+    async def _create_storage(self, pipeline: Pipeline) -> None:
+        await await_redis_response(pipeline.bf().create(self.added_filter_key, 0.1e-7, 100000, expansion=10))  # type: ignore[no-untyped-call]
+        await await_redis_response(pipeline.bf().create(self.handled_filter_key, 0.1e-7, 100000, expansion=10))  # type: ignore[no-untyped-call]
+
+    async def _reclaim_stale_requests(self) -> None:
+        # Mypy workaround
+        if self._reclaim_stale_script is None:
+            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')
+
+        current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000)
+
+        await self._reclaim_stale_script(
+            keys=[self.in_progress_key, self.queue_key, self.data_key], args=[current_time]
+        )
+
+    @override
+    async def _specific_update_metadata(
         self,
         pipeline: Pipeline,
         *,
-        update_accessed_at: bool = False,
-        update_modified_at: bool = False,
         delta_handled_request_count: int | None = None,
         new_handled_request_count: int | None = None,
         delta_pending_request_count: int | None = None,
         new_pending_request_count: int | None = None,
         delta_total_request_count: int | None = None,
         new_total_request_count: int | None = None,
         update_had_multiple_clients: bool = False,
+        **_kwargs: Any,
     ) -> None:
-        """Update the request queue metadata with current information.
+        """Update the request queue metadata with current information.
 
         Args:
             pipeline: The Redis pipeline to use for the update.
-            update_accessed_at: If True, update the `accessed_at` timestamp to the current time.
-            update_modified_at: If True, update the `modified_at` timestamp to the current time.
             new_handled_request_count: If provided, update the handled_request_count to this value.
             new_pending_request_count: If provided, update the pending_request_count to this value.
             new_total_request_count: If provided, update the total_request_count to this value.
             delta_handled_request_count: If provided, add this value to the handled_request_count.
             delta_pending_request_count: If provided, add this value to the pending_request_count.
             delta_total_request_count: If provided, add this value to the total_request_count.
             update_had_multiple_clients: If True, set had_multiple_clients to True.
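Fetched requests are leased, not deleted: `fetch_next_request` pulls up to `_MAX_BATCH_FETCH_SIZE` entries in one script call, serves the surplus from the in-process deque, and records each entry in the `in_progress` hash together with the owning client and a deadline `_BLOCK_REQUEST_TIME` (300 s) ahead; the reclaim script returns any entry whose deadline has lapsed to the tail of the queue. The millisecond deadline arithmetic in isolation:

    from datetime import datetime, timezone

    BLOCK_REQUEST_TIME_MS = 300_000  # mirrors _BLOCK_REQUEST_TIME above


    def lease_deadline_ms() -> int:
        now_ms = int(datetime.now(tz=timezone.utc).timestamp() * 1000)
        return now_ms + BLOCK_REQUEST_TIME_MS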
""" - now = datetime.now(timezone.utc) - - metadata_key = f'{self._MAIN_KEY}:{self._storage_name}:metadata' - now = datetime.now(timezone.utc) - - if update_accessed_at: - await await_redis_response( - pipeline.json().set(metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True) - ) - if update_modified_at: - await await_redis_response( - pipeline.json().set(metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True) - ) if new_pending_request_count is not None: await await_redis_response( pipeline.json().set( - metadata_key, '$.pending_request_count', new_pending_request_count, nx=False, xx=True + self.metadata_key, '$.pending_request_count', new_pending_request_count, nx=False, xx=True ) ) elif delta_pending_request_count is not None: await await_redis_response( - pipeline.json().numincrby(metadata_key, '$.pending_request_count', delta_pending_request_count) + pipeline.json().numincrby(self.metadata_key, '$.pending_request_count', delta_pending_request_count) ) if new_handled_request_count is not None: await await_redis_response( pipeline.json().set( - metadata_key, '$.handled_request_count', new_handled_request_count, nx=False, xx=True + self.metadata_key, '$.handled_request_count', new_handled_request_count, nx=False, xx=True ) ) elif delta_handled_request_count is not None: await await_redis_response( - pipeline.json().numincrby(metadata_key, '$.handled_request_count', delta_handled_request_count) + pipeline.json().numincrby(self.metadata_key, '$.handled_request_count', delta_handled_request_count) ) if new_total_request_count is not None: await await_redis_response( - pipeline.json().set(metadata_key, '$.total_request_count', new_total_request_count, nx=False, xx=True) + pipeline.json().set( + self.metadata_key, '$.total_request_count', new_total_request_count, nx=False, xx=True + ) ) elif delta_total_request_count is not None: await await_redis_response( - pipeline.json().numincrby(metadata_key, '$.total_request_count', delta_total_request_count) + pipeline.json().numincrby(self.metadata_key, '$.total_request_count', delta_total_request_count) ) if update_had_multiple_clients: await await_redis_response( pipeline.json().set( - metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True + self.metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True ) ) diff --git a/src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua b/src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua index a6d9434e00..0cf169cfa6 100644 --- a/src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +++ b/src/crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua @@ -24,12 +24,8 @@ repeat -- Check if timed out if current_time > data.blocked_until_timestamp then -- Atomically remove from in_progress and add back to queue - req_obj = cjson.decode(redis.call('hget', data_key, unique_key) or '{}') redis.call('hdel', in_progress_key, unique_key) - if req_obj.forefront then - redis.call('lpush', queue_key, unique_key) - else - redis.call('rpush', queue_key, unique_key) + redis.call('rpush', queue_key, unique_key) count = count + 1 end end From 75d81d878d4e35750f16e40b862854feae610942 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 14 Sep 2025 01:33:13 +0000 Subject: [PATCH 05/12] up docs --- .../storage_clients/_redis/_client_mixin.py | 49 +++++--- .../storage_clients/_redis/_dataset_client.py | 66 +++++++--- .../_redis/_key_value_store_client.py | 58 +++++---- 
.../_redis/_request_queue_client.py | 116 +++++++++++++----- .../storage_clients/_redis/_storage_client.py | 28 +++-- 5 files changed, 220 insertions(+), 97 deletions(-) diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py index 5d0764d908..8458058fd3 100644 --- a/src/crawlee/storage_clients/_redis/_client_mixin.py +++ b/src/crawlee/storage_clients/_redis/_client_mixin.py @@ -31,18 +31,19 @@ class MetadataUpdateParams(TypedDict, total=False): update_accessed_at: NotRequired[bool] update_modified_at: NotRequired[bool] - force: NotRequired[bool] class RedisClientMixin(ABC): - """Mixin class for SQL clients. + """Mixin class for Redis clients. - This mixin provides common SQL operations and basic methods for SQL storage clients. + This mixin provides common Redis operations and basic methods for Redis storage clients. """ _DEFAULT_NAME = 'default' + """Default storage name in key prefix when none provided.""" _MAIN_KEY: ClassVar[str] + """Main Redis key prefix for this storage type.""" _CLIENT_TYPE: ClassVar[str] """Human-readable client type for error messages.""" @@ -66,8 +67,17 @@ def metadata_key(self) -> str: @classmethod async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool = False) -> dict | None: + """Retrieve metadata by storage name. + + Args: + name: The name of the storage. + redis: The Redis client instance. + with_wait: Whether to wait for the storage to be created if it doesn't exist. + """ if with_wait: + # Wait for the creation signal (max 30 seconds) await await_redis_response(redis.blpop([f'{cls._MAIN_KEY}:{name}:created_signal'], timeout=30)) + # Signal consumed, push it back for other waiters await await_redis_response(redis.lpush(f'{cls._MAIN_KEY}:{name}:created_signal', 1)) response = await await_redis_response(redis.json().get(f'{cls._MAIN_KEY}:{name}:metadata')) @@ -78,6 +88,12 @@ async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool @classmethod async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None: + """Retrieve storage name by ID from id_to_name index. + + Args: + id: The ID of the storage. + redis: The Redis client instance. + """ return await await_redis_response(redis.get(f'{cls._MAIN_KEY}:id_to_name:{id}')) @classmethod @@ -91,16 +107,12 @@ async def _open( redis: Redis, extra_metadata_fields: dict[str, Any], ) -> Self: - """Open or create a new Redis dataset client. - - This method creates a new Redis dataset instance. Unlike persistent storage implementations, Redis - datasets don't check for existing datasets with the same name or ID since all data exists only in memory - and is lost when the process terminates. + """Open or create a new Redis storage client. Args: - id: The ID of the dataset. If not provided, a random ID will be generated. - name: The name of the dataset for named (global scope) storages. - alias: The alias of the dataset for unnamed (run scope) storages. + id: The ID of the storage. If not provided, a random ID will be generated. + name: The name of the storage for named (global scope) storages. + alias: The alias of the storage for unnamed (run scope) storages. redis: Redis client instance. metadata_model: Pydantic model for metadata validation. extra_metadata_fields: Storage-specific metadata fields. 
@@ -110,19 +122,22 @@ async def _open( """ internal_name = name or alias or cls._DEFAULT_NAME storage_id: str | None = None + # Determine if storage exists by ID or name if id: storage_name = await cls._get_metadata_name_by_id(id=id, redis=redis) storage_id = id if storage_name is None: - raise ValueError(f'Dataset with ID "{id}" does not exist.') + raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" does not exist.') else: metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis) storage_name = internal_name if metadata_data is not None else None storage_id = metadata_data['id'] if metadata_data is not None else None + # If both storage_name and storage_id are found, open existing storage if storage_name and storage_id: client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis) async with client._get_pipeline() as pipe: await client._update_metadata(pipe, update_accessed_at=True) + # Otherwise, create a new storage else: now = datetime.now(timezone.utc) metadata = metadata_model( @@ -135,10 +150,12 @@ async def _open( ) client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis) created = await client._create_metadata_and_storage(internal_name, metadata.model_dump()) + # The client was probably not created due to a race condition. Let's try to open it using the name. if not created: metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True) client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis) + # Ensure Lua scripts are loaded await client._ensure_scripts_loaded() return client @@ -154,7 +171,7 @@ async def _ensure_scripts_loaded(self) -> None: @asynccontextmanager async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pipeline]: - """Create a new Redis pipeline for this storage.""" + """Create a new Redis pipeline.""" async with self._redis.pipeline() as pipe: try: pipe.multi() # type: ignore[no-untyped-call] @@ -164,6 +181,7 @@ async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pip await pipe.execute() async def _create_storage(self, pipeline: Pipeline) -> None: + """Create the actual storage structure in Redis.""" _pipeline = pipeline # To avoid unused variable mypy error async def _create_script(self, script_name: str) -> AsyncScript: @@ -180,10 +198,13 @@ async def _create_metadata_and_storage(self, storage_name: str, metadata: dict) metadata['accessed_at'] = metadata['accessed_at'].isoformat() metadata['modified_at'] = metadata['modified_at'].isoformat() + # Try to create name_to_id index entry, if it already exists, return False. name_to_id = await await_redis_response(self._redis.hsetnx(index_id_to_name, storage_name, metadata['id'])) + # If name already exists, return False. Probably an attempt at parallel creation. if not name_to_id: return False + # Create id_to_name index entry, metadata, and storage structure in a transaction. 
async with self._get_pipeline() as pipe: await await_redis_response(pipe.hsetnx(index_name_to_id, metadata['id'], storage_name)) await await_redis_response(pipe.json().set(self.metadata_key, '$', metadata)) @@ -201,7 +222,7 @@ async def _drop(self, extra_keys: list[str]) -> None: for key in extra_keys: await pipe.delete(key) - async def _purge(self, extra_keys: list[str], metadata_kwargs: dict) -> None: + async def _purge(self, extra_keys: list[str], metadata_kwargs: MetadataUpdateParams) -> None: async with self._get_pipeline() as pipe: for key in extra_keys: await pipe.delete(key) diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 46250e1c1b..3ef83bc439 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -3,12 +3,12 @@ from logging import getLogger from typing import TYPE_CHECKING, Any, cast -from typing_extensions import override +from typing_extensions import NotRequired, override from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata -from ._client_mixin import RedisClientMixin +from ._client_mixin import MetadataUpdateParams, RedisClientMixin from ._utils import await_redis_response if TYPE_CHECKING: @@ -20,22 +20,33 @@ logger = getLogger(__name__) +class _DatasetMetadataUpdateParams(MetadataUpdateParams): + """Parameters for updating dataset metadata.""" + + new_item_count: NotRequired[int] + delta_item_count: NotRequired[int] + + class RedisDatasetClient(DatasetClient, RedisClientMixin): - """Memory implementation of the dataset client. + """Redis implementation of the dataset client. + + This client persists dataset items to Redis using JSON arrays for efficient storage and retrieval. + Items are stored as JSON objects with automatic ordering preservation through Redis list operations. - This client stores dataset items in memory using Python lists and dictionaries. No data is persisted - between process runs, meaning all stored data is lost when the program terminates. This implementation - is primarily useful for testing, development, and short-lived crawler operations where persistent - storage is not required. + The dataset data is stored in Redis using the following key pattern: + - `dataset:{name}:items` - Redis JSON array containing all dataset items. + - `dataset:{name}:metadata` - Redis JSON object containing dataset metadata. - The memory implementation provides fast access to data but is limited by available memory and - does not support data sharing across different processes. It supports all dataset operations including - sorting, filtering, and pagination, but performs them entirely in memory. + Items must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset. + The item ordering is preserved through Redis JSON array operations. All operations provide atomic consistency + through Redis transactions and pipeline operations. """ _DEFAULT_NAME = 'default' + """Default Dataset name key prefix when none provided.""" _MAIN_KEY = 'dataset' + """Main Redis key prefix for Dataset.""" _CLIENT_TYPE = 'Dataset' """Human-readable client type for error messages.""" @@ -43,7 +54,12 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin): def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. 
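Typing `metadata_kwargs` as the `MetadataUpdateParams` TypedDict keeps these kwargs bundles checkable: subclasses such as `_DatasetMetadataUpdateParams` above extend the base shape with their own optional fields and splat the bundle into `_update_metadata`. The pattern in isolation, with hypothetical names:

    from typing_extensions import NotRequired, TypedDict


    class UpdateParams(TypedDict, total=False):
        update_accessed_at: NotRequired[bool]
        update_modified_at: NotRequired[bool]


    class DatasetUpdateParams(UpdateParams, total=False):
        new_item_count: NotRequired[int]


    params = DatasetUpdateParams(update_accessed_at=True, new_item_count=0)
    # Splatted as **params; the type checker validates the keys against the TypedDict.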
- Preferably use the `MemoryDatasetClient.open` class method to create a new instance.
+ Preferably use the `RedisDatasetClient.open` class method to create a new instance.
+
+ Args:
+     storage_name: Internal storage name used for Redis keys.
+     storage_id: Unique identifier for the dataset.
+     redis: Redis client instance.
 """
 super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)
@@ -63,9 +79,9 @@ async def open(
 ) -> RedisDatasetClient:
     """Open or create a new Redis dataset client.

-    This method creates a new Redis dataset instance. Unlike persistent storage implementations, Redis
-    datasets don't check for existing datasets with the same name or ID since all data exists only in memory
-    and is lost when the process terminates.
+    This method attempts to open an existing dataset from the Redis database. If a dataset with the specified
+    ID or name exists, it loads the metadata from the database. If no existing dataset is found, a new one
+    is created.

     Args:
         id: The ID of the dataset. If not provided, a random ID will be generated.
@@ -95,7 +111,12 @@ async def drop(self) -> None:

     @override
     async def purge(self) -> None:
-        await self._purge(extra_keys=[self.items_key], metadata_kwargs={'new_item_count': 0})
+        await self._purge(
+            extra_keys=[self.items_key],
+            metadata_kwargs=_DatasetMetadataUpdateParams(
+                new_item_count=0, update_accessed_at=True, update_modified_at=True
+            ),
+        )
@@ -107,7 +128,10 @@ async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
             pipe.json().arrappend(self.items_key, '$', *data)  # type: ignore[arg-type]
             delta_item_count = len(data)
             await self._update_metadata(
-                pipe, update_accessed_at=True, update_modified_at=True, delta_item_count=delta_item_count
+                pipe,
+                **_DatasetMetadataUpdateParams(
+                    update_accessed_at=True, update_modified_at=True, delta_item_count=delta_item_count
+                ),
             )
@@ -181,7 +205,7 @@ async def get_data(
             data = list(reversed(data))

         async with self._get_pipeline() as pipe:
-            await self._update_metadata(pipe, update_accessed_at=True)
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))

         return DatasetItemsListPage(
             count=len(data),
@@ -236,7 +260,7 @@ async def iterate_items(

         # Update accessed_at timestamp
         async with self._get_pipeline() as pipe:
-            await self._update_metadata(pipe, update_accessed_at=True)
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))

         # Process items in batches for better network efficiency
         batch_size = 100
@@ -273,10 +297,12 @@ async def iterate_items(
             yield cast('dict[str, Any]', item)

         async with self._get_pipeline() as pipe:
-            await self._update_metadata(pipe, update_accessed_at=True)
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))

     @override
     async def _create_storage(self, pipeline: Pipeline) -> None:
+        """Create the main dataset keys in Redis."""
+        # Create an empty JSON array for items
         await await_redis_response(pipeline.json().set(self.items_key, '$', []))

     @override
@@ -288,7 +314,7 @@ async def _specific_update_metadata(
         delta_item_count: int | None = None,
         **_kwargs: Any,
     ) -> None:
-        """Update the dataset metadata with current information.
+        """Update the dataset metadata in the database.

         Args:
             pipeline: The Redis pipeline to use for the update.
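The dataset client above rests on three RedisJSON calls: `JSON.SET` of an empty array at creation, `JSON.ARRAPPEND` for batched pushes, and JSON-path slices for paging. A minimal standalone sketch of that access pattern using redis-py's asyncio API; the key name and connection URL are illustrative, not the client's real keys:

```python
import asyncio

from redis.asyncio import Redis


async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379')  # illustrative URL

    # One JSON array per dataset, created empty up front.
    await redis.json().set('example:items', '$', [])

    # JSON.ARRAPPEND accepts multiple values, so a batched push is one round trip.
    await redis.json().arrappend('example:items', '$', {'id': 1}, {'id': 2}, {'id': 3})

    # Offset/limit pagination maps onto a JSON-path slice, so only the requested
    # elements cross the network.
    page = await redis.json().get('example:items', '$[0:2]')
    print(page)  # [{'id': 1}, {'id': 2}]

    await redis.aclose()


asyncio.run(main())
```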
diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index e9bbf69e23..05af26baf8 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -10,7 +10,7 @@ from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata -from ._client_mixin import RedisClientMixin +from ._client_mixin import MetadataUpdateParams, RedisClientMixin from ._utils import await_redis_response if TYPE_CHECKING: @@ -23,20 +23,29 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin): - """Memory implementation of the key-value store client. + """Redis implementation of the key-value store client. - This client stores data in memory as Python dictionaries. No data is persisted between - process runs, meaning all stored data is lost when the program terminates. This implementation - is primarily useful for testing, development, and short-lived crawler operations where - persistence is not required. + This client persists key-value data to Redis using hash data structures for efficient storage and retrieval. + Keys are mapped to values with automatic content type detection and size tracking for metadata management. - The memory implementation provides fast access to data but is limited by available memory and - does not support data sharing across different processes. + The key-value store data is stored in Redis using the following key pattern: + - `key-value-store:{name}:items` - Redis hash containing key-value pairs (values stored as binary data). + - `key-value-store:{name}:metadata_items` - Redis hash containing metadata for each key. + - `key-value-store:{name}:metadata` - Redis JSON object containing store metadata. + + Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings, + text values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles + content type detection and maintains metadata about each record including size and MIME type information. + + All operations are atomic through Redis hash operations and pipeline transactions. The client supports + concurrent access through Redis's built-in atomic operations for hash fields. """ _DEFAULT_NAME = 'default' + """Default Key-Value Store name key prefix when none provided.""" _MAIN_KEY = 'key-value-store' + """Main Redis key prefix for Key-Value Store.""" _CLIENT_TYPE = 'Key-value store' """Human-readable client type for error messages.""" @@ -44,18 +53,18 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin): def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. - Preferably use the `MemoryDatasetClient.open` class method to create a new instance. + Preferably use the `RedisKeyValueStoreClient.open` class method to create a new instance. 
""" super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) @property def items_key(self) -> str: - """Return the Redis key for the items of this storage.""" + """Return the Redis key for the items of KVS.""" return f'{self._MAIN_KEY}:{self._storage_name}:items' @property def metadata_items_key(self) -> str: - """Return the Redis key for the items metadata of this storage.""" + """Return the Redis key for the items metadata of KVS.""" return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items' @classmethod @@ -67,16 +76,16 @@ async def open( alias: str | None, redis: Redis, ) -> RedisKeyValueStoreClient: - """Open or create a new Redis dataset client. + """Open or create a new Redis key-value store client. - This method creates a new Redis dataset instance. Unlike persistent storage implementations, Redis - datasets don't check for existing datasets with the same name or ID since all data exists only in memory - and is lost when the process terminates. + This method attempts to open an existing key-value store from the Redis database. If a store with the specified + ID or name exists, it loads the metadata from the database. If no existing store is found, a new one + is created. Args: - id: The ID of the dataset. If not provided, a random ID will be generated. - name: The name of the dataset for named (global scope) storages. - alias: The alias of the dataset for unnamed (run scope) storages. + id: The ID of the key-value store. If not provided, a random ID will be generated. + name: The name of the key-value store for named (global scope) storages. + alias: The alias of the key-value store for unnamed (run scope) storages. redis: Redis client instance. Returns: @@ -101,7 +110,10 @@ async def drop(self) -> None: @override async def purge(self) -> None: - await self._purge(extra_keys=[self.items_key, self.metadata_items_key], metadata_kwargs={}) + await self._purge( + extra_keys=[self.items_key, self.metadata_items_key], + metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True), + ) @override async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None: @@ -141,7 +153,7 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No item_metadata.model_dump_json(), ) ) - await self._update_metadata(pipe, update_accessed_at=True, update_modified_at=True) + await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: @@ -192,7 +204,7 @@ async def delete_value(self, *, key: str) -> None: async with self._get_pipeline() as pipe: await await_redis_response(pipe.hdel(self.items_key, key)) await await_redis_response(pipe.hdel(self.metadata_items_key, key)) - await self._update_metadata(pipe, update_accessed_at=True, update_modified_at=True) + await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)) @override async def iterate_keys( @@ -229,7 +241,7 @@ async def iterate_keys( async with self._get_pipeline() as pipe: await self._update_metadata( pipe, - update_accessed_at=True, + **MetadataUpdateParams(update_accessed_at=True), ) @override @@ -242,7 +254,7 @@ async def record_exists(self, *, key: str) -> bool: await await_redis_response(pipe.hexists(self.items_key, key)) await self._update_metadata( pipe, - update_accessed_at=True, + **MetadataUpdateParams(update_accessed_at=True), ) results = await pipe.execute() 
diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py
index 55099c7938..77e80c734a 100644
--- a/src/crawlee/storage_clients/_redis/_request_queue_client.py
+++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py
@@ -2,18 +2,18 @@

 import json
 from collections import deque
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING, Any

-from typing_extensions import override
+from typing_extensions import NotRequired, override

 from crawlee import Request
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata

-from ._client_mixin import RedisClientMixin
+from ._client_mixin import MetadataUpdateParams, RedisClientMixin
 from ._utils import await_redis_response

 if TYPE_CHECKING:
@@ -26,32 +26,65 @@

 logger = getLogger(__name__)


-class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
-    """Memory implementation of the request queue client.
+class _QueueMetadataUpdateParams(MetadataUpdateParams):
+    """Parameters for updating queue metadata."""
+
+    new_handled_request_count: NotRequired[int]
+    new_pending_request_count: NotRequired[int]
+    new_total_request_count: NotRequired[int]
+    delta_handled_request_count: NotRequired[int]
+    delta_pending_request_count: NotRequired[int]
+    delta_total_request_count: NotRequired[int]
+    recalculate: NotRequired[bool]
+    update_had_multiple_clients: NotRequired[bool]

-    No data is persisted between process runs, which means all requests are lost when the program terminates.
-    This implementation is primarily useful for testing, development, and short-lived crawler runs where
-    persistence is not required.

-    This client provides fast access to request data but is limited by available memory and does not support
-    data sharing across different processes.
+class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
+    """Redis implementation of the request queue client.
+
+    This client persists requests to Redis using multiple data structures for efficient queue operations,
+    deduplication, and concurrent access safety. Requests are stored with FIFO ordering and support
+    both regular and forefront (high-priority) insertion modes.
+
+    The implementation uses Bloom filters for efficient request deduplication and Redis lists for
+    queue operations. Request blocking and client coordination are handled through Redis hashes
+    with timestamp-based expiration for stale request recovery.
+
+    The request queue data is stored in Redis using the following key patterns:
+    - `request_queue:{name}:queue` - Redis list for FIFO request ordering
+    - `request_queue:{name}:data` - Redis hash storing serialized Request objects by unique_key
+    - `request_queue:{name}:in_progress` - Redis hash tracking requests currently being processed
+    - `request_queue:{name}:added_bloom_filter` - Bloom filter for added request deduplication
+    - `request_queue:{name}:handled_bloom_filter` - Bloom filter for completed request tracking
+    - `request_queue:{name}:metadata` - Redis JSON object containing queue metadata
+
+    Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list
+    operations.
The implementation provides concurrent access safety through atomic Lua scripts, + Bloom filter operations, and Redis's built-in atomicity guarantees for individual operations. """ _DEFAULT_NAME = 'default' + """Default Request Queue name key prefix when none provided.""" _MAIN_KEY = 'request_queue' + """Main Redis key prefix for Request Queue.""" _CLIENT_TYPE = 'Request queue' """Human-readable client type for error messages.""" _MAX_BATCH_FETCH_SIZE = 10 + """Maximum number of requests to fetch in a single batch operation.""" _BLOCK_REQUEST_TIME = 300_000 # milliseconds + """Time in milliseconds to block a fetched request before it can be reclaimed.""" + + _RECLAIM_INTERVAL = timedelta(seconds=30) + """Interval to check for stale requests to reclaim.""" def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: """Initialize a new instance. - Preferably use the `MemoryDatasetClient.open` class method to create a new instance. + Preferably use the `RedisRequestQueueClient.open` class method to create a new instance. """ super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) @@ -61,12 +94,13 @@ def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: self.client_key = crypto_random_object_id(length=32)[:32] """Unique identifier for this client instance.""" + # Lua scripts for atomic operations self._fetch_script: AsyncScript | None = None - self._reclaim_stale_script: AsyncScript | None = None - self._add_requests_script: AsyncScript | None = None + self._next_reclaim_stale: None | datetime = None + @property def added_filter_key(self) -> str: """Return the Redis key for the added requests Bloom filter.""" @@ -101,11 +135,11 @@ async def open( alias: str | None, redis: Redis, ) -> RedisRequestQueueClient: - """Open or create a new memory request queue client. + """Open or create a new Redis request queue client. - This method creates a new in-memory request queue instance. Unlike persistent storage implementations, - memory queues don't check for existing queues with the same name or ID since all data exists only - in memory and is lost when the process terminates. + This method attempts to open an existing request queue from the Redis database. If a queue with the specified + ID or name exists, it loads the metadata from the database. If no existing queue is found, a new one + is created. Args: id: The ID of the request queue. If not provided, a random ID will be generated. 
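The request-queue hunks that follow gate every incoming batch through the added/handled Bloom filters before touching the queue itself. A standalone sketch of that deduplication step, assuming a Redis 8 server with Bloom filter support; the key name and URLs are illustrative:

```python
import asyncio

from redis.asyncio import Redis


async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379')  # illustrative URL

    filter_key = 'example:added_bloom_filter'

    # BF.RESERVE once, with a target error rate and capacity; `expansion`
    # controls how the filter grows when the capacity is exceeded.
    await redis.bf().create(filter_key, 0.0000001, 100_000, expansion=10)
    await redis.bf().add(filter_key, 'https://example.com/a')

    # A single BF.MEXISTS round trip answers "already added?" for a whole batch.
    seen = await redis.bf().mexists(filter_key, 'https://example.com/a', 'https://example.com/b')
    print(seen)  # [1, 0] -> only the second URL still needs to be enqueued

    # Bloom filters never report false negatives, but may very rarely report a
    # false positive, i.e. a brand-new request can be skipped as a "duplicate";
    # a request is never enqueued twice because of the filter.

    await redis.aclose()


asyncio.run(main())
```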
@@ -156,11 +190,11 @@ async def purge(self) -> None: self.data_key, self.in_progress_key, ], - metadata_kwargs={ - 'update_accessed_at': True, - 'update_modified_at': True, - 'new_pending_request_count': 0, - }, + metadata_kwargs=_QueueMetadataUpdateParams( + update_accessed_at=True, + update_modified_at=True, + new_pending_request_count=0, + ), ) @override @@ -181,6 +215,7 @@ async def add_batch_of_requests( requests_by_unique_key = {req.unique_key: req for req in requests} unique_keys = list(requests_by_unique_key.keys()) + # Check which requests are already added or handled async with self._get_pipeline(with_execute=False) as pipe: await await_redis_response(pipe.bf().mexists(self.added_filter_key, *unique_keys)) # type: ignore[no-untyped-call] await await_redis_response(pipe.bf().mexists(self.handled_filter_key, *unique_keys)) # type: ignore[no-untyped-call] @@ -225,6 +260,7 @@ async def add_batch_of_requests( new_request_data[unique_key] = request.model_dump_json() if new_unique_keys: + # Add new requests to the queue atomically, get back which were actually added script_results = await self._add_requests_script( keys=[self.added_filter_key, self.queue_key, self.data_key], args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)], @@ -255,10 +291,12 @@ async def add_batch_of_requests( async with self._get_pipeline() as pipe: await self._update_metadata( pipe, - update_accessed_at=True, - update_modified_at=True, - delta_pending_request_count=delta_pending, - delta_total_request_count=delta_total, + **_QueueMetadataUpdateParams( + update_accessed_at=True, + update_modified_at=True, + delta_pending_request_count=delta_pending, + delta_total_request_count=delta_total, + ), ) return AddRequestsResponse( @@ -277,13 +315,14 @@ async def fetch_next_request(self) -> Request | None: blocked_until_timestamp = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME + # The script retrieves requests from the queue and places them in the in_progress hash. 
requests_json = await self._fetch_script( keys=[self.queue_key, self.in_progress_key, self.data_key], args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE], ) async with self._get_pipeline() as pipe: - await self._update_metadata(pipe, update_accessed_at=True) + await self._update_metadata(pipe, **_QueueMetadataUpdateParams(update_accessed_at=True)) if not requests_json: return None @@ -320,10 +359,12 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | await self._update_metadata( pipe, - update_accessed_at=True, - update_modified_at=True, - delta_handled_request_count=1, - delta_pending_request_count=-1, + **_QueueMetadataUpdateParams( + update_accessed_at=True, + update_modified_at=True, + delta_handled_request_count=1, + delta_pending_request_count=-1, + ), ) return ProcessedRequest( @@ -363,8 +404,10 @@ async def reclaim_request( await await_redis_response(pipe.hdel(self.in_progress_key, request.unique_key)) await self._update_metadata( pipe, - update_modified_at=True, - update_accessed_at=True, + **_QueueMetadataUpdateParams( + update_modified_at=True, + update_accessed_at=True, + ), ) return ProcessedRequest( @@ -383,6 +426,11 @@ async def is_empty(self) -> bool: if self._pending_fetch_cache: return False + # Reclaim stale requests if needed + if self._next_reclaim_stale is None or datetime.now(tz=timezone.utc) >= self._next_reclaim_stale: + await self._reclaim_stale_requests() + self._next_reclaim_stale = datetime.now(tz=timezone.utc) + self._RECLAIM_INTERVAL + metadata = await self.get_metadata() return metadata.pending_request_count == 0 @@ -395,10 +443,12 @@ async def _load_scripts(self) -> None: @override async def _create_storage(self, pipeline: Pipeline) -> None: + # Create Bloom filters for added and handled requests await await_redis_response(pipeline.bf().create(self.added_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] await await_redis_response(pipeline.bf().create(self.handled_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] async def _reclaim_stale_requests(self) -> None: + """Reclaim requests that have been in progress for too long.""" # Mypy workaround if self._reclaim_stale_script is None: raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.') diff --git a/src/crawlee/storage_clients/_redis/_storage_client.py b/src/crawlee/storage_clients/_redis/_storage_client.py index 9022699b3f..aed41779b1 100644 --- a/src/crawlee/storage_clients/_redis/_storage_client.py +++ b/src/crawlee/storage_clients/_redis/_storage_client.py @@ -1,5 +1,7 @@ from __future__ import annotations +import warnings + from redis.asyncio import Redis from typing_extensions import override @@ -17,13 +19,17 @@ class RedisStorageClient(StorageClient): """Redis implementation of the storage client. This storage client provides access to datasets, key-value stores, and request queues that persist data - to a Redis database. Each storage type uses a different key pattern to store and retrieve data. + to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for + efficient storage and retrieval. - The client accepts either a database connection string or a pre-configured AsyncEngine. If neither is - provided, it creates a default SQLite database 'crawlee.db' in the storage directory. + The client accepts either a Redis connection string or a pre-configured Redis client instance. 
+ Exactly one of these parameters must be provided during initialization.

-    Database schema is automatically created during initialization. SQLite databases receive performance
-    optimizations including WAL mode and increased cache size.
+    Storage types use the following Redis data structures:
+    - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects
+    - **Key-Value Stores**: Redis hashes for key-value pairs with separate metadata storage
+    - **Request Queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,
+      and Bloom filters for request deduplication

     Warning:
         This is an experimental feature. The behavior and interface may change in future versions.
@@ -35,10 +41,11 @@ def __init__(
         connection_string: str | None = None,
         redis: Redis | None = None,
     ) -> None:
-        """Initialize the SQL storage client.
+        """Initialize the Redis storage client.

         Args:
-            connection_string: Database connection string.
+            connection_string: Redis connection string (e.g., "redis://localhost:6379").
+                Supports standard Redis URL format with optional database selection.
             redis: Pre-configured Redis client instance.
         """
         if redis is not None and connection_string is not None:
@@ -53,6 +60,13 @@ def __init__(
         elif connection_string is not None:
            self._redis = Redis.from_url(connection_string)

+        # Emit the experimental-status warning once, when the client is initialized
+        warnings.warn(
+            'The RedisStorageClient is experimental and may change or be removed in future releases.',
+            category=UserWarning,
+            stacklevel=2,
+        )
+
     @override
     async def create_dataset_client(
         self,

From 31a1fa9fb7819fa888b8fbec829ae875d328c991 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Sun, 14 Sep 2025 03:13:01 +0000
Subject: [PATCH 06/12] update guide
---
 .../redis_storage_client_basic_example.py     |  10 ++
 ...is_storage_client_configuration_example.py |  27 +++
 docs/guides/storage_clients.mdx               | 166 ++++++++++++++++++
 .../storage_clients/_redis/_dataset_client.py |   6 +-
 .../_redis/_key_value_store_client.py         |   8 +-
 .../_redis/_request_queue_client.py           |  14 +-
 6 files changed, 217 insertions(+), 14 deletions(-)
 create mode 100644 docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py
 create mode 100644 docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py

diff --git a/docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py b/docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py
new file mode 100644
index 0000000000..e787069d94
--- /dev/null
+++ b/docs/guides/code_examples/storage_clients/redis_storage_client_basic_example.py
@@ -0,0 +1,10 @@
+from crawlee.crawlers import ParselCrawler
+from crawlee.storage_clients import RedisStorageClient
+
+# Create a new instance of the storage client using a connection string.
+# 'redis://localhost:6379' is just a placeholder; replace it with your actual
+# connection string.
+storage_client = RedisStorageClient(connection_string='redis://localhost:6379')
+
+# And pass it to the crawler.
+crawler = ParselCrawler(storage_client=storage_client) diff --git a/docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py b/docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py new file mode 100644 index 0000000000..ad1863aa23 --- /dev/null +++ b/docs/guides/code_examples/storage_clients/redis_storage_client_configuration_example.py @@ -0,0 +1,27 @@ +from redis.asyncio import Redis + +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler +from crawlee.storage_clients import RedisStorageClient + +# Create a new instance of storage client using a Redis client with custom settings. +# Replace host and port with your actual Redis server configuration. +# Other Redis client settings can be adjusted as needed. +storage_client = RedisStorageClient( + redis=Redis( + host='localhost', + port=6379, + retry_on_timeout=True, + socket_keepalive=True, + socket_connect_timeout=10, + ) +) + +# Create a configuration with custom settings. +configuration = Configuration(purge_on_start=False) + +# And pass them to the crawler. +crawler = ParselCrawler( + storage_client=storage_client, + configuration=configuration, +) diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx index 0c2a14ffe9..889563d568 100644 --- a/docs/guides/storage_clients.mdx +++ b/docs/guides/storage_clients.mdx @@ -8,12 +8,15 @@ import ApiLink from '@site/src/components/ApiLink'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import CodeBlock from '@theme/CodeBlock'; import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py'; import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py'; import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py'; import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py'; import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py'; +import RedisStorageClientBasicExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_basic_example.py'; +import RedisStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_configuration_example.py'; Storage clients provide a unified interface for interacting with `Dataset`, `KeyValueStore`, and `RequestQueue`, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups. @@ -50,6 +53,8 @@ class FileSystemStorageClient class MemoryStorageClient +class RedisStorageClient + class ApifyStorageClient %% ======================== @@ -58,6 +63,7 @@ class ApifyStorageClient StorageClient --|> FileSystemStorageClient StorageClient --|> MemoryStorageClient +StorageClient --|> RedisStorageClient StorageClient --|> ApifyStorageClient ``` @@ -125,6 +131,166 @@ The `MemoryStorageClient` does not persist data between runs. 
All data is lost when the process terminates.

     {MemoryStorageClientBasicExample}

+## Redis storage client
+
+:::warning Experimental feature
+The `RedisStorageClient` is experimental. Its API and behavior may change in future releases.
+:::
+
+The `RedisStorageClient` provides persistent storage using a [Redis](https://redis.io/) database. It supports concurrent access from multiple independent clients or processes and uses native Redis data structures for efficient operations.
+
+:::note dependencies
+The `RedisStorageClient` is not included in the core Crawlee package.
+To use it, you need to install Crawlee with the Redis extra dependency:
+
+`pip install 'crawlee[redis]'`
+
+Additionally, Redis version 8.0 or higher is required.
+:::
+
+The client requires either a Redis connection string or a pre-configured Redis client instance. Use a pre-configured client when you need custom Redis settings such as connection pooling, timeouts, or SSL/TLS encryption.
+
+
+    {RedisStorageClientBasicExample}
+
+
+Data is organized using Redis key patterns. Below are the main data structures used for each storage type:
+
+```mermaid
+---
+config:
+  class:
+    hideEmptyMembersBox: true
+---
+
+classDiagram
+
+%% ========================
+%% Storage Client
+%% ========================
+
+class RedisDatasetClient {
+  <>
+}
+
+%% ========================
+%% Dataset Keys
+%% ========================
+
+class Dataset_Keys {
+  datasets:[name]:items - JSON Array
+  datasets:[name]:metadata - JSON Object
+}
+
+class Datasets_Indexes {
+  datasets:id_to_name - Hash
+  datasets:name_to_id - Hash
+}
+
+%% ========================
+%% Client to Keys arrows
+%% ========================
+
+RedisDatasetClient --> Dataset_Keys
+RedisDatasetClient --> Datasets_Indexes
+```
+
+```mermaid
+---
+config:
+  class:
+    hideEmptyMembersBox: true
+---
+
+classDiagram
+
+%% ========================
+%% Storage Clients
+%% ========================
+
+class RedisKeyValueStoreClient {
+  <>
+}
+
+%% ========================
+%% Key-Value Store Keys
+%% ========================
+
+class Key_Value_Store_Keys {
+  key_value_stores:[name]:items - Hash
+  key_value_stores:[name]:metadata_items - Hash
+  key_value_stores:[name]:metadata - JSON Object
+}
+
+class Key_Value_Stores_Indexes {
+  key_value_stores:id_to_name - Hash
+  key_value_stores:name_to_id - Hash
+}
+
+%% ========================
+%% Client to Keys arrows
+%% ========================
+
+RedisKeyValueStoreClient --> Key_Value_Store_Keys
+RedisKeyValueStoreClient --> Key_Value_Stores_Indexes
+```
+
+```mermaid
+---
+config:
+  class:
+    hideEmptyMembersBox: true
+---
+
+classDiagram
+
+%% ========================
+%% Storage Clients
+%% ========================
+
+class RedisRequestQueueClient {
+  <>
+}
+
+%% ========================
+%% Request Queue Keys
+%% ========================
+
+class Request_Queue_Keys {
+  request_queues:[name]:queue - List
+  request_queues:[name]:data - Hash
+  request_queues:[name]:in_progress - Hash
+  request_queues:[name]:added_bloom_filter - Bloom Filter
+  request_queues:[name]:handled_bloom_filter - Bloom Filter
+  request_queues:[name]:metadata - JSON Object
+}
+
+class Request_Queues_Indexes {
+  request_queues:id_to_name - Hash
+  request_queues:name_to_id - Hash
+}
+
+%% ========================
+%% Client to Keys arrows
+%% ========================
+
+RedisRequestQueueClient --> Request_Queue_Keys
+RedisRequestQueueClient --> Request_Queues_Indexes
+```
+
+Configuration options for the `RedisStorageClient` can be set through environment variables or the `Configuration` class:
+
+- **`purge_on_start`** (env:
`CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start. + +Configuration options for the `RedisStorageClient` can be set via constructor arguments: + +- **`connection_string`** – Redis connection string, e.g. `redis://localhost:6379/0`. +- **`redis`** – Pre-configured Redis client instance (optional). + + + {RedisStorageClientConfigurationExample} + + ## Creating a custom storage client A storage client consists of two parts: the storage client factory and individual storage type clients. The `StorageClient` acts as a factory that creates specific clients (`DatasetClient`, `KeyValueStoreClient`, `RequestQueueClient`) where the actual storage logic is implemented. diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 3ef83bc439..0110028da6 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -34,8 +34,8 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin): Items are stored as JSON objects with automatic ordering preservation through Redis list operations. The dataset data is stored in Redis using the following key pattern: - - `dataset:{name}:items` - Redis JSON array containing all dataset items. - - `dataset:{name}:metadata` - Redis JSON object containing dataset metadata. + - `datasets:{name}:items` - Redis JSON array containing all dataset items. + - `datasets:{name}:metadata` - Redis JSON object containing dataset metadata. Items must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset. The item ordering is preserved through Redis JSON array operations. All operations provide atomic consistency @@ -45,7 +45,7 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin): _DEFAULT_NAME = 'default' """Default Dataset name key prefix when none provided.""" - _MAIN_KEY = 'dataset' + _MAIN_KEY = 'datasets' """Main Redis key prefix for Dataset.""" _CLIENT_TYPE = 'Dataset' diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index 05af26baf8..04638814cb 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -29,9 +29,9 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin): Keys are mapped to values with automatic content type detection and size tracking for metadata management. The key-value store data is stored in Redis using the following key pattern: - - `key-value-store:{name}:items` - Redis hash containing key-value pairs (values stored as binary data). - - `key-value-store:{name}:metadata_items` - Redis hash containing metadata for each key. - - `key-value-store:{name}:metadata` - Redis JSON object containing store metadata. + - `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data). + - `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key. + - `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata. Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings, text values as UTF-8 encoded strings, and binary data as-is. 
The implementation automatically handles
@@ -44,7 +44,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
     _DEFAULT_NAME = 'default'
     """Default Key-Value Store name key prefix when none provided."""

-    _MAIN_KEY = 'key-value-store'
+    _MAIN_KEY = 'key_value_stores'
     """Main Redis key prefix for Key-Value Store."""

     _CLIENT_TYPE = 'Key-value store'
diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py
index 77e80c734a..73740bd23e 100644
--- a/src/crawlee/storage_clients/_redis/_request_queue_client.py
+++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py
@@ -51,12 +51,12 @@ class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
     with timestamp-based expiration for stale request recovery.

     The request queue data is stored in Redis using the following key patterns:
-    - `request_queue:{name}:queue` - Redis list for FIFO request ordering
-    - `request_queue:{name}:data` - Redis hash storing serialized Request objects by unique_key
-    - `request_queue:{name}:in_progress` - Redis hash tracking requests currently being processed
-    - `request_queue:{name}:added_bloom_filter` - Bloom filter for added request deduplication
-    - `request_queue:{name}:handled_bloom_filter` - Bloom filter for completed request tracking
-    - `request_queue:{name}:metadata` - Redis JSON object containing queue metadata
+    - `request_queues:{name}:queue` - Redis list for FIFO request ordering
+    - `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key
+    - `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed
+    - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication
+    - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking
+    - `request_queues:{name}:metadata` - Redis JSON object containing queue metadata

     Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list
     operations. The implementation provides concurrent access safety through atomic Lua scripts,
@@ -66,7 +66,7 @@ class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
     _DEFAULT_NAME = 'default'
     """Default Request Queue name key prefix when none provided."""

-    _MAIN_KEY = 'request_queue'
+    _MAIN_KEY = 'request_queues'
     """Main Redis key prefix for Request Queue."""

     _CLIENT_TYPE = 'Request queue'

From d46ffbecff5a5384c1168f3f8d05f41b3493618d Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Sun, 14 Sep 2025 03:18:33 +0000
Subject: [PATCH 07/12] add to built-in clients list
---
 docs/guides/storage_clients.mdx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx
index 889563d568..f78401c247 100644
--- a/docs/guides/storage_clients.mdx
+++ b/docs/guides/storage_clients.mdx
@@ -26,6 +26,7 @@ Crawlee provides three main storage client implementations:

 - `FileSystemStorageClient` - Provides persistent file system storage with in-memory caching.
 - `MemoryStorageClient` - Stores data in memory with no persistence.
+- `RedisStorageClient` - Provides persistent storage using a [Redis](https://redis.io/) database v8.0+. Requires installing the extra dependency: `crawlee[redis]`.
 - [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).
```mermaid From 5b77ab63dd8b40303836dfd081dd59b20e53ec2b Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 15 Sep 2025 17:17:26 +0000 Subject: [PATCH 08/12] add tests for Redis clients --- .../storage_clients/_redis/_client_mixin.py | 14 +- .../storage_clients/_redis/_dataset_client.py | 14 +- .../_redis/_key_value_store_client.py | 27 +- .../_redis/_request_queue_client.py | 62 ++--- tests/unit/conftest.py | 9 + .../_redis/test_redis_dataset_client.py | 178 +++++++++++++ .../_redis/test_redis_kvs_client.py | 244 ++++++++++++++++++ .../_redis/test_redis_rq_client.py | 244 ++++++++++++++++++ tests/unit/storages/test_dataset.py | 5 +- tests/unit/storages/test_key_value_store.py | 5 +- tests/unit/storages/test_request_queue.py | 5 +- 11 files changed, 745 insertions(+), 62 deletions(-) create mode 100644 tests/unit/storage_clients/_redis/test_redis_dataset_client.py create mode 100644 tests/unit/storage_clients/_redis/test_redis_kvs_client.py create mode 100644 tests/unit/storage_clients/_redis/test_redis_rq_client.py diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py index 8458058fd3..5401db4c2b 100644 --- a/src/crawlee/storage_clients/_redis/_client_mixin.py +++ b/src/crawlee/storage_clients/_redis/_client_mixin.py @@ -94,7 +94,12 @@ async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None: id: The ID of the storage. redis: The Redis client instance. """ - return await await_redis_response(redis.get(f'{cls._MAIN_KEY}:id_to_name:{id}')) + name = await await_redis_response(redis.hget(f'{cls._MAIN_KEY}:id_to_name', id)) + if isinstance(name, str) or name is None: + return name + if isinstance(name, bytes): + return name.decode('utf-8') + return None @classmethod async def _open( @@ -199,14 +204,14 @@ async def _create_metadata_and_storage(self, storage_name: str, metadata: dict) metadata['modified_at'] = metadata['modified_at'].isoformat() # Try to create name_to_id index entry, if it already exists, return False. - name_to_id = await await_redis_response(self._redis.hsetnx(index_id_to_name, storage_name, metadata['id'])) + name_to_id = await await_redis_response(self._redis.hsetnx(index_name_to_id, storage_name, metadata['id'])) # If name already exists, return False. Probably an attempt at parallel creation. if not name_to_id: return False # Create id_to_name index entry, metadata, and storage structure in a transaction. 
async with self._get_pipeline() as pipe: - await await_redis_response(pipe.hsetnx(index_name_to_id, metadata['id'], storage_name)) + await await_redis_response(pipe.hsetnx(index_id_to_name, metadata['id'], storage_name)) await await_redis_response(pipe.json().set(self.metadata_key, '$', metadata)) await await_redis_response(pipe.lpush(f'{self._MAIN_KEY}:{storage_name}:created_signal', 1)) @@ -219,6 +224,7 @@ async def _drop(self, extra_keys: list[str]) -> None: await pipe.delete(self.metadata_key) await pipe.delete(f'{self._MAIN_KEY}:id_to_name', self._storage_id) await pipe.delete(f'{self._MAIN_KEY}:name_to_id', self._storage_name) + await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:created_signal') for key in extra_keys: await pipe.delete(key) @@ -243,6 +249,8 @@ async def _get_metadata( metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis) if metadata_dict is None: raise ValueError(f'{self._CLIENT_TYPE} with name "{self._storage_name}" does not exist.') + async with self._get_pipeline() as pipe: + await self._update_metadata(pipe, update_accessed_at=True) return metadata_model.model_validate(metadata_dict) diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 0110028da6..1cf2260ee2 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -64,7 +64,7 @@ def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) @property - def items_key(self) -> str: + def _items_key(self) -> str: """Return the Redis key for the items of this dataset.""" return f'{self._MAIN_KEY}:{self._storage_name}:items' @@ -107,12 +107,12 @@ async def get_metadata(self) -> DatasetMetadata: @override async def drop(self) -> None: - await self._drop(extra_keys=[self.items_key]) + await self._drop(extra_keys=[self._items_key]) @override async def purge(self) -> None: await self._purge( - extra_keys=[self.items_key], + extra_keys=[self._items_key], metadata_kwargs=_DatasetMetadataUpdateParams( new_item_count=0, update_accessed_at=True, update_modified_at=True ), @@ -125,7 +125,7 @@ async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: async with self._get_pipeline() as pipe: # Incorrect signature for args type in redis-py - pipe.json().arrappend(self.items_key, '$', *data) # type: ignore[arg-type] + pipe.json().arrappend(self._items_key, '$', *data) # type: ignore[arg-type] delta_item_count = len(data) await self._update_metadata( pipe, @@ -193,7 +193,7 @@ async def get_data( if json_path == '$': json_path = '$[*]' - data = await await_redis_response(self._redis.json().get(self.items_key, json_path)) + data = await await_redis_response(self._redis.json().get(self._items_key, json_path)) if data is None: data = [] @@ -278,7 +278,7 @@ async def iterate_items( json_path = f'$[{batch_start}:{batch_end}]' # Get batch of items - batch_items = await await_redis_response(self._redis.json().get(self.items_key, json_path)) + batch_items = await await_redis_response(self._redis.json().get(self._items_key, json_path)) # Handle case where batch_items might be None or not a list if batch_items is None: @@ -303,7 +303,7 @@ async def iterate_items( async def _create_storage(self, pipeline: Pipeline) -> None: """Create the main dataset keys in Redis.""" # Create an empty JSON array for items - await 
await_redis_response(pipeline.json().set(self.items_key, '$', [])) + await await_redis_response(pipeline.json().set(self._items_key, '$', [])) @override async def _specific_update_metadata( diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index 04638814cb..47e2b84a95 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -58,12 +58,12 @@ def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) @property - def items_key(self) -> str: + def _items_key(self) -> str: """Return the Redis key for the items of KVS.""" return f'{self._MAIN_KEY}:{self._storage_name}:items' @property - def metadata_items_key(self) -> str: + def _metadata_items_key(self) -> str: """Return the Redis key for the items metadata of KVS.""" return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items' @@ -106,12 +106,12 @@ async def get_metadata(self) -> KeyValueStoreMetadata: @override async def drop(self) -> None: - await self._drop(extra_keys=[self.items_key, self.metadata_items_key]) + await self._drop(extra_keys=[self._items_key, self._metadata_items_key]) @override async def purge(self) -> None: await self._purge( - extra_keys=[self.items_key, self.metadata_items_key], + extra_keys=[self._items_key, self._metadata_items_key], metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True), ) @@ -144,11 +144,11 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No async with self._get_pipeline() as pipe: # redis-py typing issue - await await_redis_response(pipe.hset(self.items_key, key, value_bytes)) # type: ignore[arg-type] + await await_redis_response(pipe.hset(self._items_key, key, value_bytes)) # type: ignore[arg-type] await await_redis_response( pipe.hset( - self.metadata_items_key, + self._metadata_items_key, key, item_metadata.model_dump_json(), ) @@ -157,7 +157,10 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: - serialized_metadata_item = await await_redis_response(self._redis.hget(self.metadata_items_key, key)) + serialized_metadata_item = await await_redis_response(self._redis.hget(self._metadata_items_key, key)) + + async with self._get_pipeline() as pipe: + await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True)) if not isinstance(serialized_metadata_item, (str, bytes, bytearray)): logger.warning(f'Metadata for key "{key}" is missing or invalid.') @@ -172,7 +175,7 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: # Query the record by key # redis-py typing issue value_bytes: bytes | None = await await_redis_response( - self._redis.hget(self.items_key, key) # type: ignore[arg-type] + self._redis.hget(self._items_key, key) # type: ignore[arg-type] ) if value_bytes is None: @@ -202,8 +205,8 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: @override async def delete_value(self, *, key: str) -> None: async with self._get_pipeline() as pipe: - await await_redis_response(pipe.hdel(self.items_key, key)) - await await_redis_response(pipe.hdel(self.metadata_items_key, key)) + await await_redis_response(pipe.hdel(self._items_key, key)) + await await_redis_response(pipe.hdel(self._metadata_items_key, 
key)) await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)) @override @@ -213,7 +216,7 @@ async def iterate_keys( exclusive_start_key: str | None = None, limit: int | None = None, ) -> AsyncIterator[KeyValueStoreRecordMetadata]: - items_data = await await_redis_response(self._redis.hgetall(self.metadata_items_key)) + items_data = await await_redis_response(self._redis.hgetall(self._metadata_items_key)) if not items_data: return # No items to iterate over @@ -251,7 +254,7 @@ async def get_public_url(self, *, key: str) -> str: @override async def record_exists(self, *, key: str) -> bool: async with self._get_pipeline(with_execute=False) as pipe: - await await_redis_response(pipe.hexists(self.items_key, key)) + await await_redis_response(pipe.hexists(self._items_key, key)) await self._update_metadata( pipe, **MetadataUpdateParams(update_accessed_at=True), diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py index 73740bd23e..941f700009 100644 --- a/src/crawlee/storage_clients/_redis/_request_queue_client.py +++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py @@ -102,27 +102,27 @@ def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: self._next_reclaim_stale: None | datetime = None @property - def added_filter_key(self) -> str: + def _added_filter_key(self) -> str: """Return the Redis key for the added requests Bloom filter.""" return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' @property - def handled_filter_key(self) -> str: + def _handled_filter_key(self) -> str: """Return the Redis key for the handled requests Bloom filter.""" return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' @property - def queue_key(self) -> str: + def _queue_key(self) -> str: """Return the Redis key for the request queue.""" return f'{self._MAIN_KEY}:{self._storage_name}:queue' @property - def data_key(self) -> str: + def _data_key(self) -> str: """Return the Redis key for the request data hash.""" return f'{self._MAIN_KEY}:{self._storage_name}:data' @property - def in_progress_key(self) -> str: + def _in_progress_key(self) -> str: """Return the Redis key for the in-progress requests hash.""" return f'{self._MAIN_KEY}:{self._storage_name}:in_progress' @@ -172,11 +172,11 @@ async def get_metadata(self) -> RequestQueueMetadata: async def drop(self) -> None: await self._drop( extra_keys=[ - self.added_filter_key, - self.handled_filter_key, - self.queue_key, - self.data_key, - self.in_progress_key, + self._added_filter_key, + self._handled_filter_key, + self._queue_key, + self._data_key, + self._in_progress_key, ] ) @@ -184,11 +184,11 @@ async def drop(self) -> None: async def purge(self) -> None: await self._purge( extra_keys=[ - self.added_filter_key, - self.handled_filter_key, - self.queue_key, - self.data_key, - self.in_progress_key, + self._added_filter_key, + self._handled_filter_key, + self._queue_key, + self._data_key, + self._in_progress_key, ], metadata_kwargs=_QueueMetadataUpdateParams( update_accessed_at=True, @@ -217,8 +217,8 @@ async def add_batch_of_requests( unique_keys = list(requests_by_unique_key.keys()) # Check which requests are already added or handled async with self._get_pipeline(with_execute=False) as pipe: - await await_redis_response(pipe.bf().mexists(self.added_filter_key, *unique_keys)) # type: ignore[no-untyped-call] - await await_redis_response(pipe.bf().mexists(self.handled_filter_key, 
*unique_keys)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys)) # type: ignore[no-untyped-call] results = await pipe.execute() @@ -262,7 +262,7 @@ async def add_batch_of_requests( if new_unique_keys: # Add new requests to the queue atomically, get back which were actually added script_results = await self._add_requests_script( - keys=[self.added_filter_key, self.queue_key, self.data_key], + keys=[self._added_filter_key, self._queue_key, self._data_key], args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)], ) actually_added = set(json.loads(script_results)) @@ -317,7 +317,7 @@ async def fetch_next_request(self) -> Request | None: # The script retrieves requests from the queue and places them in the in_progress hash. requests_json = await self._fetch_script( - keys=[self.queue_key, self.in_progress_key, self.data_key], + keys=[self._queue_key, self._in_progress_key, self._data_key], args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE], ) @@ -335,7 +335,7 @@ async def fetch_next_request(self) -> Request | None: @override async def get_request(self, unique_key: str) -> Request | None: - request_data = await await_redis_response(self._redis.hget(self.data_key, unique_key)) + request_data = await await_redis_response(self._redis.hget(self._data_key, unique_key)) if isinstance(request_data, (str, bytes, bytearray)): return Request.model_validate_json(request_data) @@ -346,16 +346,16 @@ async def get_request(self, unique_key: str) -> Request | None: async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: # Check if the request is in progress. 
- check_in_progress = await await_redis_response(self._redis.hexists(self.in_progress_key, request.unique_key)) + check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key)) if not check_in_progress: logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.') return None async with self._get_pipeline() as pipe: - await await_redis_response(pipe.bf().add(self.handled_filter_key, request.unique_key)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key)) # type: ignore[no-untyped-call] - await await_redis_response(pipe.hdel(self.in_progress_key, request.unique_key)) - await await_redis_response(pipe.hdel(self.data_key, request.unique_key)) + await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key)) + await await_redis_response(pipe.hdel(self._data_key, request.unique_key)) await self._update_metadata( pipe, @@ -380,7 +380,7 @@ async def reclaim_request( *, forefront: bool = False, ) -> ProcessedRequest | None: - check_in_progress = await await_redis_response(self._redis.hexists(self.in_progress_key, request.unique_key)) + check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key)) if not check_in_progress: logger.info(f'Reclaiming request {request.unique_key} that is not in progress.') return None @@ -393,15 +393,15 @@ async def reclaim_request( await await_redis_response( pipe.hset( - self.in_progress_key, + self._in_progress_key, request.unique_key, f'{{"client_id":"{self.client_key}","blocked_until_timestamp":{blocked_until_timestamp}}}', ) ) self._pending_fetch_cache.appendleft(request) else: - await await_redis_response(pipe.rpush(self.queue_key, request.unique_key)) - await await_redis_response(pipe.hdel(self.in_progress_key, request.unique_key)) + await await_redis_response(pipe.rpush(self._queue_key, request.unique_key)) + await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key)) await self._update_metadata( pipe, **_QueueMetadataUpdateParams( @@ -444,8 +444,8 @@ async def _load_scripts(self) -> None: @override async def _create_storage(self, pipeline: Pipeline) -> None: # Create Bloom filters for added and handled requests - await await_redis_response(pipeline.bf().create(self.added_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] - await await_redis_response(pipeline.bf().create(self.handled_filter_key, 0.1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] + await await_redis_response(pipeline.bf().create(self._added_filter_key, 1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] + await await_redis_response(pipeline.bf().create(self._handled_filter_key, 1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] async def _reclaim_stale_requests(self) -> None: """Reclaim requests that have been in progress for too long.""" @@ -456,7 +456,7 @@ async def _reclaim_stale_requests(self) -> None: current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000) await self._reclaim_stale_script( - keys=[self.in_progress_key, self.queue_key, self.data_key], args=[current_time] + keys=[self._in_progress_key, self._queue_key, self._data_key], args=[current_time] ) @override diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 7d758e5ee5..c90bf03eb7 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -5,6 +5,7 @@ import logging import os +import warnings from typing import 
TYPE_CHECKING, cast import pytest @@ -29,6 +30,14 @@ from crawlee.http_clients._base import HttpClient +@pytest.fixture +async def suppress_user_warning() -> AsyncGenerator[None, None]: + """Suppress user warnings during tests.""" + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UserWarning) + yield + + @pytest.fixture def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]: """Prepare the testing environment by resetting the global state before each test. diff --git a/tests/unit/storage_clients/_redis/test_redis_dataset_client.py b/tests/unit/storage_clients/_redis/test_redis_dataset_client.py new file mode 100644 index 0000000000..010e868074 --- /dev/null +++ b/tests/unit/storage_clients/_redis/test_redis_dataset_client.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING + +import pytest + +from crawlee.storage_clients import RedisStorageClient +from crawlee.storage_clients._redis._utils import await_redis_response + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from fakeredis import FakeAsyncRedis + + from crawlee.storage_clients._redis import RedisDatasetClient + + +@pytest.fixture +async def dataset_client( + redis_client: FakeAsyncRedis, + suppress_user_warning: None, # noqa: ARG001 +) -> AsyncGenerator[RedisDatasetClient, None]: + """A fixture for a Redis dataset client.""" + client = await RedisStorageClient(redis=redis_client).create_dataset_client( + name='test_dataset', + ) + yield client + await client.drop() + + +async def test_base_keys_creation(dataset_client: RedisDatasetClient) -> None: + """Test that Redis dataset client creates proper keys.""" + metadata = await dataset_client.get_metadata() + name = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id)) + + assert name is not None + assert (name.decode() if isinstance(name, bytes) else name) == 'test_dataset' + + dataset_id = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset')) + + assert dataset_id is not None + assert (dataset_id.decode() if isinstance(dataset_id, bytes) else dataset_id) == metadata.id + + items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) + assert items is not None + assert len(items) == 0 + + metadata_data = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:metadata')) + + assert isinstance(metadata_data, dict) + assert metadata_data['id'] == metadata.id # type: ignore[unreachable] # py-json typing is broken + + +async def test_record_and_content_verification(dataset_client: RedisDatasetClient) -> None: + """Test that data is properly persisted to Redis with correct content.""" + item = {'key': 'value', 'number': 42} + await dataset_client.push_data(item) + + # Verify metadata record + metadata = await dataset_client.get_metadata() + assert metadata.item_count == 1 + assert metadata.created_at is not None + assert metadata.modified_at is not None + assert metadata.accessed_at is not None + + # Verify records in Redis + all_items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) + + assert all_items is not None + assert len(all_items) == 1 + + # Verify actual stored content + assert all_items[0] == item + + # Test multiple records + items = [{'id': 1, 'name': 'Item 1'}, {'id': 2, 'name': 'Item 2'}, {'id': 3, 'name': 'Item 3'}] + await dataset_client.push_data(items) 
+ + all_items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) + assert all_items is not None + assert len(all_items) == 4 + + +async def test_drop_removes_records(dataset_client: RedisDatasetClient) -> None: + """Test that dropping a dataset removes all records from Redis.""" + await dataset_client.push_data({'test': 'data'}) + + metadata = await dataset_client.get_metadata() + name = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id)) + dataset_id = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset')) + items = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) + + assert name is not None + assert (name.decode() if isinstance(name, bytes) else name) == 'test_dataset' + assert dataset_id is not None + assert (dataset_id.decode() if isinstance(dataset_id, bytes) else dataset_id) == metadata.id + assert items is not None + assert len(items) == 1 + + # Drop the dataset + await dataset_client.drop() + + # Verify removal of all records + name_after_drop = await await_redis_response(dataset_client.redis.hget('datasets:id_to_name', metadata.id)) + dataset_id_after_drop = await await_redis_response(dataset_client.redis.hget('datasets:name_to_id', 'test_dataset')) + items_after_drop = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:items', '$')) + + assert name_after_drop is None + assert dataset_id_after_drop is None + assert items_after_drop is None + + +async def test_metadata_record_updates(dataset_client: RedisDatasetClient) -> None: + """Test that metadata record is updated correctly after operations.""" + # Record initial timestamps + metadata = await dataset_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates accessed_at + await dataset_client.get_data() + + # Verify timestamps + metadata = await dataset_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified + + accessed_after_get = metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform an operation that updates modified_at + await dataset_client.push_data({'new': 'item'}) + + # Verify timestamps again + metadata = await dataset_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_get + + # Verify metadata record is updated in Redis + metadata_json = await await_redis_response(dataset_client.redis.json().get(f'datasets:{metadata.name}:metadata')) + + assert isinstance(metadata_json, dict) + assert metadata_json['item_count'] == 1 # type: ignore[unreachable] # py-json typing is broken + + +async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None: + """Test that data persists correctly when reopening the same dataset.""" + storage_client = RedisStorageClient(redis=redis_client) + + # Create dataset and add data + original_client = await storage_client.create_dataset_client( + name='persistence-test', + ) + + test_data = {'test_item': 'test_value', 'id': 123} + await original_client.push_data(test_data) + + dataset_id = 
(await original_client.get_metadata()).id + + # Reopen by ID and verify data persists + reopened_client = await storage_client.create_dataset_client( + id=dataset_id, + ) + + data = await reopened_client.get_data() + assert len(data.items) == 1 + assert data.items[0] == test_data + + await reopened_client.drop() diff --git a/tests/unit/storage_clients/_redis/test_redis_kvs_client.py b/tests/unit/storage_clients/_redis/test_redis_kvs_client.py new file mode 100644 index 0000000000..0b083a38d0 --- /dev/null +++ b/tests/unit/storage_clients/_redis/test_redis_kvs_client.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +import asyncio +import json +from typing import TYPE_CHECKING + +import pytest + +from crawlee.storage_clients import RedisStorageClient +from crawlee.storage_clients._redis._utils import await_redis_response + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from fakeredis import FakeAsyncRedis + + from crawlee.storage_clients._redis import RedisKeyValueStoreClient + + +@pytest.fixture +async def kvs_client( + redis_client: FakeAsyncRedis, + suppress_user_warning: None, # noqa: ARG001 +) -> AsyncGenerator[RedisKeyValueStoreClient, None]: + """A fixture for a Redis KVS client.""" + client = await RedisStorageClient(redis=redis_client).create_kvs_client( + name='test_kvs', + ) + yield client + await client.drop() + + +async def test_base_keys_creation(kvs_client: RedisKeyValueStoreClient) -> None: + """Test that Redis KVS client creates proper keys.""" + metadata = await kvs_client.get_metadata() + name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id)) + + assert name is not None + assert (name.decode() if isinstance(name, bytes) else name) == 'test_kvs' + + kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs')) + + assert kvs_id is not None + assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id + + metadata_data = await await_redis_response(kvs_client.redis.json().get('key_value_stores:test_kvs:metadata')) + + assert isinstance(metadata_data, dict) + assert metadata_data['id'] == metadata.id # type: ignore[unreachable] # py-json typing is broken + + +async def test_value_record_creation_and_content(kvs_client: RedisKeyValueStoreClient) -> None: + """Test that values are properly persisted to records with correct content and metadata.""" + test_key = 'test-key' + test_value = 'Hello, world!' 
+ await kvs_client.set_value(key=test_key, value=test_value) + + # Check if the records were created + records_key = 'key_value_stores:test_kvs:items' + records_items_metadata = 'key_value_stores:test_kvs:metadata_items' + record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) + metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) + assert record_exists is True + assert metadata_exists is True + + # Check record content + content = await await_redis_response(kvs_client.redis.hget(records_key, test_key)) + content = content.decode() if isinstance(content, bytes) else content + assert content == test_value + + # Check record metadata + record_metadata = await await_redis_response(kvs_client.redis.hget(records_items_metadata, test_key)) + assert record_metadata is not None + assert isinstance(record_metadata, (str, bytes)) + metadata = json.loads(record_metadata) + + # Verify metadata fields + assert metadata['key'] == test_key + assert metadata['content_type'] == 'text/plain; charset=utf-8' + assert metadata['size'] == len(test_value.encode('utf-8')) + + # Verify retrieval works correctly + check_value = await kvs_client.get_value(key=test_key) + assert check_value is not None + assert check_value.value == test_value + + +async def test_binary_data_persistence(kvs_client: RedisKeyValueStoreClient) -> None: + """Test that binary data is stored correctly without corruption.""" + test_key = 'test-binary' + test_value = b'\x00\x01\x02\x03\x04' + records_key = 'key_value_stores:test_kvs:items' + records_items_metadata = 'key_value_stores:test_kvs:metadata_items' + await kvs_client.set_value(key=test_key, value=test_value) + + # Verify binary record exists + record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) + metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) + assert record_exists is True + assert metadata_exists is True + + # Verify binary content is preserved + content = await await_redis_response(kvs_client.redis.hget(records_key, test_key)) + assert content == test_value + + # Verify retrieval works correctly + record = await kvs_client.get_value(key=test_key) + assert record is not None + assert record.value == test_value + assert record.content_type == 'application/octet-stream' + + +async def test_json_serialization_to_record(kvs_client: RedisKeyValueStoreClient) -> None: + """Test that JSON objects are properly serialized to records.""" + test_key = 'test-json' + test_value = {'name': 'John', 'age': 30, 'items': [1, 2, 3]} + await kvs_client.set_value(key=test_key, value=test_value) + + # Check if record content is valid JSON + records_key = 'key_value_stores:test_kvs:items' + record = await await_redis_response(kvs_client.redis.hget(records_key, test_key)) + assert record is not None + assert isinstance(record, (str, bytes)) + assert json.loads(record) == test_value + + +async def test_records_deletion_on_value_delete(kvs_client: RedisKeyValueStoreClient) -> None: + """Test that deleting a value removes its records from Redis.""" + test_key = 'test-delete' + test_value = 'Delete me' + records_key = 'key_value_stores:test_kvs:items' + records_items_metadata = 'key_value_stores:test_kvs:metadata_items' + + # Set a value + await kvs_client.set_value(key=test_key, value=test_value) + + # Verify records exist + record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) + 
metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) + assert record_exists is True + assert metadata_exists is True + + # Delete the value + await kvs_client.delete_value(key=test_key) + + # Verify records were deleted + record_exists = await await_redis_response(kvs_client.redis.hexists(records_key, test_key)) + metadata_exists = await await_redis_response(kvs_client.redis.hexists(records_items_metadata, test_key)) + assert record_exists is False + assert metadata_exists is False + + +async def test_drop_removes_keys(kvs_client: RedisKeyValueStoreClient) -> None: + """Test that drop removes all store keys from Redis.""" + await kvs_client.set_value(key='test', value='test-value') + + metadata = await kvs_client.get_metadata() + name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id)) + kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs')) + items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:items')) + metadata_items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:metadata_items')) + + assert name is not None + assert (name.decode() if isinstance(name, bytes) else name) == 'test_kvs' + assert kvs_id is not None + assert (kvs_id.decode() if isinstance(kvs_id, bytes) else kvs_id) == metadata.id + assert items is not None + assert items != {} + assert metadata_items is not None + assert metadata_items != {} + + # Drop the store + await kvs_client.drop() + + name = await await_redis_response(kvs_client.redis.hget('key_value_stores:id_to_name', metadata.id)) + kvs_id = await await_redis_response(kvs_client.redis.hget('key_value_stores:name_to_id', 'test_kvs')) + items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:items')) + metadata_items = await await_redis_response(kvs_client.redis.hgetall('key_value_stores:test_kvs:metadata_items')) + assert name is None + assert kvs_id is None + assert items == {} + assert metadata_items == {} + + +async def test_metadata_record_updates(kvs_client: RedisKeyValueStoreClient) -> None: + """Test that read/write operations properly update metadata timestamps.""" + # Record initial timestamps + metadata = await kvs_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform a read operation + await kvs_client.get_value(key='nonexistent') + + # Verify accessed timestamp was updated + metadata = await kvs_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified + + accessed_after_read = metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform a write operation + await kvs_client.set_value(key='test', value='test-value') + + # Verify modified timestamp was updated + metadata = await kvs_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_read + + +async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None: + """Test that data persists correctly when reopening the same KVS.""" + storage_client = 
RedisStorageClient(redis=redis_client) + + # Create KVS and add data + original_client = await storage_client.create_kvs_client( + name='persistence-test', + ) + + test_key = 'persistent-key' + test_value = 'persistent-value' + await original_client.set_value(key=test_key, value=test_value) + + kvs_id = (await original_client.get_metadata()).id + + # Reopen by ID and verify data persists + reopened_client = await storage_client.create_kvs_client( + id=kvs_id, + ) + + record = await reopened_client.get_value(key=test_key) + assert record is not None + assert record.value == test_value + + await reopened_client.drop() diff --git a/tests/unit/storage_clients/_redis/test_redis_rq_client.py b/tests/unit/storage_clients/_redis/test_redis_rq_client.py new file mode 100644 index 0000000000..1233ee76a3 --- /dev/null +++ b/tests/unit/storage_clients/_redis/test_redis_rq_client.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +import asyncio +import json +from typing import TYPE_CHECKING + +import pytest + +from crawlee import Request +from crawlee.storage_clients import RedisStorageClient +from crawlee.storage_clients._redis._utils import await_redis_response + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + from fakeredis import FakeAsyncRedis + + from crawlee.storage_clients._redis import RedisRequestQueueClient + + +@pytest.fixture +async def rq_client( + redis_client: FakeAsyncRedis, + suppress_user_warning: None, # noqa: ARG001 +) -> AsyncGenerator[RedisRequestQueueClient, None]: + """A fixture for a Redis RQ client.""" + client = await RedisStorageClient(redis=redis_client).create_rq_client( + name='test_request_queue', + ) + yield client + await client.drop() + + +async def test_base_keys_creation(rq_client: RedisRequestQueueClient) -> None: + """Test that Redis RQ client creates proper keys.""" + + metadata = await rq_client.get_metadata() + name = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id)) + + assert name is not None + assert (name.decode() if isinstance(name, bytes) else name) == 'test_request_queue' + + rq_id = await await_redis_response(rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue')) + + assert rq_id is not None + assert (rq_id.decode() if isinstance(rq_id, bytes) else rq_id) == metadata.id + + added_bf = await await_redis_response( + rq_client.redis.bf().info('request_queues:test_request_queue:added_bloom_filter') # type: ignore[no-untyped-call] + ) + assert added_bf is not None + + handled_bf = await await_redis_response( + rq_client.redis.bf().info('request_queues:test_request_queue:handled_bloom_filter') # type: ignore[no-untyped-call] + ) + assert handled_bf is not None + + metadata_data = await await_redis_response(rq_client.redis.json().get('request_queues:test_request_queue:metadata')) + + assert isinstance(metadata_data, dict) + assert metadata_data['id'] == metadata.id # type: ignore[unreachable] # py-json typing is broken + + +async def test_request_records_persistence(rq_client: RedisRequestQueueClient) -> None: + """Test that requests are properly persisted to Redis.""" + requests = [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + Request.from_url('https://example.com/3'), + ] + + await rq_client.add_batch_of_requests(requests) + + # Verify request records are created + request_queue_response = await await_redis_response( + rq_client.redis.lmpop(1, 'request_queues:test_request_queue:queue', direction='left', count=10) # 
type: ignore[arg-type] # redis-py typing is broken + ) + assert request_queue_response is not None + assert isinstance(request_queue_response, list) + request_keys = request_queue_response[1] + + assert len(request_keys) == 3 + + # Verify stored request content + requests_records_data = await await_redis_response( + rq_client.redis.hgetall('request_queues:test_request_queue:data') + ) + assert isinstance(requests_records_data, dict) + + for key in request_keys: + request_data = json.loads(requests_records_data[key]) + assert 'url' in request_data + assert request_data['url'].startswith('https://example.com/') + + +async def test_drop_removes_records(rq_client: RedisRequestQueueClient) -> None: + """Test that drop removes all request records from Redis.""" + await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) + + rq_queue = 'request_queues:test_request_queue:queue' + rq_data = 'request_queues:test_request_queue:data' + added_bf = 'request_queues:test_request_queue:added_bloom_filter' + handled_bf = 'request_queues:test_request_queue:handled_bloom_filter' + metadata_key = 'request_queues:test_request_queue:metadata' + + metadata = await rq_client.get_metadata() + name = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id)) + + assert name is not None + assert (name.decode() if isinstance(name, bytes) else name) == 'test_request_queue' + + rq_id = await await_redis_response(rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue')) + assert rq_id is not None + assert (rq_id.decode() if isinstance(rq_id, bytes) else rq_id) == metadata.id + + rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue)) + rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data)) + added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf)) + handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf)) + metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key)) + assert rq_queue_exists == 1 + assert rq_data_exists == 1 + assert added_bf_exists == 1 + assert handled_bf_exists == 1 + assert metadata_exists == 1 + + # Drop the request queue + await rq_client.drop() + + # Verify removal of all records + name_after_drop = await await_redis_response(rq_client.redis.hget('request_queues:id_to_name', metadata.id)) + rq_id_after_drop = await await_redis_response( + rq_client.redis.hget('request_queues:name_to_id', 'test_request_queue') + ) + rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue)) + rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data)) + added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf)) + handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf)) + metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key)) + assert name_after_drop is None + assert rq_id_after_drop is None + assert rq_queue_exists == 0 + assert rq_data_exists == 0 + assert added_bf_exists == 0 + assert handled_bf_exists == 0 + assert metadata_exists == 0 + + +async def test_metadata_file_updates(rq_client: RedisRequestQueueClient) -> None: + """Test that metadata record is updated correctly after operations.""" + # Record initial timestamps + metadata = await rq_client.get_metadata() + initial_created = metadata.created_at + initial_accessed = metadata.accessed_at + initial_modified = metadata.modified_at + + # Wait a moment to ensure timestamps can 
change + await asyncio.sleep(0.01) + + # Perform a read operation + await rq_client.is_empty() + + # Verify accessed timestamp was updated + metadata = await rq_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.accessed_at > initial_accessed + assert metadata.modified_at == initial_modified + + accessed_after_read = metadata.accessed_at + + # Wait a moment to ensure timestamps can change + await asyncio.sleep(0.01) + + # Perform a write operation + await rq_client.add_batch_of_requests([Request.from_url('https://example.com')]) + + # Verify modified timestamp was updated + metadata = await rq_client.get_metadata() + assert metadata.created_at == initial_created + assert metadata.modified_at > initial_modified + assert metadata.accessed_at > accessed_after_read + + # Verify metadata record is updated in Redis + metadata_json = await await_redis_response(rq_client.redis.json().get('request_queues:test_request_queue:metadata')) + assert isinstance(metadata_json, dict) + assert metadata_json['total_request_count'] == 1 # type: ignore[unreachable] # py-json typing is broken + + +async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None: + """Test that requests persist correctly when reopening the same RQ.""" + storage_client = RedisStorageClient(redis=redis_client) + + # Create RQ and add requests + original_client = await storage_client.create_rq_client( + name='persistence-test', + ) + + test_requests = [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + ] + await original_client.add_batch_of_requests(test_requests) + + rq_id = (await original_client.get_metadata()).id + + # Reopen by ID and verify requests persist + reopened_client = await storage_client.create_rq_client( + id=rq_id, + ) + + metadata = await reopened_client.get_metadata() + assert metadata.total_request_count == 2 + + # Fetch requests to verify they're still there + request1 = await reopened_client.fetch_next_request() + request2 = await reopened_client.fetch_next_request() + + assert request1 is not None + assert request2 is not None + assert {request1.url, request2.url} == {'https://example.com/1', 'https://example.com/2'} + + await reopened_client.drop() + + +async def test_get_request(rq_client: RedisRequestQueueClient) -> None: + """Test that get_request works correctly.""" + requests = [ + Request.from_url('https://example.com/1'), + Request.from_url('https://example.com/2'), + Request.from_url('https://example.com/3'), + ] + + added_requests = await rq_client.add_batch_of_requests(requests) + assert len(added_requests.processed_requests) == 3 + + for req in requests: + fetched_request = await rq_client.get_request(req.unique_key) + assert fetched_request is not None + assert fetched_request.unique_key == req.unique_key + assert fetched_request.url == req.url + + # Test fetching a non-existent request + non_existent = await rq_client.get_request('non-existent-id') + assert non_existent is None diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index 2107a0a1c6..0f7f3ad796 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pytest @@ -18,7 +18,6 @@ from typing import Any from fakeredis import FakeAsyncRedis - from redis.asyncio import Redis from crawlee.storage_clients import StorageClient @@ -30,7 +29,7 @@ def 
storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) return MemoryStorageClient() if request.param == 'redis': - return RedisStorageClient(redis=cast('Redis', redis_client)) + return RedisStorageClient(redis=redis_client) return FileSystemStorageClient() diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 42030dade5..cfde51acc4 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -4,7 +4,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pytest @@ -18,7 +18,6 @@ from pathlib import Path from fakeredis import FakeAsyncRedis - from redis.asyncio import Redis from crawlee.storage_clients import StorageClient @@ -30,7 +29,7 @@ def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) return MemoryStorageClient() if request.param == 'redis': - return RedisStorageClient(redis=cast('Redis', redis_client)) + return RedisStorageClient(redis=redis_client) return FileSystemStorageClient() diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index ae6cbead32..5624536c9c 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -4,7 +4,7 @@ from __future__ import annotations import asyncio -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pytest @@ -18,7 +18,6 @@ from pathlib import Path from fakeredis import FakeAsyncRedis - from redis.asyncio import Redis from crawlee.storage_clients import StorageClient @@ -30,7 +29,7 @@ def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) return MemoryStorageClient() if request.param == 'redis': - return RedisStorageClient(redis=cast('Redis', redis_client)) + return RedisStorageClient(redis=redis_client) return FileSystemStorageClient() From 7c84ed1cfcc2c8bf99075b8762708457a1864d51 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 15 Sep 2025 18:41:49 +0000 Subject: [PATCH 09/12] suppress warnings --- src/crawlee/storage_clients/_redis/_client_mixin.py | 8 ++++---- .../storage_clients/_redis/_key_value_store_client.py | 6 ------ .../storage_clients/_redis/test_redis_dataset_client.py | 1 + .../unit/storage_clients/_redis/test_redis_kvs_client.py | 1 + tests/unit/storage_clients/_redis/test_redis_rq_client.py | 1 + tests/unit/storages/test_dataset.py | 6 +++++- tests/unit/storages/test_key_value_store.py | 6 +++++- tests/unit/storages/test_request_queue.py | 6 +++++- 8 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py index 5401db4c2b..aa8973740d 100644 --- a/src/crawlee/storage_clients/_redis/_client_mixin.py +++ b/src/crawlee/storage_clients/_redis/_client_mixin.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -from abc import ABC, abstractmethod from contextlib import asynccontextmanager from datetime import datetime, timezone from logging import getLogger @@ -33,7 +32,7 @@ class MetadataUpdateParams(TypedDict, total=False): update_modified_at: NotRequired[bool] -class RedisClientMixin(ABC): +class RedisClientMixin: """Mixin class for Redis clients. This mixin provides common Redis operations and basic methods for Redis storage clients. 
@@ -187,7 +186,7 @@ async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pip async def _create_storage(self, pipeline: Pipeline) -> None: """Create the actual storage structure in Redis.""" - _pipeline = pipeline # To avoid unused variable mypy error + _ = pipeline # To avoid unused variable mypy error async def _create_script(self, script_name: str) -> AsyncScript: """Load a Lua script from a file and return a Script object.""" script_path = Path(__file__).parent / 'lua_scripts' / script_name script_content = await asyncio.to_thread(read_lua_script, script_path) return self._redis.register_script(script_content) @@ -254,7 +253,6 @@ async def _get_metadata( return metadata_model.model_validate(metadata_dict) - @abstractmethod async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None: """Pipeline operations for storage-specific metadata updates. @@ -264,6 +262,8 @@ async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> pipeline: The Redis pipeline to use for the update. **kwargs: Storage-specific update parameters. """ + _ = pipeline # To avoid unused variable mypy error + _ = kwargs async def _update_metadata( self, diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index 47e2b84a95..2db54ba743 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -17,7 +17,6 @@ from collections.abc import AsyncIterator from redis.asyncio import Redis - from redis.asyncio.client import Pipeline logger = getLogger(__name__) @@ -262,8 +261,3 @@ async def record_exists(self, *, key: str) -> bool: results = await pipe.execute() return bool(results[0]) - - @override - async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None: - # No specific fields to update for Redis key-value stores. 
- return diff --git a/tests/unit/storage_clients/_redis/test_redis_dataset_client.py b/tests/unit/storage_clients/_redis/test_redis_dataset_client.py index 010e868074..5ca17ad610 100644 --- a/tests/unit/storage_clients/_redis/test_redis_dataset_client.py +++ b/tests/unit/storage_clients/_redis/test_redis_dataset_client.py @@ -152,6 +152,7 @@ async def test_metadata_record_updates(dataset_client: RedisDatasetClient) -> No assert metadata_json['item_count'] == 1 # type: ignore[unreachable] # py-json typing is broken +@pytest.mark.usefixtures('suppress_user_warning') async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None: """Test that data persists correctly when reopening the same dataset.""" storage_client = RedisStorageClient(redis=redis_client) diff --git a/tests/unit/storage_clients/_redis/test_redis_kvs_client.py b/tests/unit/storage_clients/_redis/test_redis_kvs_client.py index 0b083a38d0..92948bb181 100644 --- a/tests/unit/storage_clients/_redis/test_redis_kvs_client.py +++ b/tests/unit/storage_clients/_redis/test_redis_kvs_client.py @@ -217,6 +217,7 @@ async def test_metadata_record_updates(kvs_client: RedisKeyValueStoreClient) -> assert metadata.accessed_at > accessed_after_read +@pytest.mark.usefixtures('suppress_user_warning') async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None: """Test that data persists correctly when reopening the same KVS.""" storage_client = RedisStorageClient(redis=redis_client) diff --git a/tests/unit/storage_clients/_redis/test_redis_rq_client.py b/tests/unit/storage_clients/_redis/test_redis_rq_client.py index 1233ee76a3..bbb4773dc2 100644 --- a/tests/unit/storage_clients/_redis/test_redis_rq_client.py +++ b/tests/unit/storage_clients/_redis/test_redis_rq_client.py @@ -186,6 +186,7 @@ async def test_metadata_file_updates(rq_client: RedisRequestQueueClient) -> None assert metadata_json['total_request_count'] == 1 # type: ignore[unreachable] # py-json typing is broken +@pytest.mark.usefixtures('suppress_user_warning') async def test_data_persistence_across_reopens(redis_client: FakeAsyncRedis) -> None: """Test that requests persist correctly when reopening the same RQ.""" storage_client = RedisStorageClient(redis=redis_client) diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index 0f7f3ad796..77c6dfc4ed 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -23,7 +23,11 @@ @pytest.fixture(params=['memory', 'file_system', 'redis']) -def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) -> StorageClient: +def storage_client( + request: pytest.FixtureRequest, + redis_client: FakeAsyncRedis, + suppress_user_warning: None, # noqa: ARG001 +) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index cfde51acc4..a5602bd9ec 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -23,7 +23,11 @@ @pytest.fixture(params=['memory', 'file_system', 'redis']) -def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) -> StorageClient: +def storage_client( + request: pytest.FixtureRequest, + redis_client: FakeAsyncRedis, + suppress_user_warning: None, # noqa: ARG001 +) -> StorageClient: """Parameterized fixture to test with different storage 
clients.""" if request.param == 'memory': return MemoryStorageClient() diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 5624536c9c..8c5ba5bb73 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -23,7 +23,11 @@ @pytest.fixture(params=['memory', 'file_system', 'redis']) -def storage_client(request: pytest.FixtureRequest, redis_client: FakeAsyncRedis) -> StorageClient: +def storage_client( + request: pytest.FixtureRequest, + redis_client: FakeAsyncRedis, + suppress_user_warning: None, # noqa: ARG001 +) -> StorageClient: """Parameterized fixture to test with different storage clients.""" if request.param == 'memory': return MemoryStorageClient() From 122c923e8205ed3c6d91ceb9fad9a58c5eb14e98 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 28 Sep 2025 18:36:11 +0000 Subject: [PATCH 10/12] add default dedup strategy --- .../storage_clients/_redis/_client_mixin.py | 8 +- .../storage_clients/_redis/_dataset_client.py | 1 + .../_redis/_key_value_store_client.py | 1 + .../_redis/_request_queue_client.py | 108 +++++++++++++----- .../storage_clients/_redis/_storage_client.py | 9 ++ ...ests.lua => atomic_bloom_add_requests.lua} | 0 .../lua_scripts/atomic_set_add_requests.lua | 37 ++++++ .../_redis/test_redis_rq_client.py | 57 ++++++--- 8 files changed, 169 insertions(+), 52 deletions(-) rename src/crawlee/storage_clients/_redis/lua_scripts/{atomic_add_requests.lua => atomic_bloom_add_requests.lua} (100%) create mode 100644 src/crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py index aa8973740d..ba7fbc51d3 100644 --- a/src/crawlee/storage_clients/_redis/_client_mixin.py +++ b/src/crawlee/storage_clients/_redis/_client_mixin.py @@ -110,6 +110,7 @@ async def _open( metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata], redis: Redis, extra_metadata_fields: dict[str, Any], + instance_kwargs: dict[str, Any], ) -> Self: """Open or create a new Redis storage client. @@ -120,6 +121,7 @@ async def _open( redis: Redis client instance. metadata_model: Pydantic model for metadata validation. extra_metadata_fields: Storage-specific metadata fields. + instance_kwargs: Additional arguments for the client constructor. Returns: An instance for the opened or created storage client. @@ -138,7 +140,7 @@ async def _open( storage_id = metadata_data['id'] if metadata_data is not None else None # If both storage_name and storage_id are found, open existing storage if storage_name and storage_id: - client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis) + client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis, **instance_kwargs) async with client._get_pipeline() as pipe: await client._update_metadata(pipe, update_accessed_at=True) # Otherwise, create a new storage @@ -152,12 +154,12 @@ async def _open( modified_at=now, **extra_metadata_fields, ) - client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis) + client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs) created = await client._create_metadata_and_storage(internal_name, metadata.model_dump()) # The client was probably not created due to a race condition. Let's try to open it using the name. 
if not created: metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True) - client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis) + client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs) # Ensure Lua scripts are loaded await client._ensure_scripts_loaded() diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 1cf2260ee2..f0d2295e15 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -99,6 +99,7 @@ async def open( redis=redis, metadata_model=DatasetMetadata, extra_metadata_fields={'item_count': 0}, + instance_kwargs={}, ) @override diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index 2db54ba743..99f9665ea7 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -97,6 +97,7 @@ async def open( redis=redis, metadata_model=KeyValueStoreMetadata, extra_metadata_fields={}, + instance_kwargs={}, ) @override diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py index 941f700009..be69749555 100644 --- a/src/crawlee/storage_clients/_redis/_request_queue_client.py +++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py @@ -4,7 +4,7 @@ from collections import deque from datetime import datetime, timedelta, timezone from logging import getLogger -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal from typing_extensions import NotRequired, override @@ -81,13 +81,22 @@ class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin): _RECLAIM_INTERVAL = timedelta(seconds=30) """Interval to check for stale requests to reclaim.""" - def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: + def __init__( + self, + storage_name: str, + storage_id: str, + redis: Redis, + dedup_strategy: Literal['default', 'bloom'] = 'default', + ) -> None: """Initialize a new instance. Preferably use the `RedisRequestQueueClient.open` class method to create a new instance. 
""" super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis) + self._dedup_strategy = dedup_strategy + """Deduplication strategy for the queue.""" + self._pending_fetch_cache: deque[Request] = deque() """Cache for requests: ordered by sequence number.""" @@ -104,13 +113,31 @@ def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None: @property def _added_filter_key(self) -> str: """Return the Redis key for the added requests Bloom filter.""" + if self._dedup_strategy != 'bloom': + raise RuntimeError('The added requests filter is only available with the bloom deduplication strategy.') return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter' @property def _handled_filter_key(self) -> str: """Return the Redis key for the handled requests Bloom filter.""" + if self._dedup_strategy != 'bloom': + raise RuntimeError('The handled requests filter is only available with the bloom deduplication strategy.') return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter' + @property + def _pending_set_key(self) -> str: + """Return the Redis key for the pending requests set.""" + if self._dedup_strategy != 'default': + raise RuntimeError('The pending requests set is only available with the default deduplication strategy.') + return f'{self._MAIN_KEY}:{self._storage_name}:pending_set' + + @property + def _handled_set_key(self) -> str: + """Return the Redis key for the handled requests set.""" + if self._dedup_strategy != 'default': + raise RuntimeError('The handled requests set is only available with the default deduplication strategy.') + return f'{self._MAIN_KEY}:{self._storage_name}:handled_set' + @property def _queue_key(self) -> str: """Return the Redis key for the request queue.""" @@ -134,6 +161,7 @@ async def open( name: str | None, alias: str | None, redis: Redis, + dedup_strategy: Literal['default', 'bloom'] = 'default', ) -> RedisRequestQueueClient: """Open or create a new Redis request queue client. @@ -146,6 +174,10 @@ async def open( name: The name of the dataset for named (global scope) storages. alias: The alias of the dataset for unnamed (run scope) storages. redis: Redis client instance. + dedup_strategy: Strategy for request queue deduplication. Options are: + - 'default': Uses Redis sets for exact deduplication. + - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using + this approach, there is a possibility 1e-7 that requests will be skipped in the queue. Returns: An instance for the opened or created storage client. 
@@ -162,6 +194,7 @@ async def open( 'pending_request_count': 0, 'total_request_count': 0, }, + instance_kwargs={'dedup_strategy': dedup_strategy}, ) @override @@ -170,26 +203,26 @@ async def get_metadata(self) -> RequestQueueMetadata: @override async def drop(self) -> None: - await self._drop( - extra_keys=[ - self._added_filter_key, - self._handled_filter_key, - self._queue_key, - self._data_key, - self._in_progress_key, - ] - ) + if self._dedup_strategy == 'bloom': + extra_keys = [self._added_filter_key, self._handled_filter_key] + elif self._dedup_strategy == 'default': + extra_keys = [self._pending_set_key, self._handled_set_key] + else: + raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}') + extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key]) + await self._drop(extra_keys=extra_keys) @override async def purge(self) -> None: + if self._dedup_strategy == 'bloom': + extra_keys = [self._added_filter_key, self._handled_filter_key] + elif self._dedup_strategy == 'default': + extra_keys = [self._pending_set_key, self._handled_set_key] + else: + raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}') + extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key]) await self._purge( - extra_keys=[ - self._added_filter_key, - self._handled_filter_key, - self._queue_key, - self._data_key, - self._in_progress_key, - ], + extra_keys=extra_keys, metadata_kwargs=_QueueMetadataUpdateParams( update_accessed_at=True, update_modified_at=True, @@ -217,13 +250,17 @@ async def add_batch_of_requests( unique_keys = list(requests_by_unique_key.keys()) # Check which requests are already added or handled async with self._get_pipeline(with_execute=False) as pipe: - await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys)) # type: ignore[no-untyped-call] - await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + if self._dedup_strategy == 'default': + await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys)) + await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys)) + elif self._dedup_strategy == 'bloom': + await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys)) # type: ignore[no-untyped-call] - results = await pipe.execute() + pipe_results = await pipe.execute() - added_flags = results[0] - handled_flags = results[1] + added_pending_flags = pipe_results[0] + handled_flags = pipe_results[1] new_unique_keys = [] new_request_data = {} @@ -243,7 +280,7 @@ async def add_batch_of_requests( continue # Already in queue - skip - if added_flags[i]: + if added_pending_flags[i]: processed_requests.append( ProcessedRequest( unique_key=unique_key, @@ -262,7 +299,11 @@ async def add_batch_of_requests( if new_unique_keys: # Add new requests to the queue atomically, get back which were actually added script_results = await self._add_requests_script( - keys=[self._added_filter_key, self._queue_key, self._data_key], + keys=[ + self._added_filter_key if self._dedup_strategy == 'bloom' else self._pending_set_key, + self._queue_key, + self._data_key, + ], args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)], ) actually_added = set(json.loads(script_results)) @@ -345,14 +386,17 @@ async def get_request(self, unique_key: str) -> Request | None: 
@override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: # Check if the request is in progress. check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key)) if not check_in_progress: logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.') return None async with self._get_pipeline() as pipe: - await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key)) # type: ignore[no-untyped-call] + if self._dedup_strategy == 'default': + await await_redis_response(pipe.sadd(self._handled_set_key, request.unique_key)) + await await_redis_response(pipe.srem(self._pending_set_key, request.unique_key)) + elif self._dedup_strategy == 'bloom': + await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key)) # type: ignore[no-untyped-call] await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key)) await await_redis_response(pipe.hdel(self._data_key, request.unique_key)) @@ -439,13 +483,17 @@ async def _load_scripts(self) -> None: """Ensure Lua scripts are loaded in Redis.""" self._fetch_script = await self._create_script('atomic_fetch_request.lua') self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua') - self._add_requests_script = await self._create_script('atomic_add_requests.lua') + if self._dedup_strategy == 'bloom': + self._add_requests_script = await self._create_script('atomic_bloom_add_requests.lua') + elif self._dedup_strategy == 'default': + self._add_requests_script = await self._create_script('atomic_set_add_requests.lua') @override async def _create_storage(self, pipeline: Pipeline) -> None: # Create Bloom filters for added and handled requests - await await_redis_response(pipeline.bf().create(self._added_filter_key, 1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] - await await_redis_response(pipeline.bf().create(self._handled_filter_key, 1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] + if self._dedup_strategy == 'bloom': + await await_redis_response(pipeline.bf().create(self._added_filter_key, 1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] + await await_redis_response(pipeline.bf().create(self._handled_filter_key, 1e-7, 100000, expansion=10)) # type: ignore[no-untyped-call] async def _reclaim_stale_requests(self) -> None: """Reclaim requests that have been in progress for too long.""" diff --git a/src/crawlee/storage_clients/_redis/_storage_client.py b/src/crawlee/storage_clients/_redis/_storage_client.py index aed41779b1..039a3890c5 100644 --- a/src/crawlee/storage_clients/_redis/_storage_client.py +++ b/src/crawlee/storage_clients/_redis/_storage_client.py @@ -1,6 +1,7 @@ from __future__ import annotations import warnings +from typing import Literal from redis.asyncio import Redis from typing_extensions import override @@ -40,6 +41,7 @@ def __init__( self, *, connection_string: str | None = None, redis: Redis | None = None, + queue_dedup_strategy: Literal['default', 'bloom'] = 'default', ) -> None: """Initialize the Redis storage client. Args: connection_string: Redis connection string (e.g., "redis://localhost:6379"). Supports standard Redis URL format with optional database selection. redis: Pre-configured Redis client instance. + queue_dedup_strategy: Strategy for request queue deduplication. Options are: + - 'default': Uses Redis sets for exact deduplication. 
+ - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using + this approach, there is a probability of 1e-7 that requests will be skipped in the queue. """ if redis is not None and connection_string is not None: raise ValueError('Either redis or connection_string must be provided, not both.') @@ -60,6 +66,8 @@ def __init__( elif connection_string is not None: self._redis = Redis.from_url(connection_string) + self._queue_dedup_strategy = queue_dedup_strategy + # Call the notification only once warnings.warn( 'The RedisStorageClient is experimental and may change or be removed in future releases.', @@ -125,6 +133,7 @@ async def create_rq_client( name=name, alias=alias, redis=self._redis, + dedup_strategy=self._queue_dedup_strategy, ) await self._purge_if_needed(client, configuration) diff --git a/src/crawlee/storage_clients/_redis/lua_scripts/atomic_add_requests.lua b/src/crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua similarity index 100% rename from src/crawlee/storage_clients/_redis/lua_scripts/atomic_add_requests.lua rename to src/crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua diff --git a/src/crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua b/src/crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua new file mode 100644 index 0000000000..3154432a93 --- /dev/null +++ b/src/crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua @@ -0,0 +1,37 @@ +local pending_set_key = KEYS[1] +local queue_key = KEYS[2] +local data_key = KEYS[3] + +local forefront = ARGV[1] == '1' +local unique_keys = cjson.decode(ARGV[2]) +local requests_data = cjson.decode(ARGV[3]) + +-- Add and check which unique keys are actually new using Redis set +local actually_added = {} +local hset_args = {} + +-- Process each unique key +for _, unique_key in ipairs(unique_keys) do + -- Try to add the key to the set, returns 1 if added, 0 if already existed + local set_result = redis.call('sadd', pending_set_key, unique_key) + + if set_result == 1 then + -- This key was added by us (did not exist before) + table.insert(hset_args, unique_key) + table.insert(hset_args, requests_data[unique_key]) + table.insert(actually_added, unique_key) + end +end + +-- Add only those that are actually new +if #actually_added > 0 then + redis.call('hset', data_key, unpack(hset_args)) + + if forefront then + redis.call('lpush', queue_key, unpack(actually_added)) + else + redis.call('rpush', queue_key, unpack(actually_added)) + end +end + +return cjson.encode(actually_added) diff --git a/tests/unit/storage_clients/_redis/test_redis_rq_client.py b/tests/unit/storage_clients/_redis/test_redis_rq_client.py index bbb4773dc2..f695350d65 100644 --- a/tests/unit/storage_clients/_redis/test_redis_rq_client.py +++ b/tests/unit/storage_clients/_redis/test_redis_rq_client.py @@ -18,14 +18,15 @@ from crawlee.storage_clients._redis import RedisRequestQueueClient -@pytest.fixture +@pytest.fixture(params=['default', 'bloom']) async def rq_client( redis_client: FakeAsyncRedis, + request: pytest.FixtureRequest, suppress_user_warning: None, # noqa: ARG001 ) -> AsyncGenerator[RedisRequestQueueClient, None]: """A fixture for a Redis RQ client.""" - client = await RedisStorageClient(redis=redis_client).create_rq_client( - name='test_request_queue', + client = await RedisStorageClient(redis=redis_client, queue_dedup_strategy=request.param).create_rq_client( + name='test_request_queue' ) yield client await client.drop() 
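For reference, a minimal sketch of driving the new atomic_set_add_requests.lua script above directly through redis-py's script API, the same register_script/AsyncScript mechanism that _load_scripts uses. The key names and connection URL here are illustrative assumptions, not values from the patch:

import asyncio
import json
from pathlib import Path

from redis.asyncio import Redis


async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379')  # assumed local Redis server
    script = redis.register_script(Path('atomic_set_add_requests.lua').read_text())

    unique_keys = ['https://example.com/1', 'https://example.com/2']
    data = {key: json.dumps({'url': key}) for key in unique_keys}

    # KEYS mirror the call in add_batch_of_requests: pending set, queue list, data hash.
    added = await script(
        keys=['demo:pending_set', 'demo:queue', 'demo:data'],
        args=[0, json.dumps(unique_keys), json.dumps(data)],  # 0 = not forefront
    )
    # The script returns a JSON array of the unique keys that were actually new.
    print(json.loads(added))
    await redis.aclose()


if __name__ == '__main__':
    asyncio.run(main())

Because the SADD happens inside the script, membership check and enqueue are a single atomic step, so two concurrent producers can never enqueue the same unique key twice.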
@@ -45,15 +46,16 @@ async def test_base_keys_creation(rq_client: RedisRequestQueueClient) -> None: assert rq_id is not None assert (rq_id.decode() if isinstance(rq_id, bytes) else rq_id) == metadata.id - added_bf = await await_redis_response( - rq_client.redis.bf().info('request_queues:test_request_queue:added_bloom_filter') # type: ignore[no-untyped-call] - ) - assert added_bf is not None + if rq_client._dedup_strategy == 'bloom': + added_bf = await await_redis_response( + rq_client.redis.exists('request_queues:test_request_queue:added_bloom_filter') + ) + assert added_bf == 1 - handled_bf = await await_redis_response( - rq_client.redis.bf().info('request_queues:test_request_queue:handled_bloom_filter') # type: ignore[no-untyped-call] - ) - assert handled_bf is not None + handled_bf = await await_redis_response( + rq_client.redis.exists('request_queues:test_request_queue:handled_bloom_filter') + ) + assert handled_bf == 1 metadata_data = await await_redis_response(rq_client.redis.json().get('request_queues:test_request_queue:metadata')) @@ -101,6 +103,8 @@ async def test_drop_removes_records(rq_client: RedisRequestQueueClient) -> None: rq_data = 'request_queues:test_request_queue:data' added_bf = 'request_queues:test_request_queue:added_bloom_filter' handled_bf = 'request_queues:test_request_queue:handled_bloom_filter' + pending_set = 'request_queues:test_request_queue:pending_set' + handled_set = 'request_queues:test_request_queue:handled_set' metadata_key = 'request_queues:test_request_queue:metadata' @@ -115,15 +119,23 @@ async def test_drop_removes_records(rq_client: RedisRequestQueueClient) -> None: rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue)) rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data)) - added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf)) - handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf)) metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key)) assert rq_queue_exists == 1 assert rq_data_exists == 1 - assert added_bf_exists == 1 - assert handled_bf_exists == 1 assert metadata_exists == 1 + if rq_client._dedup_strategy == 'bloom': + added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf)) + handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf)) + assert added_bf_exists == 1 + assert handled_bf_exists == 1 + elif rq_client._dedup_strategy == 'default': + pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set)) + handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set)) + assert pending_set_exists == 1 + # No requests marked as handled + assert handled_set_exists == 0 + # Drop the request queue await rq_client.drop() @@ -134,17 +146,24 @@ async def test_drop_removes_records(rq_client: RedisRequestQueueClient) -> None: ) rq_queue_exists = await await_redis_response(rq_client.redis.exists(rq_queue)) rq_data_exists = await await_redis_response(rq_client.redis.exists(rq_data)) - added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf)) - handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf)) metadata_exists = await await_redis_response(rq_client.redis.exists(metadata_key)) assert name_after_drop is None assert rq_id_after_drop is None assert rq_queue_exists == 0 assert rq_data_exists == 0 - assert added_bf_exists == 0 - assert 
     assert metadata_exists == 0
 
+    if rq_client._dedup_strategy == 'bloom':
+        added_bf_exists = await await_redis_response(rq_client.redis.exists(added_bf))
+        handled_bf_exists = await await_redis_response(rq_client.redis.exists(handled_bf))
+        assert added_bf_exists == 0
+        assert handled_bf_exists == 0
+    elif rq_client._dedup_strategy == 'default':
+        pending_set_exists = await await_redis_response(rq_client.redis.exists(pending_set))
+        handled_set_exists = await await_redis_response(rq_client.redis.exists(handled_set))
+        assert pending_set_exists == 0
+        assert handled_set_exists == 0
+
 
 async def test_metadata_file_updates(rq_client: RedisRequestQueueClient) -> None:
     """Test that metadata file is updated correctly after operations."""

From ec343869dc2b66f7d27f77377c045c1618e0fd5f Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Sun, 28 Sep 2025 18:56:08 +0000
Subject: [PATCH 11/12] up tests

---
 tests/unit/storages/conftest.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/unit/storages/conftest.py b/tests/unit/storages/conftest.py
index cc54ef2a49..520522028c 100644
--- a/tests/unit/storages/conftest.py
+++ b/tests/unit/storages/conftest.py
@@ -17,7 +17,7 @@
     from fakeredis import FakeAsyncRedis
 
 
-@pytest.fixture(params=['memory', 'file_system', 'sql', 'redis'])
+@pytest.fixture(params=['memory', 'file_system', 'sql', ('redis', 'default'), ('redis', 'bloom')])
 def storage_client(
     request: pytest.FixtureRequest,
     redis_client: FakeAsyncRedis,
@@ -25,12 +25,19 @@ def storage_client(
 ) -> StorageClient:
     """Parameterized fixture to test with different storage clients."""
     storage_client: StorageClient
-    if request.param == 'memory':
+
+    if isinstance(request.param, tuple):
+        storage_type, storage_config = request.param
+    else:
+        storage_type = request.param
+        storage_config = None
+
+    if storage_type == 'memory':
         storage_client = MemoryStorageClient()
-    elif request.param == 'sql':
+    elif storage_type == 'sql':
         storage_client = SqlStorageClient()
-    elif request.param == 'redis':
-        storage_client = RedisStorageClient(redis=redis_client)
+    elif storage_type == 'redis' and storage_config in ('default', 'bloom'):
+        storage_client = RedisStorageClient(redis=redis_client, queue_dedup_strategy=storage_config)
     else:
         storage_client = FileSystemStorageClient()
     service_locator.set_storage_client(storage_client)

From c1dda54456e9d545cf0d12db6a967503c49ed36e Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Sun, 28 Sep 2025 19:07:48 +0000
Subject: [PATCH 12/12] up docs

---
 docs/guides/storage_clients.mdx | 8 +++++---
 .../storage_clients/_redis/_request_queue_client.py | 7 +++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/docs/guides/storage_clients.mdx b/docs/guides/storage_clients.mdx
index a632250416..114ea43e1e 100644
--- a/docs/guides/storage_clients.mdx
+++ b/docs/guides/storage_clients.mdx
@@ -319,7 +319,7 @@ For advanced scenarios, you can configure `
 
 {SQLStorageClientConfigurationExample}
 
-## Redis storage client
+### Redis storage client
 
 :::warning Experimental feature
 The `RedisStorageClient` is experimental. Its API and behavior may change in future releases.
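To make the documented option concrete, here is a minimal sketch, not part of the patch, of selecting a dedup strategy when registering the client, mirroring the conftest fixture above; the connection string is a placeholder, and the top-level `service_locator` import is assumed from crawlee's public API.

```python
from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient

# 'default' keeps exact, set-based deduplication; 'bloom' trades an
# approximately 1e-7 false-positive rate for lower memory usage.
storage_client = RedisStorageClient(
    connection_string='redis://localhost:6379',  # placeholder URL
    queue_dedup_strategy='bloom',
)
service_locator.set_storage_client(storage_client)
```

Registering the client this way makes every storage opened afterwards (datasets, key-value stores, request queues) use Redis, which is exactly what the parameterized test fixture above relies on.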
@@ -448,8 +448,10 @@ class Request_Queue_Keys{ request_queues:[name]:queue - List request_queues:[name]:data - Hash request_queues:[name]:in_progress - Hash - request_queues:[name]:added_bloom_filter - Bloom Filter - request_queues:[name]:handled_bloom_filter - Bloom Filter + request_queues:[name]:added_bloom_filter - Bloom Filter | bloom queue_dedup_strategy + request_queues:[name]:handled_bloom_filter - Bloom Filter | bloom queue_dedup_strategy + request_queues:[name]:pending_set - Set | default queue_dedup_strategy + request_queues:[name]:handled_set - Set | default queue_dedup_strategy request_queues:[name]:metadata - JSON Object } diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py index be69749555..85f8091aea 100644 --- a/src/crawlee/storage_clients/_redis/_request_queue_client.py +++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py @@ -54,8 +54,11 @@ class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin): - `request_queues:{name}:queue` - Redis list for FIFO request ordering - `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key - `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed - - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication - - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking + - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy) + - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom` + dedup_strategy) + - `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy) + - `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy) - `request_queues:{name}:metadata` - Redis JSON object containing queue metadata Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list
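As a rough companion to the key layout above, the following sketch inspects the per-queue keys on a live instance; the queue name `default` and the connection URL are assumptions, and under the `bloom` strategy the `pending_set`/`handled_set` keys are replaced by the two bloom-filter keys.

```python
import asyncio

from redis.asyncio import Redis


async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379')  # assumed local instance
    prefix = 'request_queues:default'  # hypothetical queue name
    for suffix in ('queue', 'data', 'in_progress', 'pending_set', 'handled_set', 'metadata'):
        key = f'{prefix}:{suffix}'
        # TYPE reports list/hash/set for the plain keys, a RedisJSON type for
        # metadata, and 'none' for keys absent under the current strategy.
        key_type = await redis.type(key)
        print(key, key_type.decode() if isinstance(key_type, bytes) else key_type)
    await redis.aclose()


asyncio.run(main())
```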