From 5c437c9a67f041960e4b80cb5ff2eef66bc845d3 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Mon, 28 Apr 2025 14:04:57 +0200
Subject: [PATCH 01/44] Rm old Apify storage clients

---
 src/apify/apify_storage_client/__init__.py    |   3 -
 .../_apify_storage_client.py                  |  72 -------
 .../apify_storage_client/_dataset_client.py   | 190 ------------------
 .../_dataset_collection_client.py             |  51 -----
 .../_key_value_store_client.py                | 109 ----------
 .../_key_value_store_collection_client.py     |  51 -----
 .../_request_queue_client.py                  | 176 ----------------
 .../_request_queue_collection_client.py       |  51 -----
 src/apify/apify_storage_client/py.typed       |   0
 9 files changed, 703 deletions(-)
 delete mode 100644 src/apify/apify_storage_client/__init__.py
 delete mode 100644 src/apify/apify_storage_client/_apify_storage_client.py
 delete mode 100644 src/apify/apify_storage_client/_dataset_client.py
 delete mode 100644 src/apify/apify_storage_client/_dataset_collection_client.py
 delete mode 100644 src/apify/apify_storage_client/_key_value_store_client.py
 delete mode 100644 src/apify/apify_storage_client/_key_value_store_collection_client.py
 delete mode 100644 src/apify/apify_storage_client/_request_queue_client.py
 delete mode 100644 src/apify/apify_storage_client/_request_queue_collection_client.py
 delete mode 100644 src/apify/apify_storage_client/py.typed

diff --git a/src/apify/apify_storage_client/__init__.py b/src/apify/apify_storage_client/__init__.py
deleted file mode 100644
index 8b6d517c..00000000
--- a/src/apify/apify_storage_client/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from apify.apify_storage_client._apify_storage_client import ApifyStorageClient
-
-__all__ = ['ApifyStorageClient']
diff --git a/src/apify/apify_storage_client/_apify_storage_client.py b/src/apify/apify_storage_client/_apify_storage_client.py
deleted file mode 100644
index 51e3fc24..00000000
--- a/src/apify/apify_storage_client/_apify_storage_client.py
+++ /dev/null
@@ -1,72 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from typing_extensions import override
-
-from apify_client import ApifyClientAsync
-from crawlee._utils.crypto import crypto_random_object_id
-from crawlee.storage_clients import StorageClient
-
-from apify._utils import docs_group
-from apify.apify_storage_client._dataset_client import DatasetClient
-from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
-from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient
-from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient
-from apify.apify_storage_client._request_queue_client import RequestQueueClient
-from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
-
-if TYPE_CHECKING:
-    from apify._configuration import Configuration
-
-
-@docs_group('Classes')
-class ApifyStorageClient(StorageClient):
-    """A storage client implementation based on the Apify platform storage."""
-
-    def __init__(self, *, configuration: Configuration) -> None:
-        self._client_key = crypto_random_object_id()
-        self._apify_client = ApifyClientAsync(
-            token=configuration.token,
-            api_url=configuration.api_base_url,
-            max_retries=8,
-            min_delay_between_retries_millis=500,
-            timeout_secs=360,
-        )
-        self._configuration = configuration
-
-    @classmethod
-    def from_config(cls, config: Configuration) -> ApifyStorageClient:
-        return cls(configuration=config)
-
-    @override
-    def dataset(self, id: str) -> DatasetClient:
-        return
DatasetClient(self._apify_client.dataset(id)) - - @override - def datasets(self) -> DatasetCollectionClient: - return DatasetCollectionClient(self._apify_client.datasets()) - - @override - def key_value_store(self, id: str) -> KeyValueStoreClient: - return KeyValueStoreClient(self._apify_client.key_value_store(id), self._configuration.api_public_base_url) - - @override - def key_value_stores(self) -> KeyValueStoreCollectionClient: - return KeyValueStoreCollectionClient(self._apify_client.key_value_stores()) - - @override - def request_queue(self, id: str) -> RequestQueueClient: - return RequestQueueClient(self._apify_client.request_queue(id, client_key=self._client_key)) - - @override - def request_queues(self) -> RequestQueueCollectionClient: - return RequestQueueCollectionClient(self._apify_client.request_queues()) - - @override - async def purge_on_start(self) -> None: - pass - - @override - def get_rate_limit_errors(self) -> dict[int, int]: - return self._apify_client.stats.rate_limit_errors diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py deleted file mode 100644 index 93c8d575..00000000 --- a/src/apify/apify_storage_client/_dataset_client.py +++ /dev/null @@ -1,190 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import DatasetClient as BaseDatasetClient -from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata - -if TYPE_CHECKING: - from collections.abc import AsyncIterator - from contextlib import AbstractAsyncContextManager - - from httpx import Response - - from apify_client.clients import DatasetClientAsync - from crawlee._types import JsonSerializable - - -class DatasetClient(BaseDatasetClient): - """Dataset resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_dataset_client: DatasetClientAsync) -> None: - self._client = apify_dataset_client - - @override - async def get(self) -> DatasetMetadata | None: - result = await self._client.get() - return DatasetMetadata.model_validate(result) if result else None - - @override - async def update( - self, - *, - name: str | None = None, - ) -> DatasetMetadata: - return DatasetMetadata.model_validate( - await self._client.update( - name=name, - ) - ) - - @override - async def delete(self) -> None: - await self._client.delete() - - @override - async def list_items( - self, - *, - offset: int | None = 0, - limit: int | None = BaseDatasetClient._LIST_ITEMS_LIMIT, # noqa: SLF001 - clean: bool = False, - desc: bool = False, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_hidden: bool = False, - flatten: list[str] | None = None, - view: str | None = None, - ) -> DatasetItemsListPage: - return DatasetItemsListPage.model_validate( - vars( - await self._client.list_items( - offset=offset, - limit=limit, - clean=clean, - desc=desc, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - flatten=flatten, - view=view, - ) - ) - ) - - @override - async def iterate_items( - self, - *, - offset: int = 0, - limit: int | None = None, - clean: bool = False, - desc: bool = False, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_hidden: bool = False, - ) -> AsyncIterator[dict]: - async for item in 
self._client.iterate_items( - offset=offset, - limit=limit, - clean=clean, - desc=desc, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - ): - yield item - - @override - async def get_items_as_bytes( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - flatten: list[str] | None = None, - ) -> bytes: - return await self._client.get_items_as_bytes( - item_format=item_format, - offset=offset, - limit=limit, - desc=desc, - clean=clean, - bom=bom, - delimiter=delimiter, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_header_row=skip_header_row, - skip_hidden=skip_hidden, - xml_root=xml_root, - xml_row=xml_row, - flatten=flatten, - ) - - @override - async def stream_items( - self, - *, - item_format: str = 'json', - offset: int | None = None, - limit: int | None = None, - desc: bool = False, - clean: bool = False, - bom: bool = False, - delimiter: str | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool = False, - skip_header_row: bool = False, - skip_hidden: bool = False, - xml_root: str | None = None, - xml_row: str | None = None, - ) -> AbstractAsyncContextManager[Response | None]: - return self._client.stream_items( - item_format=item_format, - offset=offset, - limit=limit, - desc=desc, - clean=clean, - bom=bom, - delimiter=delimiter, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_header_row=skip_header_row, - skip_hidden=skip_hidden, - xml_root=xml_root, - xml_row=xml_row, - ) - - @override - async def push_items(self, items: JsonSerializable) -> None: - await self._client.push_items( - items=items, - ) diff --git a/src/apify/apify_storage_client/_dataset_collection_client.py b/src/apify/apify_storage_client/_dataset_collection_client.py deleted file mode 100644 index f8ffc3e8..00000000 --- a/src/apify/apify_storage_client/_dataset_collection_client.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import DatasetCollectionClient as BaseDatasetCollectionClient -from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata - -if TYPE_CHECKING: - from apify_client.clients import DatasetCollectionClientAsync - - -class DatasetCollectionClient(BaseDatasetCollectionClient): - """Dataset collection resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_dataset_collection_client: DatasetCollectionClientAsync) -> None: - self._client = apify_dataset_collection_client - - @override - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> DatasetMetadata: - return DatasetMetadata.model_validate( - await self._client.get_or_create( - name=id if id is not None else name, - schema=schema, - ) - ) - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> DatasetListPage: - 
return DatasetListPage.model_validate( - await self._client.list( - unnamed=unnamed, - limit=limit, - offset=offset, - desc=desc, - ) - ) diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py deleted file mode 100644 index 49883b3f..00000000 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import annotations - -from contextlib import asynccontextmanager -from typing import TYPE_CHECKING, Any - -from typing_extensions import override -from yarl import URL - -from crawlee.storage_clients._base import KeyValueStoreClient as BaseKeyValueStoreClient -from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord - -from apify._crypto import create_hmac_signature - -if TYPE_CHECKING: - from collections.abc import AsyncIterator - from contextlib import AbstractAsyncContextManager - - from httpx import Response - - from apify_client.clients import KeyValueStoreClientAsync - - -class KeyValueStoreClient(BaseKeyValueStoreClient): - """Key-value store resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_key_value_store_client: KeyValueStoreClientAsync, api_public_base_url: str) -> None: - self._client = apify_key_value_store_client - self._api_public_base_url = api_public_base_url - - @override - async def get(self) -> KeyValueStoreMetadata | None: - result = await self._client.get() - return KeyValueStoreMetadata.model_validate(result) if result else None - - @override - async def update( - self, - *, - name: str | None = None, - ) -> KeyValueStoreMetadata: - return KeyValueStoreMetadata.model_validate(await self._client.update()) - - @override - async def delete(self) -> None: - await self._client.delete() - - @override - async def list_keys( - self, - *, - limit: int = 1000, - exclusive_start_key: str | None = None, - ) -> KeyValueStoreListKeysPage: - return KeyValueStoreListKeysPage.model_validate(await self._client.list_keys()) - - @override - async def get_record(self, key: str) -> KeyValueStoreRecord | None: - result = await self._client.get_record(key) - return KeyValueStoreRecord.model_validate(result) if result else None - - @override - async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord | None: - result = await self._client.get_record_as_bytes(key) - return KeyValueStoreRecord.model_validate(result) if result else None - - @override - async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]: - return self._stream_record_internal(key) - - @asynccontextmanager - async def _stream_record_internal(self, key: str) -> AsyncIterator[KeyValueStoreRecord[Response] | None]: - async with self._client.stream_record(key) as response: - yield KeyValueStoreRecord.model_validate(response) - - @override - async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None: - await self._client.set_record( - key=key, - value=value, - content_type=content_type, - ) - - @override - async def delete_record(self, key: str) -> None: - await self._client.delete_record( - key=key, - ) - - async def get_public_url(self, key: str) -> str: - """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. - - Args: - key: The key for which the URL should be generated. 
- """ - if self._client.resource_id is None: - raise ValueError('resource_id cannot be None when generating a public URL') - - public_url = ( - URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._client.resource_id / 'records' / key - ) - - key_value_store = await self.get() - - if key_value_store is not None and isinstance(key_value_store.model_extra, dict): - url_signing_secret_key = key_value_store.model_extra.get('urlSigningSecretKey') - if url_signing_secret_key: - public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) - - return str(public_url) diff --git a/src/apify/apify_storage_client/_key_value_store_collection_client.py b/src/apify/apify_storage_client/_key_value_store_collection_client.py deleted file mode 100644 index 0d4caca7..00000000 --- a/src/apify/apify_storage_client/_key_value_store_collection_client.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import KeyValueStoreCollectionClient as BaseKeyValueStoreCollectionClient -from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata - -if TYPE_CHECKING: - from apify_client.clients import KeyValueStoreCollectionClientAsync - - -class KeyValueStoreCollectionClient(BaseKeyValueStoreCollectionClient): - """Key-value store collection resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_dataset_collection_client: KeyValueStoreCollectionClientAsync) -> None: - self._client = apify_dataset_collection_client - - @override - async def get_or_create( - self, - *, - id: str | None = None, - name: str | None = None, - schema: dict | None = None, - ) -> KeyValueStoreMetadata: - return KeyValueStoreMetadata.model_validate( - await self._client.get_or_create( - name=id if id is not None else name, - schema=schema, - ) - ) - - @override - async def list( - self, - *, - unnamed: bool = False, - limit: int | None = None, - offset: int | None = None, - desc: bool = False, - ) -> KeyValueStoreListPage: - return KeyValueStoreListPage.model_validate( - await self._client.list( - unnamed=unnamed, - limit=limit, - offset=offset, - desc=desc, - ) - ) diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py deleted file mode 100644 index 036eb2ab..00000000 --- a/src/apify/apify_storage_client/_request_queue_client.py +++ /dev/null @@ -1,176 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee import Request -from crawlee.storage_clients._base import RequestQueueClient as BaseRequestQueueClient -from crawlee.storage_clients.models import ( - BatchRequestsOperationResponse, - ProcessedRequest, - ProlongRequestLockResponse, - RequestQueueHead, - RequestQueueHeadWithLocks, - RequestQueueMetadata, -) - -if TYPE_CHECKING: - from collections.abc import Sequence - - from apify_client.clients import RequestQueueClientAsync - - -class RequestQueueClient(BaseRequestQueueClient): - """Request queue resource client implementation based on the Apify platform storage.""" - - def __init__(self, apify_request_queue_client: RequestQueueClientAsync) -> None: - self._client = apify_request_queue_client - - @override - async def get(self) -> RequestQueueMetadata | None: - result = await self._client.get() - return 
RequestQueueMetadata.model_validate({'resourceDirectory': ''} | result) if result else None - - @override - async def update( - self, - *, - name: str | None = None, - ) -> RequestQueueMetadata: - return RequestQueueMetadata.model_validate( - {'resourceDirectory': ''} - | await self._client.update( - name=name, - ) - ) - - @override - async def delete(self) -> None: - await self._client.delete() - - @override - async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: - return RequestQueueHead.model_validate( - await self._client.list_head( - limit=limit, - ), - ) - - @override - async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks: - return RequestQueueHeadWithLocks.model_validate( - await self._client.list_and_lock_head( - lock_secs=lock_secs, - limit=limit, - ) - ) - - @override - async def add_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - return ProcessedRequest.model_validate( - {'id': request.id, 'uniqueKey': request.unique_key} - | await self._client.add_request( - request=request.model_dump( - by_alias=True, - exclude={ - 'id', - }, - ), - forefront=forefront, - ) - ) - - @override - async def get_request(self, request_id: str) -> Request | None: - result = await self._client.get_request(request_id) - return Request.model_validate(result) if result else None - - @override - async def update_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - return ProcessedRequest.model_validate( - {'id': request.id, 'uniqueKey': request.unique_key} - | await self._client.update_request( - request=request.model_dump( - by_alias=True, - ), - forefront=forefront, - ) - ) - - @override - async def delete_request(self, request_id: str) -> None: - await self._client.delete_request(request_id) - - @override - async def prolong_request_lock( - self, - request_id: str, - *, - forefront: bool = False, - lock_secs: int, - ) -> ProlongRequestLockResponse: - return ProlongRequestLockResponse.model_validate( - await self._client.prolong_request_lock( - request_id=request_id, - forefront=forefront, - lock_secs=lock_secs, - ) - ) - - @override - async def delete_request_lock( - self, - request_id: str, - *, - forefront: bool = False, - ) -> None: - await self._client.delete_request_lock( - request_id=request_id, - forefront=forefront, - ) - - @override - async def batch_add_requests( - self, - requests: Sequence[Request], - *, - forefront: bool = False, - ) -> BatchRequestsOperationResponse: - return BatchRequestsOperationResponse.model_validate( - await self._client.batch_add_requests( - requests=[ - r.model_dump( - by_alias=True, - exclude={ - 'id', - }, - ) - for r in requests - ], - forefront=forefront, - ) - ) - - @override - async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsOperationResponse: - return BatchRequestsOperationResponse.model_validate( - await self._client.batch_delete_requests( - requests=[ - r.model_dump( - by_alias=True, - ) - for r in requests - ], - ) - ) diff --git a/src/apify/apify_storage_client/_request_queue_collection_client.py b/src/apify/apify_storage_client/_request_queue_collection_client.py deleted file mode 100644 index 5bf28836..00000000 --- a/src/apify/apify_storage_client/_request_queue_collection_client.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from typing_extensions import override - -from 
crawlee.storage_clients._base import RequestQueueCollectionClient as BaseRequestQueueCollectionClient
-from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata
-
-if TYPE_CHECKING:
-    from apify_client.clients import RequestQueueCollectionClientAsync
-
-
-class RequestQueueCollectionClient(BaseRequestQueueCollectionClient):
-    """Request queue collection resource client implementation based on the Apify platform storage."""
-
-    def __init__(self, apify_request_queue_collection_client: RequestQueueCollectionClientAsync) -> None:
-        self._client = apify_request_queue_collection_client
-
-    @override
-    async def get_or_create(
-        self,
-        *,
-        id: str | None = None,
-        name: str | None = None,
-        schema: dict | None = None,
-    ) -> RequestQueueMetadata:
-        return RequestQueueMetadata.model_validate(
-            {'resourceDirectory': ''}
-            | await self._client.get_or_create(
-                name=id if id is not None else name,
-            )
-        )
-
-    @override
-    async def list(
-        self,
-        *,
-        unnamed: bool = False,
-        limit: int | None = None,
-        offset: int | None = None,
-        desc: bool = False,
-    ) -> RequestQueueListPage:
-        return RequestQueueListPage.model_validate(
-            await self._client.list(
-                unnamed=unnamed,
-                limit=limit,
-                offset=offset,
-                desc=desc,
-            )
-        )
diff --git a/src/apify/apify_storage_client/py.typed b/src/apify/apify_storage_client/py.typed
deleted file mode 100644
index e69de29b..00000000

From bf55338bb54ef179784ac761e929ce96168470e1 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Fri, 9 May 2025 11:26:55 +0200
Subject: [PATCH 02/44] Add init version of new Apify storage clients

---
 src/apify/apify_storage_client/__init__.py    |  11 +
 .../apify_storage_client/_dataset_client.py   | 183 +++++
 .../_key_value_store_client.py                | 194 ++++++
 .../_request_queue_client.py                  | 633 ++++++++++++++++++
 .../apify_storage_client/_storage_client.py   |  65 ++
 src/apify/apify_storage_client/py.typed       |   0
 6 files changed, 1086 insertions(+)
 create mode 100644 src/apify/apify_storage_client/__init__.py
 create mode 100644 src/apify/apify_storage_client/_dataset_client.py
 create mode 100644 src/apify/apify_storage_client/_key_value_store_client.py
 create mode 100644 src/apify/apify_storage_client/_request_queue_client.py
 create mode 100644 src/apify/apify_storage_client/_storage_client.py
 create mode 100644 src/apify/apify_storage_client/py.typed

diff --git a/src/apify/apify_storage_client/__init__.py b/src/apify/apify_storage_client/__init__.py
new file mode 100644
index 00000000..4af7c8ee
--- /dev/null
+++ b/src/apify/apify_storage_client/__init__.py
@@ -0,0 +1,11 @@
+from ._dataset_client import ApifyDatasetClient
+from ._key_value_store_client import ApifyKeyValueStoreClient
+from ._request_queue_client import ApifyRequestQueueClient
+from ._storage_client import ApifyStorageClient
+
+__all__ = [
+    'ApifyDatasetClient',
+    'ApifyKeyValueStoreClient',
+    'ApifyRequestQueueClient',
+    'ApifyStorageClient',
+]
diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py
new file mode 100644
index 00000000..12ded618
--- /dev/null
+++ b/src/apify/apify_storage_client/_dataset_client.py
@@ -0,0 +1,183 @@
+from __future__ import annotations
+
+import asyncio
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import override
+
+from apify_client import ApifyClientAsync
+from crawlee.storage_clients._base import DatasetClient
+from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+
+if TYPE_CHECKING:
+    from
collections.abc import AsyncIterator + from datetime import datetime + + from apify_client.clients import DatasetClientAsync + + from apify import Configuration + +logger = getLogger(__name__) + + +class ApifyDatasetClient(DatasetClient): + """An Apify platform implementation of the dataset client.""" + + def __init__( + self, + *, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + item_count: int, + api_client: DatasetClientAsync, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyDatasetClient.open` class method to create a new instance. + """ + self._metadata = DatasetMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + item_count=item_count, + ) + + self._api_client = api_client + """The Apify dataset client for API operations.""" + + self._lock = asyncio.Lock() + """A lock to ensure that only one operation is performed at a time.""" + + @override + @property + def metadata(self) -> DatasetMetadata: + return self._metadata + + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> ApifyDatasetClient: + token = configuration.token + api_url = configuration.api_base_url + + # Otherwise, create a new one. + apify_client_async = ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + + apify_datasets_client = apify_client_async.datasets() + + metadata = DatasetMetadata.model_validate( + await apify_datasets_client.get_or_create(name=id if id is not None else name), + ) + + apify_dataset_client = apify_client_async.dataset(dataset_id=metadata.id) + + return cls( + id=metadata.id, + name=metadata.name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + item_count=metadata.item_count, + api_client=apify_dataset_client, + ) + + @override + async def purge(self) -> None: + # TODO: better + async with self._lock: + await self._api_client.delete() + + @override + async def drop(self) -> None: + async with self._lock: + await self._api_client.delete() + + @override + async def push_data(self, data: list[Any] | dict[str, Any]) -> None: + async with self._lock: + await self._api_client.push_items(items=data) + await self._update_metadata() + + @override + async def get_data( + self, + *, + offset: int = 0, + limit: int | None = 999_999_999_999, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + flatten: list[str] | None = None, + view: str | None = None, + ) -> DatasetItemsListPage: + response = await self._api_client.list_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + flatten=flatten, + view=view, + ) + result = DatasetItemsListPage.model_validate(vars(response)) + await self._update_metadata() + return result + + @override + async def iterate_items( + self, + *, + offset: int = 0, + limit: int | None = None, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + ) -> AsyncIterator[dict]: + async for item in self._api_client.iterate_items( + 
offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + ): + yield item + + await self._update_metadata() + + async def _update_metadata(self) -> None: + """Update the dataset metadata file with current information.""" + metadata = await self._api_client.get() + self._metadata = DatasetMetadata.model_validate(metadata) diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py new file mode 100644 index 00000000..cf2b84f8 --- /dev/null +++ b/src/apify/apify_storage_client/_key_value_store_client.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +import asyncio +from logging import getLogger +from typing import TYPE_CHECKING, Any + +from typing_extensions import override +from yarl import URL + +from apify_client import ApifyClientAsync +from crawlee.storage_clients._base import KeyValueStoreClient +from crawlee.storage_clients.models import ( + KeyValueStoreListKeysPage, + KeyValueStoreMetadata, + KeyValueStoreRecord, + KeyValueStoreRecordMetadata, +) + +from apify._crypto import create_hmac_signature + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + from datetime import datetime + + from apify_client.clients import KeyValueStoreClientAsync + + from apify import Configuration + +logger = getLogger(__name__) + + +class ApifyKeyValueStoreClient(KeyValueStoreClient): + """An Apify platform implementation of the key-value store client.""" + + def __init__( + self, + *, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + api_client: KeyValueStoreClientAsync, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyKeyValueStoreClient.open` class method to create a new instance. + """ + self._metadata = KeyValueStoreMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + ) + + self._api_client = api_client + """The Apify key-value store client for API operations.""" + + self._lock = asyncio.Lock() + """A lock to ensure that only one operation is performed at a time.""" + + @override + @property + def metadata(self) -> KeyValueStoreMetadata: + return self._metadata + + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> ApifyKeyValueStoreClient: + token = configuration.token + api_url = configuration.api_base_url + + # Otherwise, create a new one. 
+ apify_client_async = ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + + apify_kvss_client = apify_client_async.key_value_stores() + + metadata = KeyValueStoreMetadata.model_validate( + await apify_kvss_client.get_or_create(name=id if id is not None else name), + ) + + apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=metadata.id) + + return cls( + id=metadata.id, + name=metadata.name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + api_client=apify_kvs_client, + ) + + @override + async def purge(self) -> None: + # TODO: better + async with self._lock: + await self._api_client.delete() + + @override + async def drop(self) -> None: + async with self._lock: + await self._api_client.delete() + + @override + async def get_value(self, key: str) -> KeyValueStoreRecord | None: + response = await self._api_client.get_record(key) + record = KeyValueStoreRecord.model_validate(response) if response else None + await self._update_metadata() + return record + + @override + async def set_value(self, key: str, value: Any, content_type: str | None = None) -> None: + async with self._lock: + await self._api_client.set_record( + key=key, + value=value, + content_type=content_type, + ) + await self._update_metadata() + + @override + async def delete_value(self, key: str) -> None: + async with self._lock: + await self._api_client.delete_record(key=key) + await self._update_metadata() + + @override + async def iterate_keys( + self, + *, + exclusive_start_key: str | None = None, + limit: int | None = None, + ) -> AsyncIterator[KeyValueStoreRecordMetadata]: + count = 0 + + while True: + response = await self._api_client.list_keys(exclusive_start_key=exclusive_start_key) + list_key_page = KeyValueStoreListKeysPage.model_validate(response) + + for item in list_key_page.items: + yield item + count += 1 + + # If we've reached the limit, stop yielding + if limit and count >= limit: + break + + # If we've reached the limit or there are no more pages, exit the loop + if (limit and count >= limit) or not list_key_page.is_truncated: + break + + exclusive_start_key = list_key_page.next_exclusive_start_key + + await self._update_metadata() + + async def get_public_url(self, key: str) -> str: + """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. + + Args: + key: The key for which the URL should be generated. 
+ """ + if self._api_client.resource_id is None: + raise ValueError('resource_id cannot be None when generating a public URL') + + public_url = ( + URL(self._api_client.base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key + ) + + key_value_store = self.metadata + + if key_value_store and key_value_store.model_extra: + url_signing_secret_key = key_value_store.model_extra.get('urlSigningSecretKey') + if url_signing_secret_key: + public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) + + return str(public_url) + + async def _update_metadata(self) -> None: + """Update the key-value store metadata with current information.""" + metadata = await self._api_client.get() + self._metadata = KeyValueStoreMetadata.model_validate(metadata) diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py new file mode 100644 index 00000000..2dcb06a3 --- /dev/null +++ b/src/apify/apify_storage_client/_request_queue_client.py @@ -0,0 +1,633 @@ +from __future__ import annotations + +import asyncio +from collections import deque +from datetime import datetime, timedelta, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Final + +from cachetools import LRUCache +from typing_extensions import override + +from apify_client import ApifyClientAsync +from crawlee import Request +from crawlee._utils.requests import unique_key_to_request_id +from crawlee.storage_clients._base import RequestQueueClient +from crawlee.storage_clients.models import ( + AddRequestsResponse, + CachedRequest, + ProcessedRequest, + ProlongRequestLockResponse, + RequestQueueHead, + RequestQueueMetadata, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from apify_client.clients import RequestQueueClientAsync + + from apify import Configuration + +logger = getLogger(__name__) + + +class ApifyRequestQueueClient(RequestQueueClient): + """An Apify platform implementation of the request queue client.""" + + _DEFAULT_LOCK_TIME: Final[timedelta] = timedelta(minutes=3) + """The default lock time for requests in the queue.""" + + _MAX_CACHED_REQUESTS: Final[int] = 1_000_000 + """Maximum number of requests that can be cached.""" + + def __init__( + self, + *, + id: str, + name: str | None, + created_at: datetime, + accessed_at: datetime, + modified_at: datetime, + had_multiple_clients: bool, + handled_request_count: int, + pending_request_count: int, + stats: dict, + total_request_count: int, + api_client: RequestQueueClientAsync, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. 
+ """ + self._metadata = RequestQueueMetadata( + id=id, + name=name, + created_at=created_at, + accessed_at=accessed_at, + modified_at=modified_at, + had_multiple_clients=had_multiple_clients, + handled_request_count=handled_request_count, + pending_request_count=pending_request_count, + stats=stats, + total_request_count=total_request_count, + ) + + self._api_client = api_client + """The Apify request queue client for API operations.""" + + self._lock = asyncio.Lock() + """A lock to ensure that only one operation is performed at a time.""" + + self._queue_head = deque[str]() + """A deque to store request IDs in the queue head.""" + + self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + """A cache to store request objects.""" + + self._queue_has_locked_requests: bool | None = None + """Whether the queue has requests locked by another client.""" + + self._should_check_for_forefront_requests = False + """Whether to check for forefront requests in the next list_head call.""" + + @override + @property + def metadata(self) -> RequestQueueMetadata: + return self._metadata + + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> ApifyRequestQueueClient: + # Get API credentials + token = configuration.token + api_url = configuration.api_base_url + + # Create a new API client + apify_client_async = ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + + apify_rqs_client = apify_client_async.request_queues() + + # Get or create the request queue + metadata = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(name=id if id is not None else name), + ) + + apify_rq_client = apify_client_async.request_queue(request_queue_id=metadata.id) + + # Create the client instance + return cls( + id=metadata.id, + name=metadata.name, + created_at=metadata.created_at, + accessed_at=metadata.accessed_at, + modified_at=metadata.modified_at, + had_multiple_clients=metadata.had_multiple_clients, + handled_request_count=metadata.handled_request_count, + pending_request_count=metadata.pending_request_count, + stats=metadata.stats, + total_request_count=metadata.total_request_count, + api_client=apify_rq_client, + ) + + @override + async def purge(self) -> None: + # TODO: better + async with self._lock: + await self._api_client.delete() + + @override + async def drop(self) -> None: + async with self._lock: + await self._api_client.delete() + + @override + async def add_batch_of_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> AddRequestsResponse: + """Add a batch of requests to the queue. + + Args: + requests: The requests to add. + forefront: Whether to add the requests to the beginning of the queue. + + Returns: + Response containing information about the added requests. 
+ """ + # Prepare requests for API by converting to dictionaries + requests_dict = [request.model_dump(by_alias=True) for request in requests] + + # Remove 'id' fields from requests as the API doesn't accept them + for request_dict in requests_dict: + if 'id' in request_dict: + del request_dict['id'] + + # Send requests to API + response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) + + # Update metadata after adding requests + await self._update_metadata() + + return AddRequestsResponse.model_validate(response) + + @override + async def get_request(self, request_id: str) -> Request | None: + """Get a request by ID. + + Args: + request_id: The ID of the request to get. + + Returns: + The request or None if not found. + """ + response = await self._api_client.get_request(request_id) + await self._update_metadata() + + if response is None: + return None + + return Request.model_validate(**response) + + @override + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. + + Once you successfully finish processing of the request, you need to call `mark_request_as_handled` + to mark the request as handled in the queue. If there was some error in processing the request, call + `reclaim_request` instead, so that the queue will give the request to some other consumer + in another call to the `fetch_next_request` method. + + Returns: + The request or `None` if there are no more pending requests. + """ + # Ensure the queue head has requests if available + await self._ensure_head_is_non_empty() + + # If queue head is empty after ensuring, there are no requests + if not self._queue_head: + return None + + # Get the next request ID from the queue head + next_request_id = self._queue_head.popleft() + request = await self._get_or_hydrate_request(next_request_id) + + # Handle potential inconsistency where request might not be in the main table yet + if request is None: + logger.debug( + 'Cannot find a request from the beginning of queue, will be retried later', + extra={'nextRequestId': next_request_id}, + ) + return None + + # If the request was already handled, skip it + if request.handled_at is not None: + logger.debug( + 'Request fetched from the beginning of queue was already handled', + extra={'nextRequestId': next_request_id}, + ) + return None + + return request + + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as handled after successful processing. + + Handled requests will never again be returned by the `fetch_next_request` method. + + Args: + request: The request to mark as handled. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. 
+ """ + # Set the handled_at timestamp if not already set + if request.handled_at is None: + request.handled_at = datetime.now(tz=timezone.utc) + + try: + # Update the request in the API + processed_request = await self._update_request(request) + processed_request.unique_key = request.unique_key + + # Update the cache with the handled request + cache_key = unique_key_to_request_id(request.unique_key) + self._cache_request( + cache_key, + processed_request, + forefront=False, + hydrated_request=request, + ) + + # Update metadata after marking request as handled + await self._update_metadata() + except Exception as exc: + logger.debug(f'Error marking request {request.id} as handled: {exc!s}') + return None + else: + return processed_request + + @override + async def reclaim_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. + + The request will be returned for processing later again by another call to `fetch_next_request`. + + Args: + request: The request to return to the queue. + forefront: Whether to add the request to the head or the end of the queue. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. + """ + try: + # Update the request in the API + processed_request = await self._update_request(request, forefront=forefront) + processed_request.unique_key = request.unique_key + + # Update the cache + cache_key = unique_key_to_request_id(request.unique_key) + self._cache_request( + cache_key, + processed_request, + forefront=forefront, + hydrated_request=request, + ) + + # If we're adding to the forefront, we need to check for forefront requests + # in the next list_head call + if forefront: + self._should_check_for_forefront_requests = True + + # Try to release the lock on the request + try: + await self._delete_request_lock(request.id, forefront=forefront) + except Exception as err: + logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err) + + # Update metadata after reclaiming request + await self._update_metadata() + except Exception as exc: + logger.debug(f'Error reclaiming request {request.id}: {exc!s}') + return None + else: + return processed_request + + @override + async def is_empty(self) -> bool: + """Check if the queue is empty. + + Returns: + True if the queue is empty, False otherwise. + """ + head = await self._list_head(limit=1, lock_time=None) + return len(head.items) == 0 + + async def _ensure_head_is_non_empty(self) -> None: + """Ensure that the queue head has requests if they are available in the queue.""" + # If queue head has adequate requests, skip fetching more + if len(self._queue_head) > 1 and not self._should_check_for_forefront_requests: + return + + # Fetch requests from the API and populate the queue head + await self._list_head(lock_time=self._DEFAULT_LOCK_TIME) + + async def _get_or_hydrate_request(self, request_id: str) -> Request | None: + """Get a request by ID, either from cache or by fetching from API. + + Args: + request_id: The ID of the request to get. + + Returns: + The request if found and valid, otherwise None. 
+ """ + # First check if the request is in our cache + cached_entry = self._requests_cache.get(request_id) + + if cached_entry and cached_entry.hydrated: + # If we have the request hydrated in cache, check if lock is expired + if cached_entry.lock_expires_at and cached_entry.lock_expires_at < datetime.now(tz=timezone.utc): + # Try to prolong the lock if it's expired + try: + lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) + response = await self._prolong_request_lock( + request_id, forefront=cached_entry.forefront, lock_secs=lock_secs + ) + cached_entry.lock_expires_at = response.lock_expires_at + except Exception: + # If prolonging the lock fails, we lost the request + logger.debug(f'Failed to prolong lock for request {request_id}, returning None') + return None + + return cached_entry.hydrated + + # If not in cache or not hydrated, fetch the request + try: + # Try to acquire or prolong the lock + lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) + await self._prolong_request_lock(request_id, forefront=False, lock_secs=lock_secs) + + # Fetch the request data + request = await self.get_request(request_id) + + # If request is not found, release lock and return None + if not request: + await self._delete_request_lock(request_id) + return None + + # Update cache with hydrated request + cache_key = unique_key_to_request_id(request.unique_key) + self._cache_request( + cache_key, + ProcessedRequest( + id=request_id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=request.handled_at is not None, + ), + forefront=False, + hydrated_request=request, + ) + except Exception as exc: + logger.debug(f'Error fetching or locking request {request_id}: {exc!s}') + return None + else: + return request + + async def _update_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + """Update a request in the queue. + + Args: + request: The updated request. + forefront: Whether to put the updated request in the beginning or the end of the queue. + + Returns: + The updated request + """ + response = await self._api_client.update_request( + request=request.model_dump(by_alias=True), + forefront=forefront, + ) + + return ProcessedRequest.model_validate( + {'id': request.id, 'uniqueKey': request.unique_key} | response, + ) + + async def _list_head( + self, + *, + lock_time: timedelta | None = None, + limit: int = 25, + ) -> RequestQueueHead: + """Retrieve requests from the beginning of the queue. + + Args: + lock_time: Duration for which to lock the retrieved requests. + If None, requests will not be locked. + limit: Maximum number of requests to retrieve. + + Returns: + A collection of requests from the beginning of the queue. 
+ """ + # Return from cache if available and we're not checking for new forefront requests + if self._queue_head and not self._should_check_for_forefront_requests: + logger.debug(f'Using cached queue head with {len(self._queue_head)} requests') + + # Create a list of requests from the cached queue head + items = [] + for request_id in list(self._queue_head)[:limit]: + cached_request = self._requests_cache.get(request_id) + if cached_request and cached_request.hydrated: + items.append(cached_request.hydrated) + + return RequestQueueHead( + limit=limit, + had_multiple_clients=self._metadata.had_multiple_clients, + queue_modified_at=self._metadata.modified_at, + items=items, + queue_has_locked_requests=self._queue_has_locked_requests, + lock_time=lock_time, + ) + + # Otherwise fetch from API + lock_time = lock_time or self._DEFAULT_LOCK_TIME + lock_secs = int(lock_time.total_seconds()) + + response = await self._api_client.list_and_lock_head( + lock_secs=lock_secs, + limit=limit, + ) + + # Update the queue head cache + self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) + + # Clear current queue head if we're checking for forefront requests + if self._should_check_for_forefront_requests: + self._queue_head.clear() + self._should_check_for_forefront_requests = False + + # Process and cache the requests + head_id_buffer = list[str]() + forefront_head_id_buffer = list[str]() + + for request_data in response.get('items', []): + request = Request.model_validate(request_data) + + # Skip requests without ID or unique key + if not request.id or not request.unique_key: + logger.debug( + 'Skipping request from queue head, missing ID or unique key', + extra={ + 'id': request.id, + 'unique_key': request.unique_key, + }, + ) + continue + + # Check if this request was already cached and if it was added to forefront + cache_key = unique_key_to_request_id(request.unique_key) + cached_request = self._requests_cache.get(cache_key) + forefront = cached_request.forefront if cached_request else False + + # Add to appropriate buffer based on forefront flag + if forefront: + forefront_head_id_buffer.insert(0, request.id) + else: + head_id_buffer.append(request.id) + + # Cache the request + self._cache_request( + cache_key, + ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ), + forefront=forefront, + hydrated_request=request, + ) + + # Update the queue head deque + for request_id in head_id_buffer: + self._queue_head.append(request_id) + + for request_id in forefront_head_id_buffer: + self._queue_head.appendleft(request_id) + + return RequestQueueHead.model_validate(response) + + async def _prolong_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + lock_secs: int, + ) -> ProlongRequestLockResponse: + """Prolong the lock on a specific request in the queue. + + Args: + request_id: The identifier of the request whose lock is to be prolonged. + forefront: Whether to put the request in the beginning or the end of the queue after lock expires. + lock_secs: The additional amount of time, in seconds, that the request will remain locked. + + Returns: + A response containing the time at which the lock will expire. 
+ """ + response = await self._api_client.prolong_request_lock( + request_id=request_id, + forefront=forefront, + lock_secs=lock_secs, + ) + + result = ProlongRequestLockResponse( + lock_expires_at=datetime.fromisoformat(response['lockExpiresAt'].replace('Z', '+00:00')) + ) + + # Update the cache with the new lock expiration + for cached_request in self._requests_cache.values(): + if cached_request.id == request_id: + cached_request.lock_expires_at = result.lock_expires_at + break + + return result + + async def _delete_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + ) -> None: + """Delete the lock on a specific request in the queue. + + Args: + request_id: ID of the request to delete the lock. + forefront: Whether to put the request in the beginning or the end of the queue after the lock is deleted. + """ + try: + await self._api_client.delete_request_lock( + request_id=request_id, + forefront=forefront, + ) + + # Update the cache to remove the lock + for cached_request in self._requests_cache.values(): + if cached_request.id == request_id: + cached_request.lock_expires_at = None + break + except Exception as err: + logger.debug(f'Failed to delete request lock for request {request_id}', exc_info=err) + + def _cache_request( + self, + cache_key: str, + processed_request: ProcessedRequest, + *, + forefront: bool, + hydrated_request: Request | None = None, + ) -> None: + """Cache a request for future use. + + Args: + cache_key: The key to use for caching the request. + processed_request: The processed request information. + forefront: Whether the request was added to the forefront of the queue. + hydrated_request: The hydrated request object, if available. + """ + self._requests_cache[cache_key] = CachedRequest( + id=processed_request.id, + was_already_handled=processed_request.was_already_handled, + hydrated=hydrated_request, + lock_expires_at=None, + forefront=forefront, + ) + + async def _update_metadata(self) -> None: + """Update the request queue metadata with current information.""" + metadata = await self._api_client.get() + self._metadata = RequestQueueMetadata.model_validate(metadata) diff --git a/src/apify/apify_storage_client/_storage_client.py b/src/apify/apify_storage_client/_storage_client.py new file mode 100644 index 00000000..1d4d66dd --- /dev/null +++ b/src/apify/apify_storage_client/_storage_client.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from typing_extensions import override + +from crawlee.configuration import Configuration +from crawlee.storage_clients._base import StorageClient + +from ._dataset_client import ApifyDatasetClient +from ._key_value_store_client import ApifyKeyValueStoreClient +from ._request_queue_client import ApifyRequestQueueClient + + +class ApifyStorageClient(StorageClient): + """Apify storage client.""" + + @override + async def open_dataset_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> ApifyDatasetClient: + configuration = configuration or Configuration.get_global_configuration() + client = await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start: + await client.drop() + client = await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) + + return client + + @override + async def open_key_value_store_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> ApifyKeyValueStoreClient: + 
configuration = configuration or Configuration.get_global_configuration() + client = await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start: + await client.drop() + client = await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + + return client + + @override + async def open_request_queue_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> ApifyRequestQueueClient: + configuration = configuration or Configuration.get_global_configuration() + client = await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) + + if configuration.purge_on_start: + await client.drop() + client = await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) + + return client diff --git a/src/apify/apify_storage_client/py.typed b/src/apify/apify_storage_client/py.typed new file mode 100644 index 00000000..e69de29b From 6b2f82b7568056edc13939b409ecc4cdb834e712 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 12 Jun 2025 14:44:39 +0200 Subject: [PATCH 03/44] Move specific models from Crawlee to SDK --- .../_key_value_store_client.py | 8 +- src/apify/apify_storage_client/_models.py | 88 +++++++++++++++++++ .../_request_queue_client.py | 11 +-- 3 files changed, 93 insertions(+), 14 deletions(-) create mode 100644 src/apify/apify_storage_client/_models.py diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py index cf2b84f8..73463da6 100644 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ b/src/apify/apify_storage_client/_key_value_store_client.py @@ -9,12 +9,8 @@ from apify_client import ApifyClientAsync from crawlee.storage_clients._base import KeyValueStoreClient -from crawlee.storage_clients.models import ( - KeyValueStoreListKeysPage, - KeyValueStoreMetadata, - KeyValueStoreRecord, - KeyValueStoreRecordMetadata, -) +from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata +from ._models import KeyValueStoreListKeysPage from apify._crypto import create_hmac_signature diff --git a/src/apify/apify_storage_client/_models.py b/src/apify/apify_storage_client/_models.py new file mode 100644 index 00000000..dd94ec56 --- /dev/null +++ b/src/apify/apify_storage_client/_models.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from datetime import datetime, timedelta +from typing import Annotated + +from pydantic import BaseModel, ConfigDict, Field + +from crawlee import Request +from crawlee._utils.docs import docs_group + + +@docs_group('Data structures') +class ProlongRequestLockResponse(BaseModel): + """Response to prolong request lock calls.""" + + model_config = ConfigDict(populate_by_name=True) + + lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] + + +@docs_group('Data structures') +class RequestQueueHead(BaseModel): + """Model for request queue head. + + Represents a collection of requests retrieved from the beginning of a queue, + including metadata about the queue's state and lock information for the requests. 
+ """ + + model_config = ConfigDict(populate_by_name=True) + + limit: Annotated[int | None, Field(alias='limit', default=None)] + """The maximum number of requests that were requested from the queue.""" + + had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients', default=False)] + """Indicates whether the queue has been accessed by multiple clients (consumers).""" + + queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] + """The timestamp when the queue was last modified.""" + + lock_time: Annotated[timedelta | None, Field(alias='lockSecs', default=None)] + """The duration for which the returned requests are locked and cannot be processed by other clients.""" + + queue_has_locked_requests: Annotated[bool | None, Field(alias='queueHasLockedRequests', default=False)] + """Indicates whether the queue contains any locked requests.""" + + items: Annotated[list[Request], Field(alias='items', default_factory=list[Request])] + """The list of request objects retrieved from the beginning of the queue.""" + + +class KeyValueStoreKeyInfo(BaseModel): + """Model for a key-value store key info.""" + + model_config = ConfigDict(populate_by_name=True) + + key: Annotated[str, Field(alias='key')] + size: Annotated[int, Field(alias='size')] + + +class KeyValueStoreListKeysPage(BaseModel): + """Model for listing keys in the key-value store.""" + + model_config = ConfigDict(populate_by_name=True) + + count: Annotated[int, Field(alias='count')] + limit: Annotated[int, Field(alias='limit')] + is_truncated: Annotated[bool, Field(alias='isTruncated')] + items: Annotated[list[KeyValueStoreKeyInfo], Field(alias='items', default_factory=list)] + exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey', default=None)] + next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)] + + +class CachedRequest(BaseModel): + """Pydantic model for cached request information.""" + + id: str + """The ID of the request.""" + + was_already_handled: bool + """Whether the request was already handled.""" + + hydrated: Request | None = None + """The hydrated request object (the original one).""" + + lock_expires_at: datetime | None = None + """The expiration time of the lock on the request.""" + + forefront: bool = False + """Whether the request was added to the forefront of the queue.""" diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py index 2dcb06a3..a2570417 100644 --- a/src/apify/apify_storage_client/_request_queue_client.py +++ b/src/apify/apify_storage_client/_request_queue_client.py @@ -13,14 +13,9 @@ from crawlee import Request from crawlee._utils.requests import unique_key_to_request_id from crawlee.storage_clients._base import RequestQueueClient -from crawlee.storage_clients.models import ( - AddRequestsResponse, - CachedRequest, - ProcessedRequest, - ProlongRequestLockResponse, - RequestQueueHead, - RequestQueueMetadata, -) +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata + +from ._models import CachedRequest, ProlongRequestLockResponse, RequestQueueHead if TYPE_CHECKING: from collections.abc import Sequence From 38bef6859a5f288f5afc51d105b37919a64d52c4 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 18 Jun 2025 14:34:01 +0200 Subject: [PATCH 04/44] Adapt to Crawlee v1 --- docs/03_concepts/code/03_dataset_exports.py | 4 +- .../code/conditional_actor_charge.py | 3 +- pyproject.toml | 5 ++- 
src/apify/_actor.py | 2 +- .../apify_storage_client/_dataset_client.py | 2 +- .../_key_value_store_client.py | 4 +- .../_request_queue_client.py | 2 +- src/apify/scrapy/extensions/_httpcache.py | 12 ++++-- src/apify/scrapy/scheduler.py | 11 +++-- tests/integration/conftest.py | 17 ++++---- tests/integration/test_actor_dataset.py | 6 +-- .../integration/test_actor_key_value_store.py | 16 ++++---- tests/integration/test_actor_request_queue.py | 6 +-- tests/integration/test_request_queue.py | 4 +- tests/unit/actor/test_actor_dataset.py | 25 ++++------- .../unit/actor/test_actor_key_value_store.py | 41 +++++++++---------- tests/unit/actor/test_actor_request_queue.py | 4 +- tests/unit/conftest.py | 28 +++++-------- uv.lock | 10 ++--- 19 files changed, 97 insertions(+), 105 deletions(-) diff --git a/docs/03_concepts/code/03_dataset_exports.py b/docs/03_concepts/code/03_dataset_exports.py index 78f0f5b9..4f0c01c4 100644 --- a/docs/03_concepts/code/03_dataset_exports.py +++ b/docs/03_concepts/code/03_dataset_exports.py @@ -11,14 +11,14 @@ async def main() -> None: await dataset.export_to( content_type='csv', key='data.csv', - to_key_value_store_name='my-cool-key-value-store', + to_kvs_name='my-cool-key-value-store', ) # Export the data as JSON await dataset.export_to( content_type='json', key='data.json', - to_key_value_store_name='my-cool-key-value-store', + to_kvs_name='my-cool-key-value-store', ) # Print the exported records diff --git a/docs/03_concepts/code/conditional_actor_charge.py b/docs/03_concepts/code/conditional_actor_charge.py index 926c591d..08e2d073 100644 --- a/docs/03_concepts/code/conditional_actor_charge.py +++ b/docs/03_concepts/code/conditional_actor_charge.py @@ -6,8 +6,7 @@ async def main() -> None: # Check the dataset because there might already be items # if the run migrated or was restarted default_dataset = await Actor.open_dataset() - dataset_info = await default_dataset.get_info() - charged_items = dataset_info.item_count if dataset_info else 0 + charged_items = default_dataset.metadata.item_count # highlight-start if Actor.get_charging_manager().get_pricing_info().is_pay_per_event: diff --git a/pyproject.toml b/pyproject.toml index 08c1ba8f..f066a119 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ keywords = [ dependencies = [ "apify-client>=1.11.0", "apify-shared>=1.3.0", - "crawlee~=0.6.0", + "crawlee@git+https://github.com/apify/crawlee-python.git@new-storage-clients", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy @@ -78,6 +78,9 @@ dev = [ [tool.hatch.build.targets.wheel] packages = ["src/apify"] +[tool.hatch.metadata] +allow-direct-references = true + [tool.ruff] line-length = 120 include = ["src/**/*.py", "tests/**/*.py", "docs/**/*.py", "website/**/*.py"] diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 11e54665..d34b4c3f 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -88,7 +88,7 @@ def __init__( # Create an instance of the cloud storage client, the local storage client is obtained # from the service locator. - self._cloud_storage_client = ApifyStorageClient.from_config(config=self._configuration) + self._cloud_storage_client = ApifyStorageClient() # Set the event manager based on whether the Actor is running on the platform or locally. 
self._event_manager = ( diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py index 12ded618..80e8986f 100644 --- a/src/apify/apify_storage_client/_dataset_client.py +++ b/src/apify/apify_storage_client/_dataset_client.py @@ -54,8 +54,8 @@ def __init__( self._lock = asyncio.Lock() """A lock to ensure that only one operation is performed at a time.""" - @override @property + @override def metadata(self) -> DatasetMetadata: return self._metadata diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py index 73463da6..14f2cd58 100644 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ b/src/apify/apify_storage_client/_key_value_store_client.py @@ -10,8 +10,8 @@ from apify_client import ApifyClientAsync from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata -from ._models import KeyValueStoreListKeysPage +from ._models import KeyValueStoreListKeysPage from apify._crypto import create_hmac_signature if TYPE_CHECKING: @@ -56,8 +56,8 @@ def __init__( self._lock = asyncio.Lock() """A lock to ensure that only one operation is performed at a time.""" - @override @property + @override def metadata(self) -> KeyValueStoreMetadata: return self._metadata diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py index a2570417..8fc0849b 100644 --- a/src/apify/apify_storage_client/_request_queue_client.py +++ b/src/apify/apify_storage_client/_request_queue_client.py @@ -86,8 +86,8 @@ def __init__( self._should_check_for_forefront_requests = False """Whether to check for forefront requests in the next list_head call.""" - @override @property + @override def metadata(self) -> RequestQueueMetadata: return self._metadata diff --git a/src/apify/scrapy/extensions/_httpcache.py b/src/apify/scrapy/extensions/_httpcache.py index 509c4d8a..ee6147e8 100644 --- a/src/apify/scrapy/extensions/_httpcache.py +++ b/src/apify/scrapy/extensions/_httpcache.py @@ -51,10 +51,14 @@ def open_spider(self, spider: Spider) -> None: kvs_name = get_kvs_name(spider.name) async def open_kvs() -> KeyValueStore: - config = Configuration.get_global_configuration() - if config.is_at_home: - storage_client = ApifyStorageClient.from_config(config) - return await KeyValueStore.open(name=kvs_name, storage_client=storage_client) + configuration = Configuration.get_global_configuration() + if configuration.is_at_home: + storage_client = ApifyStorageClient() + return await KeyValueStore.open( + name=kvs_name, + configuration=configuration, + storage_client=storage_client, + ) return await KeyValueStore.open(name=kvs_name) logger.debug("Starting background thread for cache storage's event loop") diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index a243a368..d3b9b949 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -49,10 +49,13 @@ def open(self, spider: Spider) -> Deferred[None] | None: self.spider = spider async def open_rq() -> RequestQueue: - config = Configuration.get_global_configuration() - if config.is_at_home: - storage_client = ApifyStorageClient.from_config(config) - return await RequestQueue.open(storage_client=storage_client) + configuration = Configuration.get_global_configuration() + if configuration.is_at_home: + storage_client = 
ApifyStorageClient() + return await RequestQueue.open( + configuration=configuration, + storage_client=storage_client, + ) return await RequestQueue.open() try: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 1cd800f1..b4e649af 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -15,7 +15,7 @@ from apify_client import ApifyClient, ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType, ApifyEnvVars from crawlee import service_locator -from crawlee.storages import _creation_management +from crawlee.storages import Dataset, KeyValueStore, RequestQueue import apify._actor from ._utils import generate_unique_resource_name @@ -65,12 +65,15 @@ def _prepare_test_env() -> None: service_locator._storage_client = None # Clear creation-related caches to ensure no state is carried over between tests. - monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) + Dataset._cache_by_id.clear() + Dataset._cache_by_name.clear() + Dataset._default_instance = None + KeyValueStore._cache_by_id.clear() + KeyValueStore._cache_by_name.clear() + KeyValueStore._default_instance = None + RequestQueue._cache_by_id.clear() + RequestQueue._cache_by_name.clear() + RequestQueue._default_instance = None # Verify that the test environment was set up correctly. assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index 20a71750..52de59c5 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -104,8 +104,8 @@ async def main() -> None: dataset_by_name_2 = await Actor.open_dataset(name=dataset_name) assert dataset_by_name_1 is dataset_by_name_2 - dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1._id) - dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1._id) + dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) + dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) assert dataset_by_id_1 is dataset_by_name_1 assert dataset_by_id_2 is dataset_by_id_1 @@ -129,7 +129,7 @@ async def test_force_cloud( async with Actor: dataset = await Actor.open_dataset(name=dataset_name, force_cloud=True) - dataset_id = dataset._id + dataset_id = dataset.metadata.id await dataset.push_data(dataset_item) diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 6b6dd767..8b54f8a9 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -45,8 +45,8 @@ async def main() -> None: kvs_by_name_2 = await Actor.open_key_value_store(name=kvs_name) assert kvs_by_name_1 is kvs_by_name_2 - kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1._id) - kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1._id) + kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) + kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) assert kvs_by_id_1 is kvs_by_name_1 assert kvs_by_id_2 is kvs_by_id_1 @@ -69,7 +69,7 @@ 
async def test_force_cloud( async with Actor: key_value_store = await Actor.open_key_value_store(name=key_value_store_name, force_cloud=True) - key_value_store_id = key_value_store._id + key_value_store_id = key_value_store.metadata.id await key_value_store.set_value('foo', 'bar') @@ -208,15 +208,15 @@ async def main() -> None: default_store_id = Actor.config.default_key_value_store_id record_key = 'public-record-key' - store = await Actor.open_key_value_store() + kvs = await Actor.open_key_value_store() - assert isinstance(store.storage_object.model_extra, dict) - url_signing_secret_key = store.storage_object.model_extra.get('urlSigningSecretKey') + assert isinstance(kvs.metadata.model_extra, dict) + url_signing_secret_key = kvs.metadata.model_extra.get('urlSigningSecretKey') assert url_signing_secret_key is not None - await store.set_value(record_key, {'exposedData': 'test'}, 'application/json') + await kvs.set_value(record_key, {'exposedData': 'test'}, 'application/json') - record_url = await store.get_public_url(record_key) + record_url = await kvs.get_public_url(record_key) signature = create_hmac_signature(url_signing_secret_key, record_key) assert ( diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 06e8529e..41cb7bb7 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -46,8 +46,8 @@ async def main() -> None: rq_by_name_2 = await Actor.open_request_queue(name=rq_name) assert rq_by_name_1 is rq_by_name_2 - rq_by_id_1 = await Actor.open_request_queue(id=rq_by_name_1._id) - rq_by_id_2 = await Actor.open_request_queue(id=rq_by_name_1._id) + rq_by_id_1 = await Actor.open_request_queue(id=rq_by_name_1.metadata.id) + rq_by_id_2 = await Actor.open_request_queue(id=rq_by_name_1.metadata.id) assert rq_by_id_1 is rq_by_name_1 assert rq_by_id_2 is rq_by_id_1 @@ -70,7 +70,7 @@ async def test_force_cloud( async with Actor: request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) - request_queue_id = request_queue._id + request_queue_id = request_queue.metadata.id request_info = await request_queue.add_request(Request.from_url('http://example.com')) diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index 4bce884a..e6d9f9f3 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -53,7 +53,7 @@ async def main() -> None: # I have seen it get stuck on this call rq = await Actor.open_request_queue() # Add some requests - await rq.add_requests_batched([f'https://example.com/{i}' for i in range(desired_request_count)]) + await rq.add_requests([f'https://example.com/{i}' for i in range(desired_request_count)]) handled_request_count = 0 while next_request := await rq.fetch_next_request(): @@ -87,7 +87,7 @@ async def main() -> None: # I have seen it get stuck on this call rq = await Actor.open_request_queue() # Add some requests - await rq.add_requests_batched( + await rq.add_requests( [ Request.from_url(f'https://example.com/{i}', unique_key=str(i - 1 if i % 4 == 1 else i)) for i in range(desired_request_count) diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index ef6282bb..a8da8dd3 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -1,16 +1,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import pytest from apify_shared.consts import 
ActorEnvVars +from crawlee.storage_clients import MemoryStorageClient from apify import Actor -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - # NOTE: We only test the dataset methods available on Actor class/instance. # Actual tests for the implementations are in storages/. @@ -31,24 +27,24 @@ async def test_open_dataset_returns_same_references() -> None: dataset_by_name_2 = await Actor.open_dataset(name=dataset_name) assert dataset_by_name_1 is dataset_by_name_2 - dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1._id) - dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1._id) + dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) + dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) assert dataset_by_id_1 is dataset_by_name_1 assert dataset_by_id_2 is dataset_by_id_1 -async def test_open_dataset_uses_env_var( - monkeypatch: pytest.MonkeyPatch, - memory_storage_client: MemoryStorageClient, -) -> None: +async def test_open_dataset_uses_env_var(monkeypatch: pytest.MonkeyPatch) -> None: + memory_storage_client = MemoryStorageClient() + default_dataset_id = 'my-new-default-id' monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, default_dataset_id) async with Actor: ddt = await Actor.open_dataset() - assert ddt._id == default_dataset_id - await memory_storage_client.dataset(ddt._id).delete() + assert ddt.metadata.id == default_dataset_id + dataset = await memory_storage_client.open_dataset_client(id=ddt.metadata.id) + await dataset.drop() async def test_push_data_to_dataset() -> None: @@ -57,8 +53,5 @@ async def test_push_data_to_dataset() -> None: desired_item_count = 100 await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - list_page = await dataset.get_data(limit=desired_item_count) assert {item['id'] for item in list_page.items} == set(range(desired_item_count)) diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 821065e1..16a9b78f 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -1,20 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import pytest from apify_shared.consts import ApifyEnvVars from apify_shared.utils import json_dumps +from crawlee.storage_clients import MemoryStorageClient from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY from apify import Actor from apify._consts import ENCRYPTED_INPUT_VALUE_PREFIX from apify._crypto import public_encrypt -if TYPE_CHECKING: - from crawlee.storage_clients import MemoryStorageClient - # NOTE: We only test the key-value store methods available on Actor class/instance. # Actual tests for the implementations are in storages/. 
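The test changes in this patch consistently replace the private `._id` attribute with the public `metadata` model that the Crawlee v1 storages expose. For illustration only, a minimal sketch of that access pattern as exercised by the hunks above and below (the dataset name here is made up; `metadata.id` and `metadata.item_count` are the fields the updated tests and docs rely on):

    from apify import Actor

    async def main() -> None:
        async with Actor:
            # Open (or create) a named dataset and read its public metadata model.
            dataset = await Actor.open_dataset(name='example-dataset')
            Actor.log.info(f'dataset id: {dataset.metadata.id}')
            Actor.log.info(f'items stored: {dataset.metadata.item_count}')
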
@@ -29,8 +25,8 @@ async def test_open_returns_same_references() -> None: kvs_by_name_2 = await Actor.open_key_value_store(name=kvs_name) assert kvs_by_name_1 is kvs_by_name_2 - kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1._id) - kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1._id) + kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) + kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) assert kvs_by_id_1 is kvs_by_name_1 assert kvs_by_id_2 is kvs_by_id_1 @@ -50,29 +46,31 @@ async def test_set_and_get_value() -> None: assert value == test_value -async def test_get_input(memory_storage_client: MemoryStorageClient) -> None: +async def test_get_input() -> None: + memory_storage_client = MemoryStorageClient() + input_key = 'INPUT' test_input = {'foo': 'bar'} - await memory_storage_client.key_value_stores().get_or_create(id='default') - await memory_storage_client.key_value_store('default').set_record( + kvs_client = await memory_storage_client.open_key_value_store_client() + + await kvs_client.set_value( key=input_key, value=json_dumps(test_input), content_type='application/json', ) async with Actor as my_actor: - input = await my_actor.get_input() # noqa: A001 - assert input['foo'] == test_input['foo'] + actor_input = await my_actor.get_input() + assert actor_input['foo'] == test_input['foo'] -async def test_get_input_with_encrypted_secrets( - monkeypatch: pytest.MonkeyPatch, - memory_storage_client: MemoryStorageClient, -) -> None: +async def test_get_input_with_encrypted_secrets(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_FILE, PRIVATE_KEY_PEM_BASE64) monkeypatch.setenv(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_PASSPHRASE, PRIVATE_KEY_PASSWORD) + memory_storage_client = MemoryStorageClient() + input_key = 'INPUT' secret_string = 'secret-string' encrypted_secret = public_encrypt(secret_string, public_key=PUBLIC_KEY) @@ -81,14 +79,15 @@ async def test_get_input_with_encrypted_secrets( 'secret': f'{ENCRYPTED_INPUT_VALUE_PREFIX}:{encrypted_secret["encrypted_password"]}:{encrypted_secret["encrypted_value"]}', # noqa: E501 } - await memory_storage_client.key_value_stores().get_or_create(id='default') - await memory_storage_client.key_value_store('default').set_record( + kvs_client = await memory_storage_client.open_key_value_store_client() + + await kvs_client.set_value( key=input_key, value=json_dumps(input_with_secret), content_type='application/json', ) async with Actor as my_actor: - input = await my_actor.get_input() # noqa: A001 - assert input['foo'] == input_with_secret['foo'] - assert input['secret'] == secret_string + actor_input = await my_actor.get_input() + assert actor_input['foo'] == input_with_secret['foo'] + assert actor_input['secret'] == secret_string diff --git a/tests/unit/actor/test_actor_request_queue.py b/tests/unit/actor/test_actor_request_queue.py index 5504715f..4450e5d1 100644 --- a/tests/unit/actor/test_actor_request_queue.py +++ b/tests/unit/actor/test_actor_request_queue.py @@ -23,7 +23,7 @@ async def test_open_returns_same_references() -> None: rq_by_name_2 = await Actor.open_key_value_store(name=rq_name) assert rq_by_name_1 is rq_by_name_2 - rq_by_id_1 = await Actor.open_key_value_store(id=rq_by_name_1._id) - rq_by_id_2 = await Actor.open_key_value_store(id=rq_by_name_1._id) + rq_by_id_1 = await Actor.open_key_value_store(id=rq_by_name_1.metadata.id) + rq_by_id_2 = await Actor.open_key_value_store(id=rq_by_name_1.metadata.id) 
assert rq_by_id_1 is rq_by_name_1 assert rq_by_id_2 is rq_by_id_1 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2e574da7..b1ad1178 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -12,9 +12,7 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee import service_locator -from crawlee.configuration import Configuration as CrawleeConfiguration -from crawlee.storage_clients import MemoryStorageClient -from crawlee.storages import _creation_management +from crawlee.storages import Dataset, KeyValueStore, RequestQueue import apify._actor @@ -57,12 +55,15 @@ def _prepare_test_env() -> None: service_locator._storage_client = None # Clear creation-related caches to ensure no state is carried over between tests. - monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {}) - monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) + Dataset._cache_by_id.clear() + Dataset._cache_by_name.clear() + Dataset._default_instance = None + KeyValueStore._cache_by_id.clear() + KeyValueStore._cache_by_name.clear() + KeyValueStore._default_instance = None + RequestQueue._cache_by_id.clear() + RequestQueue._cache_by_name.clear() + RequestQueue._default_instance = None # Verify that the test environment was set up correctly. assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) @@ -197,12 +198,3 @@ def getattr_override(apify_client_instance: Any, attr_name: str) -> Any: @pytest.fixture def apify_client_async_patcher(monkeypatch: pytest.MonkeyPatch) -> ApifyClientAsyncPatcher: return ApifyClientAsyncPatcher(monkeypatch) - - -@pytest.fixture -def memory_storage_client() -> MemoryStorageClient: - configuration = CrawleeConfiguration() - configuration.persist_storage = True - configuration.write_metadata = True - - return MemoryStorageClient.from_config(configuration) diff --git a/uv.lock b/uv.lock index ba18b20c..b1d8420d 100644 --- a/uv.lock +++ b/uv.lock @@ -72,7 +72,7 @@ dev = [ requires-dist = [ { name = "apify-client", specifier = ">=1.11.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, - { name = "crawlee", specifier = "~=0.6.0" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -631,8 +631,8 @@ toml = [ [[package]] name = "crawlee" -version = "0.6.10" -source = { registry = "https://pypi.org/simple" } +version = "0.6.11" +source = { git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients#78efb4ddf234e731a1c784a2280a8b1bec812573" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, @@ -653,10 +653,6 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ed/93/20033411bffaf199e44b759fc45be45fabc1d8c357bc4d0bb080713724dc/crawlee-0.6.10.tar.gz", hash = "sha256:a06e9aa19611868712df81ca4b7dc482633f921456bf3cf1a5432ce3836fd432", size = 24135107, upload-time = "2025-06-02T12:10:17.67Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/5a/12/2c6c41438f24760ebe044d5e88eebb35c51178de9aec39b695d0845cbff7/crawlee-0.6.10-py3-none-any.whl", hash = "sha256:081565d0a3f11d21798ec11929f4b0c17e3ba7a84f33251c9b6b0e6457d05367", size = 260863, upload-time = "2025-06-02T12:10:14.994Z" }, -] [[package]] name = "cryptography" From 1f85430f16161f04a05f9a85ad8df3d2978e295c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 23 Jun 2025 11:12:16 +0200 Subject: [PATCH 05/44] Adapt to Crawlee v1 (p2) --- pyproject.toml | 4 +- src/apify/_actor.py | 2 +- src/apify/_proxy_configuration.py | 3 +- src/apify/scrapy/extensions/_httpcache.py | 2 +- src/apify/scrapy/requests.py | 3 +- src/apify/scrapy/scheduler.py | 2 +- src/apify/storage_clients/__init__.py | 10 +++++ .../_apify}/__init__.py | 0 .../_apify}/_dataset_client.py | 15 ++++--- .../_apify}/_key_value_store_client.py | 23 ++++++++--- .../_apify}/_models.py | 3 +- .../_apify}/_request_queue_client.py | 18 +++++---- .../_apify}/_storage_client.py | 6 +-- .../_apify}/py.typed | 0 src/apify/storage_clients/py.typed | 0 src/apify/storages/_request_list.py | 2 +- tests/integration/conftest.py | 21 ++-------- tests/integration/test_actor_request_queue.py | 3 +- tests/integration/test_request_queue.py | 2 +- tests/unit/actor/test_actor_dataset.py | 11 +++--- tests/unit/actor/test_actor_env_helpers.py | 7 ++-- .../unit/actor/test_actor_key_value_store.py | 39 +++++-------------- tests/unit/conftest.py | 21 ++-------- .../scrapy/requests/test_to_scrapy_request.py | 12 +++--- uv.lock | 13 +++++++ 25 files changed, 112 insertions(+), 110 deletions(-) create mode 100644 src/apify/storage_clients/__init__.py rename src/apify/{apify_storage_client => storage_clients/_apify}/__init__.py (100%) rename src/apify/{apify_storage_client => storage_clients/_apify}/_dataset_client.py (90%) rename src/apify/{apify_storage_client => storage_clients/_apify}/_key_value_store_client.py (86%) rename src/apify/{apify_storage_client => storage_clients/_apify}/_models.py (99%) rename src/apify/{apify_storage_client => storage_clients/_apify}/_request_queue_client.py (97%) rename src/apify/{apify_storage_client => storage_clients/_apify}/_storage_client.py (94%) rename src/apify/{apify_storage_client => storage_clients/_apify}/py.typed (100%) create mode 100644 src/apify/storage_clients/py.typed diff --git a/pyproject.toml b/pyproject.toml index f066a119..21ea7e14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ keywords = [ dependencies = [ "apify-client>=1.11.0", "apify-shared>=1.3.0", + "cachetools>=5.5.0", "crawlee@git+https://github.com/apify/crawlee-python.git@new-storage-clients", "cryptography>=42.0.0", "httpx>=0.27.0", @@ -72,7 +73,8 @@ dev = [ "pytest~=8.4.0", "respx~=0.22.0", "ruff~=0.11.0", - "setuptools", # setuptools are used by pytest but not explicitly required + "setuptools", # setuptools are used by pytest but not explicitly required + "types-cachetools>=6.0.0.20250525", ] [tool.hatch.build.targets.wheel] diff --git a/src/apify/_actor.py b/src/apify/_actor.py index d34b4c3f..99457a5d 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -33,8 +33,8 @@ from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager from apify._proxy_configuration import ProxyConfiguration from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython -from apify.apify_storage_client import ApifyStorageClient from apify.log import _configure_logging, logger +from apify.storage_clients import 
ApifyStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index 1d5b9f72..f56cb2a1 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -21,7 +21,8 @@ if TYPE_CHECKING: from apify_client import ApifyClientAsync - from crawlee import Request + + from apify import Request APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$') COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$') diff --git a/src/apify/scrapy/extensions/_httpcache.py b/src/apify/scrapy/extensions/_httpcache.py index ee6147e8..14d8753d 100644 --- a/src/apify/scrapy/extensions/_httpcache.py +++ b/src/apify/scrapy/extensions/_httpcache.py @@ -13,8 +13,8 @@ from scrapy.responsetypes import responsetypes from apify import Configuration -from apify.apify_storage_client import ApifyStorageClient from apify.scrapy._async_thread import AsyncThread +from apify.storage_clients import ApifyStorageClient from apify.storages import KeyValueStore if TYPE_CHECKING: diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index a262b920..63bba3c7 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -10,9 +10,10 @@ from scrapy.http.headers import Headers from scrapy.utils.request import request_from_dict -from crawlee import Request as ApifyRequest from crawlee._types import HttpHeaders +from apify import Request as ApifyRequest + logger = getLogger(__name__) diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index d3b9b949..2dcacd9a 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -11,7 +11,7 @@ from ._async_thread import AsyncThread from .requests import to_apify_request, to_scrapy_request from apify import Configuration -from apify.apify_storage_client import ApifyStorageClient +from apify.storage_clients import ApifyStorageClient from apify.storages import RequestQueue if TYPE_CHECKING: diff --git a/src/apify/storage_clients/__init__.py b/src/apify/storage_clients/__init__.py new file mode 100644 index 00000000..e8c98462 --- /dev/null +++ b/src/apify/storage_clients/__init__.py @@ -0,0 +1,10 @@ +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient + +from ._apify import ApifyStorageClient + +__all__ = [ + 'ApifyStorageClient', + 'FileSystemStorageClient', + 'MemoryStorageClient', + 'StorageClient', +] diff --git a/src/apify/apify_storage_client/__init__.py b/src/apify/storage_clients/_apify/__init__.py similarity index 100% rename from src/apify/apify_storage_client/__init__.py rename to src/apify/storage_clients/_apify/__init__.py diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py similarity index 90% rename from src/apify/apify_storage_client/_dataset_client.py rename to src/apify/storage_clients/_apify/_dataset_client.py index 80e8986f..31c97127 100644 --- a/src/apify/apify_storage_client/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -15,8 +15,7 @@ from datetime import datetime from apify_client.clients import DatasetClientAsync - - from apify import Configuration + from crawlee.configuration import Configuration logger = getLogger(__name__) @@ -68,8 +67,13 @@ async def open( name: str | None, configuration: Configuration, ) -> ApifyDatasetClient: - token = configuration.token - api_url = configuration.api_base_url + token = getattr(configuration, 
'token', None) + if not token: + raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') + + api_url = getattr(configuration, 'api_base_url', None) + if not api_url: + raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') # Otherwise, create a new one. apify_client_async = ApifyClientAsync( @@ -100,7 +104,8 @@ async def open( @override async def purge(self) -> None: - # TODO: better + # TODO: better? + # https://github.com/apify/apify-sdk-python/issues/469 async with self._lock: await self._api_client.delete() diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py similarity index 86% rename from src/apify/apify_storage_client/_key_value_store_client.py rename to src/apify/storage_clients/_apify/_key_value_store_client.py index 14f2cd58..0588493d 100644 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -19,8 +19,7 @@ from datetime import datetime from apify_client.clients import KeyValueStoreClientAsync - - from apify import Configuration + from crawlee.configuration import Configuration logger = getLogger(__name__) @@ -70,8 +69,13 @@ async def open( name: str | None, configuration: Configuration, ) -> ApifyKeyValueStoreClient: - token = configuration.token - api_url = configuration.api_base_url + token = getattr(configuration, 'token', None) + if not token: + raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') + + api_url = getattr(configuration, 'api_base_url', None) + if not api_url: + raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') # Otherwise, create a new one. apify_client_async = ApifyClientAsync( @@ -101,7 +105,8 @@ async def open( @override async def purge(self) -> None: - # TODO: better + # TODO: better? 
+ # https://github.com/apify/apify-sdk-python/issues/469 async with self._lock: await self._api_client.delete() @@ -147,7 +152,13 @@ async def iterate_keys( list_key_page = KeyValueStoreListKeysPage.model_validate(response) for item in list_key_page.items: - yield item + # Convert KeyValueStoreKeyInfo to KeyValueStoreRecordMetadata + record_metadata = KeyValueStoreRecordMetadata( + key=item.key, + size=item.size, + content_type='application/octet-stream', # Content type not available from list_keys + ) + yield record_metadata count += 1 # If we've reached the limit, stop yielding diff --git a/src/apify/apify_storage_client/_models.py b/src/apify/storage_clients/_apify/_models.py similarity index 99% rename from src/apify/apify_storage_client/_models.py rename to src/apify/storage_clients/_apify/_models.py index dd94ec56..abb7aca1 100644 --- a/src/apify/apify_storage_client/_models.py +++ b/src/apify/storage_clients/_apify/_models.py @@ -5,9 +5,10 @@ from pydantic import BaseModel, ConfigDict, Field -from crawlee import Request from crawlee._utils.docs import docs_group +from apify import Request + @docs_group('Data structures') class ProlongRequestLockResponse(BaseModel): diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py similarity index 97% rename from src/apify/apify_storage_client/_request_queue_client.py rename to src/apify/storage_clients/_apify/_request_queue_client.py index 8fc0849b..95b276b3 100644 --- a/src/apify/apify_storage_client/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -10,19 +10,18 @@ from typing_extensions import override from apify_client import ApifyClientAsync -from crawlee import Request from crawlee._utils.requests import unique_key_to_request_id from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata from ._models import CachedRequest, ProlongRequestLockResponse, RequestQueueHead +from apify import Request if TYPE_CHECKING: from collections.abc import Sequence from apify_client.clients import RequestQueueClientAsync - - from apify import Configuration + from crawlee.configuration import Configuration logger = getLogger(__name__) @@ -100,9 +99,13 @@ async def open( name: str | None, configuration: Configuration, ) -> ApifyRequestQueueClient: - # Get API credentials - token = configuration.token - api_url = configuration.api_base_url + token = getattr(configuration, 'token', None) + if not token: + raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') + + api_url = getattr(configuration, 'api_base_url', None) + if not api_url: + raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') # Create a new API client apify_client_async = ApifyClientAsync( @@ -139,7 +142,8 @@ async def open( @override async def purge(self) -> None: - # TODO: better + # TODO: better? 
+ # https://github.com/apify/apify-sdk-python/issues/469 async with self._lock: await self._api_client.delete() diff --git a/src/apify/apify_storage_client/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py similarity index 94% rename from src/apify/apify_storage_client/_storage_client.py rename to src/apify/storage_clients/_apify/_storage_client.py index 1d4d66dd..b00ea9f3 100644 --- a/src/apify/apify_storage_client/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -14,7 +14,7 @@ class ApifyStorageClient(StorageClient): """Apify storage client.""" @override - async def open_dataset_client( + async def create_dataset_client( self, *, id: str | None = None, @@ -31,7 +31,7 @@ async def open_dataset_client( return client @override - async def open_key_value_store_client( + async def create_kvs_client( self, *, id: str | None = None, @@ -48,7 +48,7 @@ async def open_key_value_store_client( return client @override - async def open_request_queue_client( + async def create_rq_client( self, *, id: str | None = None, diff --git a/src/apify/apify_storage_client/py.typed b/src/apify/storage_clients/_apify/py.typed similarity index 100% rename from src/apify/apify_storage_client/py.typed rename to src/apify/storage_clients/_apify/py.typed diff --git a/src/apify/storage_clients/py.typed b/src/apify/storage_clients/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index e9bd9e6a..422476e4 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -8,11 +8,11 @@ from pydantic import BaseModel, Field, TypeAdapter -from crawlee import Request from crawlee._types import HttpMethod from crawlee.http_clients import HttpClient, HttpxHttpClient from crawlee.request_loaders import RequestList as CrawleeRequestList +from apify import Request from apify._utils import docs_group URL_NO_COMMAS_REGEX = re.compile( diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b4e649af..6ec454b9 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -15,7 +15,6 @@ from apify_client import ApifyClient, ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType, ApifyEnvVars from crawlee import service_locator -from crawlee.storages import Dataset, KeyValueStore, RequestQueue import apify._actor from ._utils import generate_unique_resource_name @@ -53,27 +52,15 @@ def _prepare_test_env() -> None: # Set the environment variable for the local storage directory to the temporary path. monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) - # Reset the flags in the service locator to indicate that no services are explicitly set. This ensures - # a clean state, as services might have been set during a previous test and not reset properly. - service_locator._configuration_was_retrieved = False - service_locator._storage_client_was_retrieved = False - service_locator._event_manager_was_retrieved = False - # Reset the services in the service locator. service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None - # Clear creation-related caches to ensure no state is carried over between tests. 
- Dataset._cache_by_id.clear() - Dataset._cache_by_name.clear() - Dataset._default_instance = None - KeyValueStore._cache_by_id.clear() - KeyValueStore._cache_by_name.clear() - KeyValueStore._default_instance = None - RequestQueue._cache_by_id.clear() - RequestQueue._cache_by_name.clear() - RequestQueue._default_instance = None + # Reset the retrieval flags. + service_locator._configuration_was_retrieved = False + service_locator._event_manager_was_retrieved = False + service_locator._storage_client_was_retrieved = False # Verify that the test environment was set up correctly. assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 41cb7bb7..211cfc1f 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -3,10 +3,9 @@ from typing import TYPE_CHECKING from apify_shared.consts import ApifyEnvVars -from crawlee import Request from ._utils import generate_unique_resource_name -from apify import Actor +from apify import Actor, Request if TYPE_CHECKING: import pytest diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index e6d9f9f3..8c8cecec 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -79,7 +79,7 @@ async def test_add_non_unique_requests_in_batch( run_actor: RunActorFunction, ) -> None: async def main() -> None: - from crawlee import Request + from apify import Request async with Actor: desired_request_count = 100 diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index a8da8dd3..9a8aa7e8 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -3,7 +3,7 @@ import pytest from apify_shared.consts import ActorEnvVars -from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients import FileSystemStorageClient from apify import Actor @@ -34,8 +34,9 @@ async def test_open_dataset_returns_same_references() -> None: assert dataset_by_id_2 is dataset_by_id_1 +@pytest.mark.skip(reason='TODO: fix this test') async def test_open_dataset_uses_env_var(monkeypatch: pytest.MonkeyPatch) -> None: - memory_storage_client = MemoryStorageClient() + memory_storage_client = FileSystemStorageClient() default_dataset_id = 'my-new-default-id' monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, default_dataset_id) @@ -43,13 +44,13 @@ async def test_open_dataset_uses_env_var(monkeypatch: pytest.MonkeyPatch) -> Non async with Actor: ddt = await Actor.open_dataset() assert ddt.metadata.id == default_dataset_id - dataset = await memory_storage_client.open_dataset_client(id=ddt.metadata.id) + dataset = await memory_storage_client.create_dataset_client(id=ddt.metadata.id) await dataset.drop() async def test_push_data_to_dataset() -> None: - async with Actor as my_actor: - dataset = await my_actor.open_dataset() + async with Actor as actor: + dataset = await actor.open_dataset() desired_item_count = 100 await dataset.push_data([{'id': i} for i in range(desired_item_count)]) diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index e9eacdb2..4ac8d4a4 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -4,8 +4,9 @@ import string from datetime import datetime, timedelta from decimal import Decimal -from typing import TYPE_CHECKING, Any 
+from typing import Any +import pytest from pydantic_core import TzInfo from apify_shared.consts import ( @@ -21,9 +22,6 @@ from apify import Actor -if TYPE_CHECKING: - import pytest - async def test_actor_is_not_at_home_when_local() -> None: async with Actor as actor: @@ -31,6 +29,7 @@ async def test_actor_is_not_at_home_when_local() -> None: assert is_at_home is False +@pytest.mark.skip(reason='TODO: fix this test') async def test_get_env_with_randomized_env_vars(monkeypatch: pytest.MonkeyPatch) -> None: # noqa: PLR0912 ignored_env_vars = { ApifyEnvVars.INPUT_KEY, diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 16a9b78f..15a33907 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -3,8 +3,6 @@ import pytest from apify_shared.consts import ApifyEnvVars -from apify_shared.utils import json_dumps -from crawlee.storage_clients import MemoryStorageClient from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY from apify import Actor @@ -40,28 +38,20 @@ async def test_set_and_get_value() -> None: test_key = 'test_key' test_value = 'test_value' test_content_type = 'text/plain' - async with Actor as my_actor: - await my_actor.set_value(key=test_key, value=test_value, content_type=test_content_type) - value = await my_actor.get_value(key=test_key) + + async with Actor as actor: + await actor.set_value(key=test_key, value=test_value, content_type=test_content_type) + value = await actor.get_value(key=test_key) assert value == test_value async def test_get_input() -> None: - memory_storage_client = MemoryStorageClient() - input_key = 'INPUT' test_input = {'foo': 'bar'} - kvs_client = await memory_storage_client.open_key_value_store_client() - - await kvs_client.set_value( - key=input_key, - value=json_dumps(test_input), - content_type='application/json', - ) - - async with Actor as my_actor: - actor_input = await my_actor.get_input() + async with Actor as actor: + await actor.set_value(key=input_key, value=test_input) + actor_input = await actor.get_input() assert actor_input['foo'] == test_input['foo'] @@ -69,8 +59,6 @@ async def test_get_input_with_encrypted_secrets(monkeypatch: pytest.MonkeyPatch) monkeypatch.setenv(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_FILE, PRIVATE_KEY_PEM_BASE64) monkeypatch.setenv(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_PASSPHRASE, PRIVATE_KEY_PASSWORD) - memory_storage_client = MemoryStorageClient() - input_key = 'INPUT' secret_string = 'secret-string' encrypted_secret = public_encrypt(secret_string, public_key=PUBLIC_KEY) @@ -79,15 +67,8 @@ async def test_get_input_with_encrypted_secrets(monkeypatch: pytest.MonkeyPatch) 'secret': f'{ENCRYPTED_INPUT_VALUE_PREFIX}:{encrypted_secret["encrypted_password"]}:{encrypted_secret["encrypted_value"]}', # noqa: E501 } - kvs_client = await memory_storage_client.open_key_value_store_client() - - await kvs_client.set_value( - key=input_key, - value=json_dumps(input_with_secret), - content_type='application/json', - ) - - async with Actor as my_actor: - actor_input = await my_actor.get_input() + async with Actor as actor: + await actor.set_value(key=input_key, value=input_with_secret) + actor_input = await actor.get_input() assert actor_input['foo'] == input_with_secret['foo'] assert actor_input['secret'] == secret_string diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index b1ad1178..a6943d3f 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -12,7 
+12,6 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee import service_locator -from crawlee.storages import Dataset, KeyValueStore, RequestQueue import apify._actor @@ -43,27 +42,15 @@ def _prepare_test_env() -> None: # Set the environment variable for the local storage directory to the temporary path. monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) - # Reset the flags in the service locator to indicate that no services are explicitly set. This ensures - # a clean state, as services might have been set during a previous test and not reset properly. - service_locator._configuration_was_retrieved = False - service_locator._storage_client_was_retrieved = False - service_locator._event_manager_was_retrieved = False - # Reset the services in the service locator. service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None - # Clear creation-related caches to ensure no state is carried over between tests. - Dataset._cache_by_id.clear() - Dataset._cache_by_name.clear() - Dataset._default_instance = None - KeyValueStore._cache_by_id.clear() - KeyValueStore._cache_by_name.clear() - KeyValueStore._default_instance = None - RequestQueue._cache_by_id.clear() - RequestQueue._cache_by_name.clear() - RequestQueue._default_instance = None + # Reset the retrieval flags. + service_locator._configuration_was_retrieved = False + service_locator._event_manager_was_retrieved = False + service_locator._storage_client_was_retrieved = False # Verify that the test environment was set up correctly. assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index d1481a98..2b8f0ab7 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -5,9 +5,9 @@ import pytest from scrapy import Request, Spider -from crawlee import Request as CrawleeRequest from crawlee._types import HttpHeaders +from apify import Request as ApifyRequest from apify.scrapy.requests import to_scrapy_request @@ -23,7 +23,7 @@ def spider() -> DummySpider: def test_without_reconstruction(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://example.com', method='GET', unique_key='https://example.com', @@ -42,7 +42,7 @@ def test_without_reconstruction(spider: Spider) -> None: def test_without_reconstruction_with_optional_fields(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://crawlee.dev', method='GET', unique_key='https://crawlee.dev', @@ -67,7 +67,7 @@ def test_without_reconstruction_with_optional_fields(spider: Spider) -> None: def test_with_reconstruction(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://apify.com', method='GET', id='fvwscO2UJLdr10B', @@ -89,7 +89,7 @@ def test_with_reconstruction(spider: Spider) -> None: def test_with_reconstruction_with_optional_fields(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://apify.com', method='GET', id='fvwscO2UJLdr10B', @@ -116,7 +116,7 @@ def 
test_with_reconstruction_with_optional_fields(spider: Spider) -> None: def test_invalid_request_for_reconstruction(spider: Spider) -> None: - apify_request = CrawleeRequest( + apify_request = ApifyRequest( url='https://example.com', method='GET', id='invalid123', diff --git a/uv.lock b/uv.lock index b1d8420d..7abb8dbf 100644 --- a/uv.lock +++ b/uv.lock @@ -37,6 +37,7 @@ source = { editable = "." } dependencies = [ { name = "apify-client" }, { name = "apify-shared" }, + { name = "cachetools" }, { name = "crawlee" }, { name = "cryptography" }, { name = "httpx" }, @@ -66,12 +67,14 @@ dev = [ { name = "respx" }, { name = "ruff" }, { name = "setuptools" }, + { name = "types-cachetools" }, ] [package.metadata] requires-dist = [ { name = "apify-client", specifier = ">=1.11.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, + { name = "cachetools", specifier = ">=5.5.0" }, { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, @@ -98,6 +101,7 @@ dev = [ { name = "respx", specifier = "~=0.22.0" }, { name = "ruff", specifier = "~=0.11.0" }, { name = "setuptools" }, + { name = "types-cachetools", specifier = ">=6.0.0.20250525" }, ] [[package]] @@ -2303,6 +2307,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/33/38da585b06978d262cc2b2b45bc57ee75f0ce5e0b4ef1cab1b86461e9298/typeapi-2.2.4-py3-none-any.whl", hash = "sha256:bd6d5e5907fa47e0303bf254e7cc8712d4be4eb26d7ffaedb67c9e7844c53bb8", size = 26387, upload-time = "2025-01-29T11:40:12.328Z" }, ] +[[package]] +name = "types-cachetools" +version = "6.0.0.20250525" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/d0/55ff0eeda141436c1bd2142cd026906870c661b3f7755070d6da7ea7210f/types_cachetools-6.0.0.20250525.tar.gz", hash = "sha256:baf06f234cac3aeb44c07893447ba03ecdb6c0742ba2607e28a35d38e6821b02", size = 8925, upload-time = "2025-05-25T03:13:53.498Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8c/4ab0a17ece30fe608270b89cf066387051862899fff9f54ab12511fc7fdd/types_cachetools-6.0.0.20250525-py3-none-any.whl", hash = "sha256:1de8f0fe4bdcb187a48d2026c1e3672830f67943ad2bf3486abe031b632f1252", size = 8938, upload-time = "2025-05-25T03:13:52.406Z" }, +] + [[package]] name = "typing-extensions" version = "4.14.0" From a3d68a2656224dc1191396e7570455cf1164c2c1 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 25 Jun 2025 15:08:24 +0200 Subject: [PATCH 06/44] Fix default storage IDs --- pyproject.toml | 2 +- .../storage_clients/_apify/_dataset_client.py | 35 ++++++++++++++++--- .../_apify/_key_value_store_client.py | 33 ++++++++++++++--- .../_apify/_request_queue_client.py | 34 +++++++++++++++--- .../storage_clients/_apify/_storage_client.py | 24 ++----------- .../integration/actor_source_base/Dockerfile | 4 +++ tests/integration/conftest.py | 1 + .../integration/test_actor_key_value_store.py | 18 ++++++++-- tests/unit/conftest.py | 1 + uv.lock | 4 +-- 10 files changed, 116 insertions(+), 40 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 21ea7e14..c5c1fa00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.11.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@new-storage-clients", + "crawlee@git+https://github.com/apify/crawlee-python.git@1cbf15e13af882c864b87f8ed48252bcb3747993", 
"cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 31c97127..1d0a9dc5 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import os from logging import getLogger from typing import TYPE_CHECKING, Any @@ -86,11 +87,35 @@ async def open( apify_datasets_client = apify_client_async.datasets() - metadata = DatasetMetadata.model_validate( - await apify_datasets_client.get_or_create(name=id if id is not None else name), - ) - - apify_dataset_client = apify_client_async.dataset(dataset_id=metadata.id) + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # If name is provided, get or create the storage by name. + if name is not None and id is None: + id = DatasetMetadata.model_validate( + await apify_datasets_client.get_or_create(name=name), + ).id + + # If both id and name are None, try to get the default storage ID from environment variables. + if id is None and name is None: + id = os.environ.get( + 'ACTOR_DEFAULT_DATASET_ID', + None, + ) or os.environ.get( + 'APIFY_DEFAULT_DATASET_ID', + None, + ) + + if id is None: + raise ValueError( + 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' + ) + + # Get the client for the specific storage by ID. + apify_dataset_client = apify_client_async.dataset(dataset_id=id) + + # Fetch its metadata. + metadata = DatasetMetadata.model_validate(await apify_dataset_client.get()) return cls( id=metadata.id, diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 0588493d..ee24cedd 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import os from logging import getLogger from typing import TYPE_CHECKING, Any @@ -88,11 +89,35 @@ async def open( apify_kvss_client = apify_client_async.key_value_stores() - metadata = KeyValueStoreMetadata.model_validate( - await apify_kvss_client.get_or_create(name=id if id is not None else name), - ) + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # If name is provided, get or create the storage by name. + if name is not None and id is None: + id = KeyValueStoreMetadata.model_validate( + await apify_kvss_client.get_or_create(name=name), + ).id + + # If both id and name are None, try to get the default storage ID from environment variables. + if id is None and name is None: + id = os.environ.get( + 'ACTOR_DEFAULT_KEY_VALUE_STORE_ID', + None, + ) or os.environ.get( + 'APIFY_DEFAULT_KEY_VALUE_STORE_ID', + None, + ) + + if id is None: + raise ValueError( + 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' + ) + + # Get the client for the specific storage by ID. + apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) - apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=metadata.id) + # Fetch its metadata. 
+ metadata = KeyValueStoreMetadata.model_validate(await apify_kvs_client.get()) return cls( id=metadata.id, diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 95b276b3..a7de7a3b 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import os from collections import deque from datetime import datetime, timedelta, timezone from logging import getLogger @@ -118,12 +119,35 @@ async def open( apify_rqs_client = apify_client_async.request_queues() - # Get or create the request queue - metadata = RequestQueueMetadata.model_validate( - await apify_rqs_client.get_or_create(name=id if id is not None else name), - ) + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # If name is provided, get or create the storage by name. + if name is not None and id is None: + id = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(name=name), + ).id + + # If both id and name are None, try to get the default storage ID from environment variables. + if id is None and name is None: + id = os.environ.get( + 'ACTOR_DEFAULT_REQUEST_QUEUE_ID', + None, + ) or os.environ.get( + 'APIFY_DEFAULT_REQUEST_QUEUE_ID', + None, + ) + + if id is None: + raise ValueError( + 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' + ) + + # Get the client for the specific storage by ID. + apify_rq_client = apify_client_async.request_queue(request_queue_id=id) - apify_rq_client = apify_client_async.request_queue(request_queue_id=metadata.id) + # Fetch its metadata. 
+ metadata = RequestQueueMetadata.model_validate(await apify_rq_client.get()) # Create the client instance return cls( diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index b00ea9f3..04904ab3 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -22,13 +22,7 @@ async def create_dataset_client( configuration: Configuration | None = None, ) -> ApifyDatasetClient: configuration = configuration or Configuration.get_global_configuration() - client = await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start: - await client.drop() - client = await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) - - return client + return await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) @override async def create_kvs_client( @@ -39,13 +33,7 @@ async def create_kvs_client( configuration: Configuration | None = None, ) -> ApifyKeyValueStoreClient: configuration = configuration or Configuration.get_global_configuration() - client = await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start: - await client.drop() - client = await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) - - return client + return await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) @override async def create_rq_client( @@ -56,10 +44,4 @@ async def create_rq_client( configuration: Configuration | None = None, ) -> ApifyRequestQueueClient: configuration = configuration or Configuration.get_global_configuration() - client = await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) - - if configuration.purge_on_start: - await client.drop() - client = await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) - - return client + return await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) diff --git a/tests/integration/actor_source_base/Dockerfile b/tests/integration/actor_source_base/Dockerfile index b65eab68..9edfb387 100644 --- a/tests/integration/actor_source_base/Dockerfile +++ b/tests/integration/actor_source_base/Dockerfile @@ -3,6 +3,10 @@ FROM apify/actor-python:BASE_IMAGE_VERSION_PLACEHOLDER COPY . ./ +RUN apt-get update && apt-get install -y \ + git \ + && rm -rf /var/lib/apt/lists/* + RUN echo "Python version:" \ && python --version \ && echo "Pip version:" \ diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 6ec454b9..6c06e5a9 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -56,6 +56,7 @@ def _prepare_test_env() -> None: service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None + service_locator._storage_instance_manager = None # Reset the retrieval flags. 
service_locator._configuration_was_retrieved = False diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 8b54f8a9..b4071ae9 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -201,11 +201,25 @@ async def test_generate_public_url_for_kvs_record( run_actor: RunActorFunction, ) -> None: async def main() -> None: + import os + from apify._crypto import create_hmac_signature async with Actor: public_api_url = Actor.config.api_public_base_url - default_store_id = Actor.config.default_key_value_store_id + + default_kvs_id = ( + os.environ.get( + 'ACTOR_DEFAULT_KEY_VALUE_STORE_ID', + None, + ) + or os.environ.get( + 'APIFY_DEFAULT_KEY_VALUE_STORE_ID', + None, + ) + or 'default' + ) + record_key = 'public-record-key' kvs = await Actor.open_key_value_store() @@ -221,7 +235,7 @@ async def main() -> None: signature = create_hmac_signature(url_signing_secret_key, record_key) assert ( record_url - == f'{public_api_url}/v2/key-value-stores/{default_store_id}/records/{record_key}?signature={signature}' + == f'{public_api_url}/v2/key-value-stores/{default_kvs_id}/records/{record_key}?signature={signature}' ) actor = await make_actor(label='kvs-get-public-url', main_func=main) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index a6943d3f..1454cf2e 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -46,6 +46,7 @@ def _prepare_test_env() -> None: service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None + service_locator._storage_instance_manager = None # Reset the retrieval flags. service_locator._configuration_was_retrieved = False diff --git a/uv.lock b/uv.lock index 7abb8dbf..bccb7875 100644 --- a/uv.lock +++ b/uv.lock @@ -75,7 +75,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.11.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=1cbf15e13af882c864b87f8ed48252bcb3747993" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -636,7 +636,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.11" -source = { git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients#78efb4ddf234e731a1c784a2280a8b1bec812573" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=1cbf15e13af882c864b87f8ed48252bcb3747993#1cbf15e13af882c864b87f8ed48252bcb3747993" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, From c77e8d52036befd3623882e18ea16c0ef5484115 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 26 Jun 2025 10:42:22 +0200 Subject: [PATCH 07/44] Fix integration test and Not implemented exception in purge --- src/apify/storage_clients/_apify/_dataset_client.py | 8 ++++---- .../storage_clients/_apify/_key_value_store_client.py | 8 ++++---- src/apify/storage_clients/_apify/_request_queue_client.py | 8 ++++---- tests/integration/test_actor_api_helpers.py | 3 --- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 1d0a9dc5..c820bc15 100644 --- 
a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -129,10 +129,10 @@ async def open( @override async def purge(self) -> None: - # TODO: better? - # https://github.com/apify/apify-sdk-python/issues/469 - async with self._lock: - await self._api_client.delete() + raise NotImplementedError( + 'Purging datasets is not supported in the Apify platform. ' + 'Use the `drop` method to delete the dataset instead.' + ) @override async def drop(self) -> None: diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index ee24cedd..b8e479ee 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -130,10 +130,10 @@ async def open( @override async def purge(self) -> None: - # TODO: better? - # https://github.com/apify/apify-sdk-python/issues/469 - async with self._lock: - await self._api_client.delete() + raise NotImplementedError( + 'Purging key-value stores is not supported in the Apify platform. ' + 'Use the `drop` method to delete the key-value store instead.' + ) @override async def drop(self) -> None: diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index a7de7a3b..b1631377 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -166,10 +166,10 @@ async def open( @override async def purge(self) -> None: - # TODO: better? - # https://github.com/apify/apify-sdk-python/issues/469 - async with self._lock: - await self._api_client.delete() + raise NotImplementedError( + 'Purging the request queue is not supported in the Apify platform. ' + 'Use the `drop` method to delete the request queue instead.' 
+ ) @override async def drop(self) -> None: diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index 5327af9c..47ecfb66 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -46,9 +46,6 @@ async def main() -> None: assert len(env_dict.get('actor_id', '')) == 17 assert len(env_dict.get('actor_run_id', '')) == 17 assert len(env_dict.get('user_id', '')) == 17 - assert len(env_dict.get('default_dataset_id', '')) == 17 - assert len(env_dict.get('default_key_value_store_id', '')) == 17 - assert len(env_dict.get('default_request_queue_id', '')) == 17 actor = await make_actor(label='get-env', main_func=main) run_result = await run_actor(actor) From 8731affc07753536144523839f1ce793798ab202 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 26 Jun 2025 14:48:58 +0200 Subject: [PATCH 08/44] Fix unit tests --- Makefile | 6 +++--- pyproject.toml | 2 +- src/apify/_actor.py | 18 ++++++++++++++++- .../storage_clients/_apify/_dataset_client.py | 5 +++-- .../_apify/_key_value_store_client.py | 5 +++-- .../_apify/_request_queue_client.py | 5 +++-- tests/integration/test_actor_dataset.py | 4 ++++ tests/unit/actor/test_actor_dataset.py | 20 ------------------- tests/unit/actor/test_actor_env_helpers.py | 10 ++++++---- .../unit/actor/test_actor_key_value_store.py | 2 -- tests/unit/actor/test_actor_request_queue.py | 2 -- uv.lock | 4 ++-- 12 files changed, 42 insertions(+), 41 deletions(-) diff --git a/Makefile b/Makefile index 707ebec7..73f69455 100644 --- a/Makefile +++ b/Makefile @@ -26,13 +26,13 @@ type-check: uv run mypy unit-tests: - uv run pytest --numprocesses=auto --verbose --cov=src/apify tests/unit + uv run pytest --numprocesses=auto -vv --cov=src/apify tests/unit unit-tests-cov: - uv run pytest --numprocesses=auto --verbose --cov=src/apify --cov-report=html tests/unit + uv run pytest --numprocesses=auto -vv --cov=src/apify --cov-report=html tests/unit integration-tests: - uv run pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) --verbose tests/integration + uv run pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) -vv tests/integration format: uv run ruff check --fix diff --git a/pyproject.toml b/pyproject.toml index c5c1fa00..21ea7e14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.11.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@1cbf15e13af882c864b87f8ed48252bcb3747993", + "crawlee@git+https://github.com/apify/crawlee-python.git@new-storage-clients", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 99457a5d..4fc093f0 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -682,7 +682,23 @@ def get_env(self) -> dict: config[alias] = getattr(self._configuration, field_name) env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]} - return {option_name: config[env_var] for env_var, option_name in env_vars.items() if env_var in config} + result = {option_name: config[env_var] for env_var, option_name in env_vars.items() if env_var in config} + + # These environment variables are not part of the Configuration model, + # so we need to add them manually to the result dictionary. 
+ result[ActorEnvVars.DEFAULT_DATASET_ID.name.lower()] = os.environ.get( + ActorEnvVars.DEFAULT_DATASET_ID.value + ) or os.environ.get(ApifyEnvVars.DEFAULT_DATASET_ID.value) + + result[ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID.name.lower()] = os.environ.get( + ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value + ) or os.environ.get(ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value) + + result[ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID.name.lower()] = os.environ.get( + ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID.value + ) or os.environ.get(ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID.value) + + return result async def start( self, diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index c820bc15..fabcbafc 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -8,6 +8,7 @@ from typing_extensions import override from apify_client import ApifyClientAsync +from apify_shared.consts import ActorEnvVars, ApifyEnvVars from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -99,10 +100,10 @@ async def open( # If both id and name are None, try to get the default storage ID from environment variables. if id is None and name is None: id = os.environ.get( - 'ACTOR_DEFAULT_DATASET_ID', + ActorEnvVars.DEFAULT_DATASET_ID.value, None, ) or os.environ.get( - 'APIFY_DEFAULT_DATASET_ID', + ApifyEnvVars.DEFAULT_DATASET_ID.value, None, ) diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index b8e479ee..9bfac104 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -9,6 +9,7 @@ from yarl import URL from apify_client import ApifyClientAsync +from apify_shared.consts import ActorEnvVars, ApifyEnvVars from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata @@ -101,10 +102,10 @@ async def open( # If both id and name are None, try to get the default storage ID from environment variables. if id is None and name is None: id = os.environ.get( - 'ACTOR_DEFAULT_KEY_VALUE_STORE_ID', + ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value, None, ) or os.environ.get( - 'APIFY_DEFAULT_KEY_VALUE_STORE_ID', + ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value, None, ) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index b1631377..e2213561 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -11,6 +11,7 @@ from typing_extensions import override from apify_client import ApifyClientAsync +from apify_shared.consts import ActorEnvVars, ApifyEnvVars from crawlee._utils.requests import unique_key_to_request_id from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata @@ -131,10 +132,10 @@ async def open( # If both id and name are None, try to get the default storage ID from environment variables. 
if id is None and name is None: id = os.environ.get( - 'ACTOR_DEFAULT_REQUEST_QUEUE_ID', + ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID.value, None, ) or os.environ.get( - 'APIFY_DEFAULT_REQUEST_QUEUE_ID', + ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID.value, None, ) diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index 52de59c5..eadf3585 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -37,6 +37,10 @@ async def main() -> None: assert len(list_page.items) == list_page.count == desired_item_count +import pytest + + +@pytest.mark.only async def test_push_large_data_chunks_over_9mb( make_actor: MakeActorFunction, run_actor: RunActorFunction, diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index 9a8aa7e8..8020c52e 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -2,14 +2,8 @@ import pytest -from apify_shared.consts import ActorEnvVars -from crawlee.storage_clients import FileSystemStorageClient - from apify import Actor -# NOTE: We only test the dataset methods available on Actor class/instance. -# Actual tests for the implementations are in storages/. - async def test_throws_error_without_actor_init() -> None: with pytest.raises(RuntimeError): @@ -34,20 +28,6 @@ async def test_open_dataset_returns_same_references() -> None: assert dataset_by_id_2 is dataset_by_id_1 -@pytest.mark.skip(reason='TODO: fix this test') -async def test_open_dataset_uses_env_var(monkeypatch: pytest.MonkeyPatch) -> None: - memory_storage_client = FileSystemStorageClient() - - default_dataset_id = 'my-new-default-id' - monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, default_dataset_id) - - async with Actor: - ddt = await Actor.open_dataset() - assert ddt.metadata.id == default_dataset_id - dataset = await memory_storage_client.create_dataset_client(id=ddt.metadata.id) - await dataset.drop() - - async def test_push_data_to_dataset() -> None: async with Actor as actor: dataset = await actor.open_dataset() diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index 4ac8d4a4..27fc1c39 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -4,9 +4,8 @@ import string from datetime import datetime, timedelta from decimal import Decimal -from typing import Any +from typing import TYPE_CHECKING, Any -import pytest from pydantic_core import TzInfo from apify_shared.consts import ( @@ -22,6 +21,9 @@ from apify import Actor +if TYPE_CHECKING: + import pytest + async def test_actor_is_not_at_home_when_local() -> None: async with Actor as actor: @@ -29,7 +31,6 @@ async def test_actor_is_not_at_home_when_local() -> None: assert is_at_home is False -@pytest.mark.skip(reason='TODO: fix this test') async def test_get_env_with_randomized_env_vars(monkeypatch: pytest.MonkeyPatch) -> None: # noqa: PLR0912 ignored_env_vars = { ApifyEnvVars.INPUT_KEY, @@ -43,6 +44,7 @@ async def test_get_env_with_randomized_env_vars(monkeypatch: pytest.MonkeyPatch) ApifyEnvVars.LOG_FORMAT, ApifyEnvVars.LOG_LEVEL, ActorEnvVars.STANDBY_PORT, + ApifyEnvVars.PERSIST_STORAGE, } legacy_env_vars = { @@ -58,7 +60,7 @@ async def test_get_env_with_randomized_env_vars(monkeypatch: pytest.MonkeyPatch) } # Set up random env vars - expected_get_env: dict[str, Any] = {} + expected_get_env = dict[str, Any]() expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO' for int_env_var in 
INTEGER_ENV_VARS: diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 15a33907..7877480e 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -10,8 +10,6 @@ from apify._crypto import public_encrypt -# NOTE: We only test the key-value store methods available on Actor class/instance. -# Actual tests for the implementations are in storages/. async def test_open_returns_same_references() -> None: async with Actor: kvs1 = await Actor.open_key_value_store() diff --git a/tests/unit/actor/test_actor_request_queue.py b/tests/unit/actor/test_actor_request_queue.py index 4450e5d1..ceb6e797 100644 --- a/tests/unit/actor/test_actor_request_queue.py +++ b/tests/unit/actor/test_actor_request_queue.py @@ -4,8 +4,6 @@ from apify import Actor -# NOTE: We only test the references here. Actual tests for the implementations are in storages/ - async def test_open_throws_without_init() -> None: with pytest.raises(RuntimeError): diff --git a/uv.lock b/uv.lock index bccb7875..588a4d96 100644 --- a/uv.lock +++ b/uv.lock @@ -75,7 +75,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.11.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=1cbf15e13af882c864b87f8ed48252bcb3747993" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -636,7 +636,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.11" -source = { git = "https://github.com/apify/crawlee-python.git?rev=1cbf15e13af882c864b87f8ed48252bcb3747993#1cbf15e13af882c864b87f8ed48252bcb3747993" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients#1cbf15e13af882c864b87f8ed48252bcb3747993" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, From 8dfaffb4b403641d26558f847587f16a3a2d2ec8 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 26 Jun 2025 14:51:39 +0200 Subject: [PATCH 09/44] fix lint --- tests/integration/test_actor_dataset.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index eadf3585..52de59c5 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -37,10 +37,6 @@ async def main() -> None: assert len(list_page.items) == list_page.count == desired_item_count -import pytest - - -@pytest.mark.only async def test_push_large_data_chunks_over_9mb( make_actor: MakeActorFunction, run_actor: RunActorFunction, From 53fad073fe5be3dfbefc911e5f91078af8cefa8b Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 26 Jun 2025 16:40:01 +0200 Subject: [PATCH 10/44] add KVS record_exists not implemented --- pyproject.toml | 2 +- src/apify/storage_clients/_apify/_key_value_store_client.py | 6 ++++++ uv.lock | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 21ea7e14..80caf49c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.11.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@new-storage-clients", + 
"crawlee@git+https://github.com/apify/crawlee-python.git@bc50990dd09eb5c2b66783b2fa62a8bc689a7737", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 9bfac104..35c5b920 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -199,6 +199,12 @@ async def iterate_keys( await self._update_metadata() + @override + async def record_exists(self, key: str) -> bool: + raise NotImplementedError( + 'Checking if a record exists is currently not supported in the Apify storage client. ' + ) + async def get_public_url(self, key: str) -> str: """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. diff --git a/uv.lock b/uv.lock index 588a4d96..fa2ef451 100644 --- a/uv.lock +++ b/uv.lock @@ -75,7 +75,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.11.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=bc50990dd09eb5c2b66783b2fa62a8bc689a7737" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -636,7 +636,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.11" -source = { git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients#1cbf15e13af882c864b87f8ed48252bcb3747993" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=bc50990dd09eb5c2b66783b2fa62a8bc689a7737#bc50990dd09eb5c2b66783b2fa62a8bc689a7737" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, From 5869f8ee6c8760cf9b943e69dfd6ec01e6aaad4d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 26 Jun 2025 16:56:50 +0200 Subject: [PATCH 11/44] update to apify client 1.12 and implement record exists --- pyproject.toml | 2 +- .../storage_clients/_apify/_key_value_store_client.py | 4 +--- uv.lock | 8 ++++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 80caf49c..78009166 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ keywords = [ "scraping", ] dependencies = [ - "apify-client>=1.11.0", + "apify-client>=1.12.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", "crawlee@git+https://github.com/apify/crawlee-python.git@bc50990dd09eb5c2b66783b2fa62a8bc689a7737", diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 35c5b920..b4f8dfc6 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -201,9 +201,7 @@ async def iterate_keys( @override async def record_exists(self, key: str) -> bool: - raise NotImplementedError( - 'Checking if a record exists is currently not supported in the Apify storage client. ' - ) + return await self._api_client.record_exists(key=key) async def get_public_url(self, key: str) -> str: """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. 
diff --git a/uv.lock b/uv.lock index fa2ef451..2580acb6 100644 --- a/uv.lock +++ b/uv.lock @@ -72,7 +72,7 @@ dev = [ [package.metadata] requires-dist = [ - { name = "apify-client", specifier = ">=1.11.0" }, + { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=bc50990dd09eb5c2b66783b2fa62a8bc689a7737" }, @@ -106,7 +106,7 @@ dev = [ [[package]] name = "apify-client" -version = "1.11.0" +version = "1.12.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "apify-shared" }, @@ -114,9 +114,9 @@ dependencies = [ { name = "httpx" }, { name = "more-itertools" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/49/44/b7cae857f2129d4093bc5a0a2267fcbba7905207a0b7cc424dc3c7c90291/apify_client-1.11.0.tar.gz", hash = "sha256:c2e151754c35be9bc7c1028bf7cb127aeb1ffa2fbd1ec1ad7e97b901deb32e08", size = 346095, upload-time = "2025-06-13T11:46:39.129Z" } +sdist = { url = "https://files.pythonhosted.org/packages/73/94/93bc6eca322e642a9f879b0c77005a83ea3977389f6462e1a6a784574d0a/apify_client-1.12.0.tar.gz", hash = "sha256:6b711be930d746a828a456b809abe882cf9e851e9571e5d8307591726e753ea7", size = 346892, upload-time = "2025-06-26T14:50:16.783Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/24/d3273bfe5b4a96fd60c8d554edbab99274fae8cb2347b96f2e3fa0bc4d5b/apify_client-1.11.0-py3-none-any.whl", hash = "sha256:9d691960bdbeee17624a2a82aafc4f0bfba9b48820a48f559b7eba76bf01cb3c", size = 82550, upload-time = "2025-06-13T11:46:37.483Z" }, + { url = "https://files.pythonhosted.org/packages/e7/93/f1e509e4b1c090fdd2f507caf3e1455067f4ca6d4cbbaf32fbf4b7a2139f/apify_client-1.12.0-py3-none-any.whl", hash = "sha256:be24c4a069af4d9b362452ae4d973142187633bbb296f0f6a85021cb4b0bb611", size = 82810, upload-time = "2025-06-26T14:50:15.288Z" }, ] [[package]] From 82e65fc733e09be80077ba3b5937d431a8dbd03e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 27 Jun 2025 08:43:31 +0200 Subject: [PATCH 12/44] Move default storage IDs to Configuration --- src/apify/_actor.py | 18 +----------------- src/apify/_configuration.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 4fc093f0..99457a5d 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -682,23 +682,7 @@ def get_env(self) -> dict: config[alias] = getattr(self._configuration, field_name) env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]} - result = {option_name: config[env_var] for env_var, option_name in env_vars.items() if env_var in config} - - # These environment variables are not part of the Configuration model, - # so we need to add them manually to the result dictionary. 
- result[ActorEnvVars.DEFAULT_DATASET_ID.name.lower()] = os.environ.get( - ActorEnvVars.DEFAULT_DATASET_ID.value - ) or os.environ.get(ApifyEnvVars.DEFAULT_DATASET_ID.value) - - result[ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID.name.lower()] = os.environ.get( - ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value - ) or os.environ.get(ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value) - - result[ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID.name.lower()] = os.environ.get( - ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID.value - ) or os.environ.get(ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID.value) - - return result + return {option_name: config[env_var] for env_var, option_name in env_vars.items() if env_var in config} async def start( self, diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 4e12304c..aa584055 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -140,6 +140,39 @@ class Configuration(CrawleeConfiguration): ), ] = None + default_dataset_id: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_default_dataset_id', + 'apify_default_dataset_id', + ), + description='Default dataset ID used by the Apify storage client when no ID or name is provided.', + ), + ] = 'default' + + default_key_value_store_id: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_default_key_value_store_id', + 'apify_default_key_value_store_id', + ), + description='Default key-value store ID for the Apify storage client when no ID or name is provided.', + ), + ] = 'default' + + default_request_queue_id: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_default_request_queue_id', + 'apify_default_request_queue_id', + ), + description='Default request queue ID for the Apify storage client when no ID or name is provided.', + ), + ] = 'default' + disable_outdated_warning: Annotated[ bool, Field( From 8de950bd5893e97bbba51152c5314bb962934aab Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 27 Jun 2025 10:10:30 +0200 Subject: [PATCH 13/44] opening storages get default id from config --- src/apify/storage_clients/_apify/_dataset_client.py | 10 +--------- .../storage_clients/_apify/_key_value_store_client.py | 10 +--------- .../storage_clients/_apify/_request_queue_client.py | 10 +--------- 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index fabcbafc..48265fb4 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -1,14 +1,12 @@ from __future__ import annotations import asyncio -import os from logging import getLogger from typing import TYPE_CHECKING, Any from typing_extensions import override from apify_client import ApifyClientAsync -from apify_shared.consts import ActorEnvVars, ApifyEnvVars from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -99,13 +97,7 @@ async def open( # If both id and name are None, try to get the default storage ID from environment variables. 
if id is None and name is None: - id = os.environ.get( - ActorEnvVars.DEFAULT_DATASET_ID.value, - None, - ) or os.environ.get( - ApifyEnvVars.DEFAULT_DATASET_ID.value, - None, - ) + id = getattr(configuration, 'default_dataset_id', None) if id is None: raise ValueError( diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index b4f8dfc6..ad74cd60 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import os from logging import getLogger from typing import TYPE_CHECKING, Any @@ -9,7 +8,6 @@ from yarl import URL from apify_client import ApifyClientAsync -from apify_shared.consts import ActorEnvVars, ApifyEnvVars from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata @@ -101,13 +99,7 @@ async def open( # If both id and name are None, try to get the default storage ID from environment variables. if id is None and name is None: - id = os.environ.get( - ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value, - None, - ) or os.environ.get( - ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID.value, - None, - ) + id = getattr(configuration, 'default_key_value_store_id', None) if id is None: raise ValueError( diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index e2213561..99c5480c 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import os from collections import deque from datetime import datetime, timedelta, timezone from logging import getLogger @@ -11,7 +10,6 @@ from typing_extensions import override from apify_client import ApifyClientAsync -from apify_shared.consts import ActorEnvVars, ApifyEnvVars from crawlee._utils.requests import unique_key_to_request_id from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata @@ -131,13 +129,7 @@ async def open( # If both id and name are None, try to get the default storage ID from environment variables. 
if id is None and name is None: - id = os.environ.get( - ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID.value, - None, - ) or os.environ.get( - ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID.value, - None, - ) + id = getattr(configuration, 'default_request_queue_id', None) if id is None: raise ValueError( From 98b76c5880c76490ce36a43b64e273ef2b5418bd Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 27 Jun 2025 10:50:39 +0200 Subject: [PATCH 14/44] Addressing more feedback --- src/apify/storage_clients/__init__.py | 3 +-- .../storage_clients/_apify/_dataset_client.py | 13 +++++++------ .../_apify/_key_value_store_client.py | 13 +++++++------ .../_apify/_request_queue_client.py | 14 +++++++------- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/apify/storage_clients/__init__.py b/src/apify/storage_clients/__init__.py index e8c98462..ca93ae43 100644 --- a/src/apify/storage_clients/__init__.py +++ b/src/apify/storage_clients/__init__.py @@ -1,4 +1,4 @@ -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from ._apify import ApifyStorageClient @@ -6,5 +6,4 @@ 'ApifyStorageClient', 'FileSystemStorageClient', 'MemoryStorageClient', - 'StorageClient', ] diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 48265fb4..aa9a3903 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -33,6 +33,7 @@ def __init__( modified_at: datetime, item_count: int, api_client: DatasetClientAsync, + lock: asyncio.Lock, ) -> None: """Initialize a new instance. @@ -50,7 +51,7 @@ def __init__( self._api_client = api_client """The Apify dataset client for API operations.""" - self._lock = asyncio.Lock() + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @property @@ -75,7 +76,10 @@ async def open( if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') - # Otherwise, create a new one. + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Create Apify client with the provided token and API URL. apify_client_async = ApifyClientAsync( token=token, api_url=api_url, @@ -83,12 +87,8 @@ async def open( min_delay_between_retries_millis=500, timeout_secs=360, ) - apify_datasets_client = apify_client_async.datasets() - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - # If name is provided, get or create the storage by name. if name is not None and id is None: id = DatasetMetadata.model_validate( @@ -118,6 +118,7 @@ async def open( modified_at=metadata.modified_at, item_count=metadata.item_count, api_client=apify_dataset_client, + lock=asyncio.Lock(), ) @override diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index ad74cd60..c95959da 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -36,6 +36,7 @@ def __init__( accessed_at: datetime, modified_at: datetime, api_client: KeyValueStoreClientAsync, + lock: asyncio.Lock, ) -> None: """Initialize a new instance. 
@@ -52,7 +53,7 @@ def __init__( self._api_client = api_client """The Apify key-value store client for API operations.""" - self._lock = asyncio.Lock() + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @property @@ -77,7 +78,10 @@ async def open( if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') - # Otherwise, create a new one. + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Create Apify client with the provided token and API URL. apify_client_async = ApifyClientAsync( token=token, api_url=api_url, @@ -85,12 +89,8 @@ async def open( min_delay_between_retries_millis=500, timeout_secs=360, ) - apify_kvss_client = apify_client_async.key_value_stores() - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - # If name is provided, get or create the storage by name. if name is not None and id is None: id = KeyValueStoreMetadata.model_validate( @@ -119,6 +119,7 @@ async def open( accessed_at=metadata.accessed_at, modified_at=metadata.modified_at, api_client=apify_kvs_client, + lock=asyncio.Lock(), ) @override diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 99c5480c..4be27829 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -49,6 +49,7 @@ def __init__( stats: dict, total_request_count: int, api_client: RequestQueueClientAsync, + lock: asyncio.Lock, ) -> None: """Initialize a new instance. @@ -70,7 +71,7 @@ def __init__( self._api_client = api_client """The Apify request queue client for API operations.""" - self._lock = asyncio.Lock() + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" self._queue_head = deque[str]() @@ -107,7 +108,10 @@ async def open( if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') - # Create a new API client + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # Create Apify client with the provided token and API URL. apify_client_async = ApifyClientAsync( token=token, api_url=api_url, @@ -115,12 +119,8 @@ async def open( min_delay_between_retries_millis=500, timeout_secs=360, ) - apify_rqs_client = apify_client_async.request_queues() - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - # If name is provided, get or create the storage by name. if name is not None and id is None: id = RequestQueueMetadata.model_validate( @@ -142,7 +142,6 @@ async def open( # Fetch its metadata. 
metadata = RequestQueueMetadata.model_validate(await apify_rq_client.get()) - # Create the client instance return cls( id=metadata.id, name=metadata.name, @@ -155,6 +154,7 @@ async def open( stats=metadata.stats, total_request_count=metadata.total_request_count, api_client=apify_rq_client, + lock=asyncio.Lock(), ) @override From 7b5ee07ea792f13b108b7bdc0d183a5264632e4e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 27 Jun 2025 13:37:14 +0200 Subject: [PATCH 15/44] Fixing integration test test_push_large_data_chunks_over_9mb --- .../storage_clients/_apify/_dataset_client.py | 85 ++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index aa9a3903..864c2c04 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -7,6 +7,8 @@ from typing_extensions import override from apify_client import ApifyClientAsync +from crawlee._utils.byte_size import ByteSize +from crawlee._utils.file import json_dumps from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -15,6 +17,7 @@ from datetime import datetime from apify_client.clients import DatasetClientAsync + from crawlee._types import JsonSerializable from crawlee.configuration import Configuration logger = getLogger(__name__) @@ -23,6 +26,15 @@ class ApifyDatasetClient(DatasetClient): """An Apify platform implementation of the dataset client.""" + _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9) + """Maximum size for a single payload.""" + + _SAFETY_BUFFER_PERCENT = 0.01 / 100 # 0.01% + """Percentage buffer to reduce payload limit slightly for safety.""" + + _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_PERCENT) + """Calculated payload limit considering safety buffer.""" + def __init__( self, *, @@ -135,8 +147,22 @@ async def drop(self) -> None: @override async def push_data(self, data: list[Any] | dict[str, Any]) -> None: + async def payloads_generator() -> AsyncIterator[str]: + for index, item in enumerate(data): + yield await self._check_and_serialize(item, index) + async with self._lock: - await self._api_client.push_items(items=data) + # Handle lists + if isinstance(data, list): + # Invoke client in series to preserve the order of data + async for items in self._chunk_by_size(payloads_generator()): + await self._api_client.push_items(items=items) + + # Handle singular items + else: + items = await self._check_and_serialize(data) + await self._api_client.push_items(items=items) + await self._update_metadata() @override @@ -205,3 +231,60 @@ async def _update_metadata(self) -> None: """Update the dataset metadata file with current information.""" metadata = await self._api_client.get() self._metadata = DatasetMetadata.model_validate(metadata) + + @classmethod + async def _check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str: + """Serialize a given item to JSON, checks its serializability and size against a limit. + + Args: + item: The item to serialize. + index: Index of the item, used for error context. + + Returns: + Serialized JSON string. + + Raises: + ValueError: If item is not JSON serializable or exceeds size limit. 
+ """ + s = ' ' if index is None else f' at index {index} ' + + try: + payload = await json_dumps(item) + except Exception as exc: + raise ValueError(f'Data item{s}is not serializable to JSON.') from exc + + payload_size = ByteSize(len(payload.encode('utf-8'))) + if payload_size > cls._EFFECTIVE_LIMIT_SIZE: + raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})') + + return payload + + async def _chunk_by_size(self, items: AsyncIterator[str]) -> AsyncIterator[str]: + """Yield chunks of JSON arrays composed of input strings, respecting a size limit. + + Groups an iterable of JSON string payloads into larger JSON arrays, ensuring the total size + of each array does not exceed `EFFECTIVE_LIMIT_SIZE`. Each output is a JSON array string that + contains as many payloads as possible without breaching the size threshold, maintaining the + order of the original payloads. Assumes individual items are below the size limit. + + Args: + items: Iterable of JSON string payloads. + + Yields: + Strings representing JSON arrays of payloads, each staying within the size limit. + """ + last_chunk_size = ByteSize(2) # Add 2 bytes for [] wrapper. + current_chunk = [] + + async for payload in items: + payload_size = ByteSize(len(payload.encode('utf-8'))) + + if last_chunk_size + payload_size <= self._EFFECTIVE_LIMIT_SIZE: + current_chunk.append(payload) + last_chunk_size += payload_size + ByteSize(1) # Add 1 byte for ',' separator. + else: + yield f'[{",".join(current_chunk)}]' + current_chunk = [payload] + last_chunk_size = payload_size + ByteSize(2) # Add 2 bytes for [] wrapper. + + yield f'[{",".join(current_chunk)}]' From afcb8c76989085d1f86f4e45dac14525582bb610 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 30 Jun 2025 12:54:00 +0200 Subject: [PATCH 16/44] Abstract open method is removed from storage clients --- pyproject.toml | 2 +- .../storage_clients/_apify/_dataset_client.py | 23 ++++++++++++- .../_apify/_key_value_store_client.py | 34 +++++++++++++++---- .../_apify/_request_queue_client.py | 24 ++++++++++++- .../integration/test_actor_key_value_store.py | 18 ++-------- uv.lock | 4 +-- 6 files changed, 78 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 78009166..ef075f11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.12.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@bc50990dd09eb5c2b66783b2fa62a8bc689a7737", + "crawlee@git+https://github.com/apify/crawlee-python.git@new-storage-clients", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 864c2c04..6efc8c5c 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -71,7 +71,6 @@ def __init__( def metadata(self) -> DatasetMetadata: return self._metadata - @override @classmethod async def open( cls, @@ -80,6 +79,28 @@ async def open( name: str | None, configuration: Configuration, ) -> ApifyDatasetClient: + """Open an Apify dataset client. + + This method creates and initializes a new instance of the Apify dataset client. + It handles authentication, storage lookup/creation, and metadata retrieval. + + Args: + id: The ID of an existing dataset to open. 
If provided, the client will connect to this specific storage. + Cannot be used together with `name`. + name: The name of a dataset to get or create. If a storage with this name exists, it will be opened; + otherwise, a new one will be created. Cannot be used together with `id`. + configuration: The configuration object containing API credentials and settings. Must include a valid + `token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither + `id` nor `name` is provided. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name` + are provided, or if neither `id` nor `name` is provided and no default storage ID is available in + the configuration. + """ token = getattr(configuration, 'token', None) if not token: raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index c95959da..1fd12470 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -61,7 +61,6 @@ def __init__( def metadata(self) -> KeyValueStoreMetadata: return self._metadata - @override @classmethod async def open( cls, @@ -70,6 +69,28 @@ async def open( name: str | None, configuration: Configuration, ) -> ApifyKeyValueStoreClient: + """Open an Apify key-value store client. + + This method creates and initializes a new instance of the Apify key-value store client. + It handles authentication, storage lookup/creation, and metadata retrieval. + + Args: + id: The ID of an existing key-value store to open. If provided, the client will connect to this specific + storage. Cannot be used together with `name`. + name: The name of a key-value store to get or create. If a storage with this name exists, it will be + opened; otherwise, a new one will be created. Cannot be used together with `id`. + configuration: The configuration object containing API credentials and settings. Must include a valid + `token` and `api_base_url`. May also contain a `default_key_value_store_id` for fallback when + neither `id` nor `name` is provided. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name` + are provided, or if neither `id` nor `name` is provided and no default storage ID is available + in the configuration. + """ token = getattr(configuration, 'token', None) if not token: raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') @@ -201,6 +222,9 @@ async def get_public_url(self, key: str) -> str: Args: key: The key for which the URL should be generated. + + Returns: + A public URL that can be used to access the value of the given key in the KVS. 
""" if self._api_client.resource_id is None: raise ValueError('resource_id cannot be None when generating a public URL') @@ -209,11 +233,9 @@ async def get_public_url(self, key: str) -> str: URL(self._api_client.base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key ) - key_value_store = self.metadata - - if key_value_store and key_value_store.model_extra: - url_signing_secret_key = key_value_store.model_extra.get('urlSigningSecretKey') - if url_signing_secret_key: + if self.metadata.model_extra is not None: + url_signing_secret_key = self.metadata.model_extra.get('urlSigningSecretKey') + if url_signing_secret_key is not None: public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) return str(public_url) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 4be27829..159d663a 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -91,7 +91,6 @@ def __init__( def metadata(self) -> RequestQueueMetadata: return self._metadata - @override @classmethod async def open( cls, @@ -100,6 +99,29 @@ async def open( name: str | None, configuration: Configuration, ) -> ApifyRequestQueueClient: + """Open an Apify request queue client. + + This method creates and initializes a new instance of the Apify request queue client. It handles + authentication, storage lookup/creation, and metadata retrieval, and sets up internal caching and queue + management structures. + + Args: + id: The ID of an existing request queue to open. If provided, the client will connect to this specific + storage. Cannot be used together with `name`. + name: The name of a request queue to get or create. If a storage with this name exists, it will be opened; + otherwise, a new one will be created. Cannot be used together with `id`. + configuration: The configuration object containing API credentials and settings. Must include a valid + `token` and `api_base_url`. May also contain a `default_request_queue_id` for fallback when neither + `id` nor `name` is provided. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name` + are provided, or if neither `id` nor `name` is provided and no default storage ID is available + in the configuration. 
+ """ token = getattr(configuration, 'token', None) if not token: raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index b4071ae9..0009fa10 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -201,30 +201,16 @@ async def test_generate_public_url_for_kvs_record( run_actor: RunActorFunction, ) -> None: async def main() -> None: - import os - from apify._crypto import create_hmac_signature async with Actor: public_api_url = Actor.config.api_public_base_url - - default_kvs_id = ( - os.environ.get( - 'ACTOR_DEFAULT_KEY_VALUE_STORE_ID', - None, - ) - or os.environ.get( - 'APIFY_DEFAULT_KEY_VALUE_STORE_ID', - None, - ) - or 'default' - ) - + default_kvs_id = Actor.config.default_key_value_store_id record_key = 'public-record-key' kvs = await Actor.open_key_value_store() + assert kvs.metadata.model_extra is not None - assert isinstance(kvs.metadata.model_extra, dict) url_signing_secret_key = kvs.metadata.model_extra.get('urlSigningSecretKey') assert url_signing_secret_key is not None diff --git a/uv.lock b/uv.lock index 2580acb6..38ebb8e0 100644 --- a/uv.lock +++ b/uv.lock @@ -75,7 +75,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=bc50990dd09eb5c2b66783b2fa62a8bc689a7737" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -636,7 +636,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.11" -source = { git = "https://github.com/apify/crawlee-python.git?rev=bc50990dd09eb5c2b66783b2fa62a8bc689a7737#bc50990dd09eb5c2b66783b2fa62a8bc689a7737" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients#d6c9877b5e09a32db4c6b1e5541af196a9c6b4e8" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, From 3bacab74d101315e3fc0e6e8e6da7e4761e14c6f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 30 Jun 2025 15:04:04 +0200 Subject: [PATCH 17/44] fixing generate public url for KVS records --- .../storage_clients/_apify/_dataset_client.py | 36 ++++++-------- .../_apify/_key_value_store_client.py | 37 +++++++-------- .../_apify/_request_queue_client.py | 47 ++++++------------- .../integration/test_actor_key_value_store.py | 8 ++-- 4 files changed, 51 insertions(+), 77 deletions(-) diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 6efc8c5c..b5c1ea59 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -14,7 +14,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from datetime import datetime from apify_client.clients import DatasetClientAsync from crawlee._types import JsonSerializable @@ -38,31 +37,23 @@ class ApifyDatasetClient(DatasetClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, - item_count: int, + metadata: DatasetMetadata, api_client: DatasetClientAsync, + 
api_public_base_url: str, lock: asyncio.Lock, ) -> None: """Initialize a new instance. Preferably use the `ApifyDatasetClient.open` class method to create a new instance. """ - self._metadata = DatasetMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - item_count=item_count, - ) + self._metadata = metadata self._api_client = api_client """The Apify dataset client for API operations.""" + self._api_public_base_url = api_public_base_url + """The public base URL for accessing the key-value store records.""" + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @@ -109,6 +100,13 @@ async def open( if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') + api_public_base_url = getattr(configuration, 'api_public_base_url', None) + if not api_public_base_url: + raise ValueError( + 'Apify storage client requires a valid API public base URL in Configuration ' + f'(api_public_base_url={api_public_base_url}).' + ) + if id and name: raise ValueError('Only one of "id" or "name" can be specified, not both.') @@ -144,13 +142,9 @@ async def open( metadata = DatasetMetadata.model_validate(await apify_dataset_client.get()) return cls( - id=metadata.id, - name=metadata.name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, - item_count=metadata.item_count, + metadata=metadata, api_client=apify_dataset_client, + api_public_base_url=api_public_base_url, lock=asyncio.Lock(), ) diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 1fd12470..54c6dd17 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -16,7 +16,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from datetime import datetime from apify_client.clients import KeyValueStoreClientAsync from crawlee.configuration import Configuration @@ -30,28 +29,22 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, + metadata: KeyValueStoreMetadata, api_client: KeyValueStoreClientAsync, + api_public_base_url: str, lock: asyncio.Lock, ) -> None: """Initialize a new instance. Preferably use the `ApifyKeyValueStoreClient.open` class method to create a new instance. """ - self._metadata = KeyValueStoreMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - ) + self._metadata = metadata self._api_client = api_client - """The Apify key-value store client for API operations.""" + """The Apify KVS client for API operations.""" + + self._api_public_base_url = api_public_base_url + """The public base URL for accessing the key-value store records.""" self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @@ -99,6 +92,13 @@ async def open( if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') + api_public_base_url = getattr(configuration, 'api_public_base_url', None) + if not api_public_base_url: + raise ValueError( + 'Apify storage client requires a valid API public base URL in Configuration ' + f'(api_public_base_url={api_public_base_url}).' 
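# A minimal sketch of the open() contract enforced above, assuming the token and
# URLs are caller-supplied placeholders; exactly one of `id` or `name` may be
# passed, otherwise the default dataset id from the configuration is used.
from apify import Configuration
from apify.storage_clients._apify._dataset_client import ApifyDatasetClient


async def open_named_dataset() -> ApifyDatasetClient:
    config = Configuration(
        token='apify_api_placeholder',
        api_base_url='https://api.apify.com',
        api_public_base_url='https://api.apify.com',
    )
    return await ApifyDatasetClient.open(id=None, name='my-results', configuration=config)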
+ ) + if id and name: raise ValueError('Only one of "id" or "name" can be specified, not both.') @@ -134,12 +134,9 @@ async def open( metadata = KeyValueStoreMetadata.model_validate(await apify_kvs_client.get()) return cls( - id=metadata.id, - name=metadata.name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, + metadata=metadata, api_client=apify_kvs_client, + api_public_base_url=api_public_base_url, lock=asyncio.Lock(), ) @@ -230,7 +227,7 @@ async def get_public_url(self, key: str) -> str: raise ValueError('resource_id cannot be None when generating a public URL') public_url = ( - URL(self._api_client.base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key + URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key ) if self.metadata.model_extra is not None: diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 159d663a..41567578 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -38,39 +38,23 @@ class ApifyRequestQueueClient(RequestQueueClient): def __init__( self, *, - id: str, - name: str | None, - created_at: datetime, - accessed_at: datetime, - modified_at: datetime, - had_multiple_clients: bool, - handled_request_count: int, - pending_request_count: int, - stats: dict, - total_request_count: int, + metadata: RequestQueueMetadata, api_client: RequestQueueClientAsync, + api_public_base_url: str, lock: asyncio.Lock, ) -> None: """Initialize a new instance. Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. """ - self._metadata = RequestQueueMetadata( - id=id, - name=name, - created_at=created_at, - accessed_at=accessed_at, - modified_at=modified_at, - had_multiple_clients=had_multiple_clients, - handled_request_count=handled_request_count, - pending_request_count=pending_request_count, - stats=stats, - total_request_count=total_request_count, - ) + self._metadata = metadata self._api_client = api_client """The Apify request queue client for API operations.""" + self._api_public_base_url = api_public_base_url + """The public base URL for accessing the key-value store records.""" + self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @@ -130,6 +114,13 @@ async def open( if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') + api_public_base_url = getattr(configuration, 'api_public_base_url', None) + if not api_public_base_url: + raise ValueError( + 'Apify storage client requires a valid API public base URL in Configuration ' + f'(api_public_base_url={api_public_base_url}).' 
+ ) + if id and name: raise ValueError('Only one of "id" or "name" can be specified, not both.') @@ -165,17 +156,9 @@ async def open( metadata = RequestQueueMetadata.model_validate(await apify_rq_client.get()) return cls( - id=metadata.id, - name=metadata.name, - created_at=metadata.created_at, - accessed_at=metadata.accessed_at, - modified_at=metadata.modified_at, - had_multiple_clients=metadata.had_multiple_clients, - handled_request_count=metadata.handled_request_count, - pending_request_count=metadata.pending_request_count, - stats=metadata.stats, - total_request_count=metadata.total_request_count, + metadata=metadata, api_client=apify_rq_client, + api_public_base_url=api_public_base_url, lock=asyncio.Lock(), ) diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 0009fa10..4d3c30c8 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -217,13 +217,13 @@ async def main() -> None: await kvs.set_value(record_key, {'exposedData': 'test'}, 'application/json') record_url = await kvs.get_public_url(record_key) - signature = create_hmac_signature(url_signing_secret_key, record_key) - assert ( - record_url - == f'{public_api_url}/v2/key-value-stores/{default_kvs_id}/records/{record_key}?signature={signature}' + expected_record_url = ( + f'{public_api_url}/v2/key-value-stores/{default_kvs_id}/records/{record_key}?signature={signature}' ) + assert record_url == expected_record_url + actor = await make_actor(label='kvs-get-public-url', main_func=main) run_result = await run_actor(actor) From 287a1191eddd965b11fbad5d9df79db21d6a7cfa Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 1 Jul 2025 15:02:57 +0200 Subject: [PATCH 18/44] add async metadata getters --- .../code/conditional_actor_charge.py | 3 ++- pyproject.toml | 2 +- .../storage_clients/_apify/_dataset_client.py | 26 +++---------------- .../_apify/_key_value_store_client.py | 18 +++++-------- .../_apify/_request_queue_client.py | 6 ++--- tests/integration/test_actor_dataset.py | 7 ++--- .../integration/test_actor_key_value_store.py | 12 +++++---- tests/integration/test_actor_request_queue.py | 7 ++--- tests/unit/actor/test_actor_dataset.py | 5 ++-- .../unit/actor/test_actor_key_value_store.py | 5 ++-- tests/unit/actor/test_actor_request_queue.py | 5 ++-- uv.lock | 4 +-- 12 files changed, 42 insertions(+), 58 deletions(-) diff --git a/docs/03_concepts/code/conditional_actor_charge.py b/docs/03_concepts/code/conditional_actor_charge.py index 08e2d073..f4695cc4 100644 --- a/docs/03_concepts/code/conditional_actor_charge.py +++ b/docs/03_concepts/code/conditional_actor_charge.py @@ -6,7 +6,8 @@ async def main() -> None: # Check the dataset because there might already be items # if the run migrated or was restarted default_dataset = await Actor.open_dataset() - charged_items = default_dataset.metadata.item_count + metadata = await default_dataset.get_metadata() + charged_items = metadata.item_count # highlight-start if Actor.get_charging_manager().get_pricing_info().is_pay_per_event: diff --git a/pyproject.toml b/pyproject.toml index ef075f11..20af0608 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.12.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@new-storage-clients", + "crawlee@git+https://github.com/apify/crawlee-python.git@9dfac4b8afb8027979d85947f0db303f384b7158", "cryptography>=42.0.0", 
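# A short usage sketch of the new coroutine getters, assuming a normal Actor run:
# metadata is now fetched on demand from the API instead of being cached on the client.
from apify import Actor


async def log_storage_ids() -> None:
    async with Actor:
        dataset = await Actor.open_dataset()
        kvs = await Actor.open_key_value_store()
        rq = await Actor.open_request_queue()
        for storage in (dataset, kvs, rq):
            metadata = await storage.get_metadata()
            Actor.log.info(f'{type(storage).__name__} id: {metadata.id}')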
"httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index b5c1ea59..7c71a9fe 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -37,7 +37,6 @@ class ApifyDatasetClient(DatasetClient): def __init__( self, *, - metadata: DatasetMetadata, api_client: DatasetClientAsync, api_public_base_url: str, lock: asyncio.Lock, @@ -46,8 +45,6 @@ def __init__( Preferably use the `ApifyDatasetClient.open` class method to create a new instance. """ - self._metadata = metadata - self._api_client = api_client """The Apify dataset client for API operations.""" @@ -57,10 +54,10 @@ def __init__( self._lock = lock """A lock to ensure that only one operation is performed at a time.""" - @property @override - def metadata(self) -> DatasetMetadata: - return self._metadata + async def get_metadata(self) -> DatasetMetadata: + metadata = await self._api_client.get() + return DatasetMetadata.model_validate(metadata) @classmethod async def open( @@ -138,11 +135,7 @@ async def open( # Get the client for the specific storage by ID. apify_dataset_client = apify_client_async.dataset(dataset_id=id) - # Fetch its metadata. - metadata = DatasetMetadata.model_validate(await apify_dataset_client.get()) - return cls( - metadata=metadata, api_client=apify_dataset_client, api_public_base_url=api_public_base_url, lock=asyncio.Lock(), @@ -178,8 +171,6 @@ async def payloads_generator() -> AsyncIterator[str]: items = await self._check_and_serialize(data) await self._api_client.push_items(items=items) - await self._update_metadata() - @override async def get_data( self, @@ -209,9 +200,7 @@ async def get_data( flatten=flatten, view=view, ) - result = DatasetItemsListPage.model_validate(vars(response)) - await self._update_metadata() - return result + return DatasetItemsListPage.model_validate(vars(response)) @override async def iterate_items( @@ -240,13 +229,6 @@ async def iterate_items( ): yield item - await self._update_metadata() - - async def _update_metadata(self) -> None: - """Update the dataset metadata file with current information.""" - metadata = await self._api_client.get() - self._metadata = DatasetMetadata.model_validate(metadata) - @classmethod async def _check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str: """Serialize a given item to JSON, checks its serializability and size against a limit. diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 54c6dd17..2b501750 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -29,7 +29,6 @@ class ApifyKeyValueStoreClient(KeyValueStoreClient): def __init__( self, *, - metadata: KeyValueStoreMetadata, api_client: KeyValueStoreClientAsync, api_public_base_url: str, lock: asyncio.Lock, @@ -38,8 +37,6 @@ def __init__( Preferably use the `ApifyKeyValueStoreClient.open` class method to create a new instance. 
""" - self._metadata = metadata - self._api_client = api_client """The Apify KVS client for API operations.""" @@ -49,10 +46,10 @@ def __init__( self._lock = lock """A lock to ensure that only one operation is performed at a time.""" - @property @override - def metadata(self) -> KeyValueStoreMetadata: - return self._metadata + async def get_metadata(self) -> KeyValueStoreMetadata: + metadata = await self._api_client.get() + return KeyValueStoreMetadata.model_validate(metadata) @classmethod async def open( @@ -130,11 +127,7 @@ async def open( # Get the client for the specific storage by ID. apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) - # Fetch its metadata. - metadata = KeyValueStoreMetadata.model_validate(await apify_kvs_client.get()) - return cls( - metadata=metadata, api_client=apify_kvs_client, api_public_base_url=api_public_base_url, lock=asyncio.Lock(), @@ -229,9 +222,10 @@ async def get_public_url(self, key: str) -> str: public_url = ( URL(self._api_public_base_url) / 'v2' / 'key-value-stores' / self._api_client.resource_id / 'records' / key ) + metadata = await self.get_metadata() - if self.metadata.model_extra is not None: - url_signing_secret_key = self.metadata.model_extra.get('urlSigningSecretKey') + if metadata.model_extra is not None: + url_signing_secret_key = metadata.model_extra.get('urlSigningSecretKey') if url_signing_secret_key is not None: public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 41567578..fd17b6c3 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -70,10 +70,10 @@ def __init__( self._should_check_for_forefront_requests = False """Whether to check for forefront requests in the next list_head call.""" - @property @override - def metadata(self) -> RequestQueueMetadata: - return self._metadata + async def get_metadata(self) -> RequestQueueMetadata: + metadata = await self._api_client.get() + return RequestQueueMetadata.model_validate(metadata) @classmethod async def open( diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index 52de59c5..1cce4fd9 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -104,8 +104,9 @@ async def main() -> None: dataset_by_name_2 = await Actor.open_dataset(name=dataset_name) assert dataset_by_name_1 is dataset_by_name_2 - dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) - dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) + dataset_1_metadata = await dataset_by_name_1.get_metadata() + dataset_by_id_1 = await Actor.open_dataset(id=dataset_1_metadata.id) + dataset_by_id_2 = await Actor.open_dataset(id=dataset_1_metadata.id) assert dataset_by_id_1 is dataset_by_name_1 assert dataset_by_id_2 is dataset_by_id_1 @@ -129,7 +130,7 @@ async def test_force_cloud( async with Actor: dataset = await Actor.open_dataset(name=dataset_name, force_cloud=True) - dataset_id = dataset.metadata.id + dataset_id = (await dataset.get_metadata()).id await dataset.push_data(dataset_item) diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 4d3c30c8..3d0fc22b 100644 --- a/tests/integration/test_actor_key_value_store.py +++ 
b/tests/integration/test_actor_key_value_store.py @@ -45,8 +45,9 @@ async def main() -> None: kvs_by_name_2 = await Actor.open_key_value_store(name=kvs_name) assert kvs_by_name_1 is kvs_by_name_2 - kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) - kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) + kvs_1_metadata = await kvs_by_name_1.get_metadata() + kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_1_metadata.id) + kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_1_metadata.id) assert kvs_by_id_1 is kvs_by_name_1 assert kvs_by_id_2 is kvs_by_id_1 @@ -69,7 +70,7 @@ async def test_force_cloud( async with Actor: key_value_store = await Actor.open_key_value_store(name=key_value_store_name, force_cloud=True) - key_value_store_id = key_value_store.metadata.id + key_value_store_id = (await key_value_store.get_metadata()).id await key_value_store.set_value('foo', 'bar') @@ -209,9 +210,10 @@ async def main() -> None: record_key = 'public-record-key' kvs = await Actor.open_key_value_store() - assert kvs.metadata.model_extra is not None + metadata = await kvs.get_metadata() + assert metadata.model_extra is not None - url_signing_secret_key = kvs.metadata.model_extra.get('urlSigningSecretKey') + url_signing_secret_key = metadata.model_extra.get('urlSigningSecretKey') assert url_signing_secret_key is not None await kvs.set_value(record_key, {'exposedData': 'test'}, 'application/json') diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 211cfc1f..9689367a 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -45,8 +45,9 @@ async def main() -> None: rq_by_name_2 = await Actor.open_request_queue(name=rq_name) assert rq_by_name_1 is rq_by_name_2 - rq_by_id_1 = await Actor.open_request_queue(id=rq_by_name_1.metadata.id) - rq_by_id_2 = await Actor.open_request_queue(id=rq_by_name_1.metadata.id) + rq_1_metadata = await rq_by_name_1.get_metadata() + rq_by_id_1 = await Actor.open_request_queue(id=rq_1_metadata.id) + rq_by_id_2 = await Actor.open_request_queue(id=rq_1_metadata.id) assert rq_by_id_1 is rq_by_name_1 assert rq_by_id_2 is rq_by_id_1 @@ -69,7 +70,7 @@ async def test_force_cloud( async with Actor: request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) - request_queue_id = request_queue.metadata.id + request_queue_id = (await request_queue.get_metadata()).id request_info = await request_queue.add_request(Request.from_url('http://example.com')) diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index 8020c52e..4e1b99d9 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -21,8 +21,9 @@ async def test_open_dataset_returns_same_references() -> None: dataset_by_name_2 = await Actor.open_dataset(name=dataset_name) assert dataset_by_name_1 is dataset_by_name_2 - dataset_by_id_1 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) - dataset_by_id_2 = await Actor.open_dataset(id=dataset_by_name_1.metadata.id) + dataset_1_metadata = await dataset_by_name_1.get_metadata() + dataset_by_id_1 = await Actor.open_dataset(id=dataset_1_metadata.id) + dataset_by_id_2 = await Actor.open_dataset(id=dataset_1_metadata.id) assert dataset_by_id_1 is dataset_by_name_1 assert dataset_by_id_2 is dataset_by_id_1 diff --git a/tests/unit/actor/test_actor_key_value_store.py 
b/tests/unit/actor/test_actor_key_value_store.py index 7877480e..405aa977 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -21,8 +21,9 @@ async def test_open_returns_same_references() -> None: kvs_by_name_2 = await Actor.open_key_value_store(name=kvs_name) assert kvs_by_name_1 is kvs_by_name_2 - kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) - kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_by_name_1.metadata.id) + kvs_1_metadata = await kvs_by_name_1.get_metadata() + kvs_by_id_1 = await Actor.open_key_value_store(id=kvs_1_metadata.id) + kvs_by_id_2 = await Actor.open_key_value_store(id=kvs_1_metadata.id) assert kvs_by_id_1 is kvs_by_name_1 assert kvs_by_id_2 is kvs_by_id_1 diff --git a/tests/unit/actor/test_actor_request_queue.py b/tests/unit/actor/test_actor_request_queue.py index ceb6e797..d7c52771 100644 --- a/tests/unit/actor/test_actor_request_queue.py +++ b/tests/unit/actor/test_actor_request_queue.py @@ -21,7 +21,8 @@ async def test_open_returns_same_references() -> None: rq_by_name_2 = await Actor.open_key_value_store(name=rq_name) assert rq_by_name_1 is rq_by_name_2 - rq_by_id_1 = await Actor.open_key_value_store(id=rq_by_name_1.metadata.id) - rq_by_id_2 = await Actor.open_key_value_store(id=rq_by_name_1.metadata.id) + rq_1_metadata = await rq_by_name_1.get_metadata() + rq_by_id_1 = await Actor.open_key_value_store(id=rq_1_metadata.id) + rq_by_id_2 = await Actor.open_key_value_store(id=rq_1_metadata.id) assert rq_by_id_1 is rq_by_name_1 assert rq_by_id_2 is rq_by_id_1 diff --git a/uv.lock b/uv.lock index 38ebb8e0..e0eb4f63 100644 --- a/uv.lock +++ b/uv.lock @@ -75,7 +75,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -636,7 +636,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.11" -source = { git = "https://github.com/apify/crawlee-python.git?rev=new-storage-clients#d6c9877b5e09a32db4c6b1e5541af196a9c6b4e8" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158#9dfac4b8afb8027979d85947f0db303f384b7158" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, From 51178ca9e46c45b251e44a7e077ee30d9f833eea Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 1 Jul 2025 15:44:14 +0200 Subject: [PATCH 19/44] better usage of apify config --- .../storage_clients/_apify/_dataset_client.py | 9 ++-- .../_apify/_key_value_store_client.py | 9 ++-- .../_apify/_request_queue_client.py | 9 ++-- .../storage_clients/_apify/_storage_client.py | 42 +++++++++++++++---- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 7c71a9fe..784000cd 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -17,7 +17,8 @@ from apify_client.clients import DatasetClientAsync from crawlee._types import JsonSerializable - from 
crawlee.configuration import Configuration + + from apify import Configuration logger = getLogger(__name__) @@ -89,15 +90,15 @@ async def open( are provided, or if neither `id` nor `name` is provided and no default storage ID is available in the configuration. """ - token = getattr(configuration, 'token', None) + token = configuration.token if not token: raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') - api_url = getattr(configuration, 'api_base_url', None) + api_url = configuration.api_base_url if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') - api_public_base_url = getattr(configuration, 'api_public_base_url', None) + api_public_base_url = configuration.api_public_base_url if not api_public_base_url: raise ValueError( 'Apify storage client requires a valid API public base URL in Configuration ' diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 2b501750..8a1c5433 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -18,7 +18,8 @@ from collections.abc import AsyncIterator from apify_client.clients import KeyValueStoreClientAsync - from crawlee.configuration import Configuration + + from apify import Configuration logger = getLogger(__name__) @@ -81,15 +82,15 @@ async def open( are provided, or if neither `id` nor `name` is provided and no default storage ID is available in the configuration. """ - token = getattr(configuration, 'token', None) + token = configuration.token if not token: raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') - api_url = getattr(configuration, 'api_base_url', None) + api_url = configuration.api_base_url if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') - api_public_base_url = getattr(configuration, 'api_public_base_url', None) + api_public_base_url = configuration.api_public_base_url if not api_public_base_url: raise ValueError( 'Apify storage client requires a valid API public base URL in Configuration ' diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index fd17b6c3..f24696c3 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -21,7 +21,8 @@ from collections.abc import Sequence from apify_client.clients import RequestQueueClientAsync - from crawlee.configuration import Configuration + + from apify import Configuration logger = getLogger(__name__) @@ -106,15 +107,15 @@ async def open( are provided, or if neither `id` nor `name` is provided and no default storage ID is available in the configuration. 
""" - token = getattr(configuration, 'token', None) + token = configuration.token if not token: raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') - api_url = getattr(configuration, 'api_base_url', None) + api_url = configuration.api_base_url if not api_url: raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') - api_public_base_url = getattr(configuration, 'api_public_base_url', None) + api_public_base_url = configuration.api_public_base_url if not api_public_base_url: raise ValueError( 'Apify storage client requires a valid API public base URL in Configuration ' diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 04904ab3..95b7a2c3 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -1,14 +1,18 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from typing_extensions import override -from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient from ._dataset_client import ApifyDatasetClient from ._key_value_store_client import ApifyKeyValueStoreClient from ._request_queue_client import ApifyRequestQueueClient +if TYPE_CHECKING: + from crawlee.configuration import Configuration + class ApifyStorageClient(StorageClient): """Apify storage client.""" @@ -21,8 +25,16 @@ async def create_dataset_client( name: str | None = None, configuration: Configuration | None = None, ) -> ApifyDatasetClient: - configuration = configuration or Configuration.get_global_configuration() - return await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) + from apify import Configuration as ApifyConfiguration + + configuration = configuration or ApifyConfiguration.get_global_configuration() + if isinstance(configuration, ApifyConfiguration): + return await ApifyDatasetClient.open(id=id, name=name, configuration=configuration) + + raise TypeError( + f'Expected "configuration" to be an instance of "apify.Configuration", ' + f'but got {type(configuration).__name__} instead.' + ) @override async def create_kvs_client( @@ -32,8 +44,16 @@ async def create_kvs_client( name: str | None = None, configuration: Configuration | None = None, ) -> ApifyKeyValueStoreClient: - configuration = configuration or Configuration.get_global_configuration() - return await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + from apify import Configuration as ApifyConfiguration + + configuration = configuration or ApifyConfiguration.get_global_configuration() + if isinstance(configuration, ApifyConfiguration): + return await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + + raise TypeError( + f'Expected "configuration" to be an instance of "apify.Configuration", ' + f'but got {type(configuration).__name__} instead.' 
+ ) @override async def create_rq_client( @@ -43,5 +63,13 @@ async def create_rq_client( name: str | None = None, configuration: Configuration | None = None, ) -> ApifyRequestQueueClient: - configuration = configuration or Configuration.get_global_configuration() - return await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) + from apify import Configuration as ApifyConfiguration + + configuration = configuration or ApifyConfiguration.get_global_configuration() + if isinstance(configuration, ApifyConfiguration): + return await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) + + raise TypeError( + f'Expected "configuration" to be an instance of "apify.Configuration", ' + f'but got {type(configuration).__name__} instead.' + ) From 3cd7dfec576f66c157476d3c675cbf06156d34da Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 2 Jul 2025 12:52:29 +0200 Subject: [PATCH 20/44] renaming --- src/apify/storage_clients/_apify/_dataset_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 784000cd..f9bf3d6a 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -29,10 +29,10 @@ class ApifyDatasetClient(DatasetClient): _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9) """Maximum size for a single payload.""" - _SAFETY_BUFFER_PERCENT = 0.01 / 100 # 0.01% + _SAFETY_BUFFER_COEFFICIENT = 0.01 / 100 # 0.01% """Percentage buffer to reduce payload limit slightly for safety.""" - _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_PERCENT) + _EFFECTIVE_LIMIT_SIZE = _MAX_PAYLOAD_SIZE - (_MAX_PAYLOAD_SIZE * _SAFETY_BUFFER_COEFFICIENT) """Calculated payload limit considering safety buffer.""" def __init__( From 1547cbd00585724588fba9a69b28e65c5afb1f52 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 3 Jul 2025 16:12:15 +0200 Subject: [PATCH 21/44] fixes after merge commit --- .../storage_clients/_apify/_storage_client.py | 9 ++++-- uv.lock | 32 ++++++++++++------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 95b7a2c3..9d43b983 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -25,7 +25,8 @@ async def create_dataset_client( name: str | None = None, configuration: Configuration | None = None, ) -> ApifyDatasetClient: - from apify import Configuration as ApifyConfiguration + # Import here to avoid circular imports. + from apify import Configuration as ApifyConfiguration # noqa: PLC0415 configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): @@ -44,7 +45,8 @@ async def create_kvs_client( name: str | None = None, configuration: Configuration | None = None, ) -> ApifyKeyValueStoreClient: - from apify import Configuration as ApifyConfiguration + # Import here to avoid circular imports. 
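# A minimal usage sketch of the factory methods above, assuming platform
# credentials are available; passing a plain crawlee Configuration instead of
# apify.Configuration would hit the TypeError branch shown in the diff.
from apify import Configuration as ApifyConfiguration
from apify.storage_clients import ApifyStorageClient


async def open_default_dataset_client():
    storage_client = ApifyStorageClient()
    config = ApifyConfiguration.get_global_configuration()
    return await storage_client.create_dataset_client(configuration=config)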
+ from apify import Configuration as ApifyConfiguration # noqa: PLC0415 configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): @@ -63,7 +65,8 @@ async def create_rq_client( name: str | None = None, configuration: Configuration | None = None, ) -> ApifyRequestQueueClient: - from apify import Configuration as ApifyConfiguration + # Import here to avoid circular imports. + from apify import Configuration as ApifyConfiguration # noqa: PLC0415 configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): diff --git a/uv.lock b/uv.lock index 6fb7841f..d2de9016 100644 --- a/uv.lock +++ b/uv.lock @@ -33,6 +33,7 @@ source = { editable = "." } dependencies = [ { name = "apify-client" }, { name = "apify-shared" }, + { name = "cachetools" }, { name = "crawlee" }, { name = "cryptography" }, { name = "httpx" }, @@ -63,13 +64,15 @@ dev = [ { name = "respx" }, { name = "ruff" }, { name = "setuptools" }, + { name = "types-cachetools" }, ] [package.metadata] requires-dist = [ - { name = "apify-client", specifier = ">=1.11.0" }, + { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, - { name = "crawlee", specifier = "~=0.6.0" }, + { name = "cachetools", specifier = ">=5.5.0" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -96,6 +99,7 @@ dev = [ { name = "respx", specifier = "~=0.22.0" }, { name = "ruff", specifier = "~=0.12.0" }, { name = "setuptools" }, + { name = "types-cachetools", specifier = ">=6.0.0.20250525" }, ] [[package]] @@ -310,11 +314,11 @@ wheels = [ [[package]] name = "certifi" -version = "2025.6.15" +version = "2025.1.31" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload-time = "2025-06-15T02:45:51.329Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/ab/c9f1e32b7b1bf505bf26f0ef697775960db7932abeb7b516de930ba2705f/certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651", size = 167577, upload-time = "2025-01-31T02:16:47.166Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" }, + { url = "https://files.pythonhosted.org/packages/38/fc/bce832fd4fd99766c04d1ee0eead6b0ec6486fb100ae5e74c1d91292b982/certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe", size = 166393, upload-time = "2025-01-31T02:16:45.015Z" }, ] [[package]] @@ -546,11 +550,12 @@ toml = [ [[package]] name = "crawlee" version = "0.6.11" -source = { registry = "https://pypi.org/simple" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158#9dfac4b8afb8027979d85947f0db303f384b7158" } dependencies = [ { name = "apify-fingerprint-datapoints" 
}, { name = "browserforge" }, { name = "cachetools" }, + { name = "certifi" }, { name = "colorama" }, { name = "eval-type-backport" }, { name = "httpx", extra = ["brotli", "http2", "zstd"] }, @@ -566,10 +571,6 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a4/61/76d4c43a244bcea123500989a03729ab999054a1d57ebfa85cb66fb86cb7/crawlee-0.6.11.tar.gz", hash = "sha256:746c59b726cce728d7d703e9d2e737ed5f9b2ea8409d3c5b4de0d728af7c0249", size = 24144865, upload-time = "2025-06-23T08:49:53.162Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/02/8c/9f6cdcc80acca132721331cd07ebe19b6a6509e792eb8f04f9a519c525f3/crawlee-0.6.11-py3-none-any.whl", hash = "sha256:899ae74f891ad87c7c0fc9ae6f448be7f1163f54cda5ec4b9b2e080a0758f6c2", size = 263313, upload-time = "2025-06-23T08:49:51.057Z" }, -] [[package]] name = "cryptography" @@ -2119,6 +2120,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/33/38da585b06978d262cc2b2b45bc57ee75f0ce5e0b4ef1cab1b86461e9298/typeapi-2.2.4-py3-none-any.whl", hash = "sha256:bd6d5e5907fa47e0303bf254e7cc8712d4be4eb26d7ffaedb67c9e7844c53bb8", size = 26387, upload-time = "2025-01-29T11:40:12.328Z" }, ] +[[package]] +name = "types-cachetools" +version = "6.0.0.20250525" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/d0/55ff0eeda141436c1bd2142cd026906870c661b3f7755070d6da7ea7210f/types_cachetools-6.0.0.20250525.tar.gz", hash = "sha256:baf06f234cac3aeb44c07893447ba03ecdb6c0742ba2607e28a35d38e6821b02", size = 8925, upload-time = "2025-05-25T03:13:53.498Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8c/4ab0a17ece30fe608270b89cf066387051862899fff9f54ab12511fc7fdd/types_cachetools-6.0.0.20250525-py3-none-any.whl", hash = "sha256:1de8f0fe4bdcb187a48d2026c1e3672830f67943ad2bf3486abe031b632f1252", size = 8938, upload-time = "2025-05-25T03:13:52.406Z" }, +] + [[package]] name = "typing-extensions" version = "4.14.0" @@ -2556,4 +2566,4 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/90/2633473864f67a15526324b007a9f96c96f56d5f32ef2a56cc12f9548723/zstandard-0.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa6ce8b52c5987b3e34d5674b0ab529a4602b632ebab0a93b07bfb4dfc8f8a33", size = 5191299, upload-time = "2024-07-15T00:16:49.053Z" }, { url = "https://files.pythonhosted.org/packages/b0/4c/315ca5c32da7e2dc3455f3b2caee5c8c2246074a61aac6ec3378a97b7136/zstandard-0.23.0-cp313-cp313-win32.whl", hash = "sha256:a9b07268d0c3ca5c170a385a0ab9fb7fdd9f5fd866be004c4ea39e44edce47dd", size = 430862, upload-time = "2024-07-15T00:16:51.003Z" }, { url = "https://files.pythonhosted.org/packages/a2/bf/c6aaba098e2d04781e8f4f7c0ba3c7aa73d00e4c436bcc0cf059a66691d1/zstandard-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:f3513916e8c645d0610815c257cbfd3242adfd5c4cfa78be514e5a3ebb42a41b", size = 495578, upload-time = "2024-07-15T00:16:53.135Z" }, -] \ No newline at end of file +] From 4e4fa93a8d952914900d494a78e619d64b9ee944 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 9 Jul 2025 14:07:52 +0200 Subject: [PATCH 22/44] Change from orphan commit to master in crawlee version --- pyproject.toml | 2 +- uv.lock | 21 +++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 10ec8cea..d0b864a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.12.0", "apify-shared>=1.3.0", 
"cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@9dfac4b8afb8027979d85947f0db303f384b7158", + "crawlee@git+https://github.com/apify/crawlee-python.git@master", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/uv.lock b/uv.lock index d2de9016..2fde32a7 100644 --- a/uv.lock +++ b/uv.lock @@ -72,7 +72,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=master" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -549,15 +549,13 @@ toml = [ [[package]] name = "crawlee" -version = "0.6.11" -source = { git = "https://github.com/apify/crawlee-python.git?rev=9dfac4b8afb8027979d85947f0db303f384b7158#9dfac4b8afb8027979d85947f0db303f384b7158" } +version = "0.6.12" +source = { git = "https://github.com/apify/crawlee-python.git?rev=master#0debe1df6ae0dcea296e0d8d6ce09637ead5a4f3" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, { name = "cachetools" }, - { name = "certifi" }, { name = "colorama" }, - { name = "eval-type-backport" }, { name = "httpx", extra = ["brotli", "http2", "zstd"] }, { name = "more-itertools" }, { name = "protego" }, @@ -744,15 +742,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/1a/25272fafd13c92a2e3b8e351127410b9ea5557324bfea3552388d65797fc/dycw_pytest_only-2.1.1-py3-none-any.whl", hash = "sha256:ea8fe48878dd95ad0ca804e549225cf3b7a1928eb188c22a284c1d17b48a7b89", size = 2413, upload-time = "2025-06-03T01:04:46.585Z" }, ] -[[package]] -name = "eval-type-backport" -version = "0.2.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/30/ea/8b0ac4469d4c347c6a385ff09dc3c048c2d021696664e26c7ee6791631b5/eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1", size = 9079, upload-time = "2024-12-21T20:09:46.005Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/31/55cd413eaccd39125368be33c46de24a1f639f2e12349b0361b4678f3915/eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a", size = 5830, upload-time = "2024-12-21T20:09:44.175Z" }, -] - [[package]] name = "exceptiongroup" version = "1.3.0" @@ -1053,10 +1042,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9a/55/2cb24ea48aa30c99f805921c1c7860c1f45c0e811e44ee4e6a155668de06/lxml-6.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:219e0431ea8006e15005767f0351e3f7f9143e793e58519dc97fe9e07fae5563", size = 4952289, upload-time = "2025-06-28T18:47:25.602Z" }, { url = "https://files.pythonhosted.org/packages/31/c0/b25d9528df296b9a3306ba21ff982fc5b698c45ab78b94d18c2d6ae71fd9/lxml-6.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bd5913b4972681ffc9718bc2d4c53cde39ef81415e1671ff93e9aa30b46595e7", size = 5111310, upload-time = "2025-06-28T18:47:28.136Z" }, { url = 
"https://files.pythonhosted.org/packages/e9/af/681a8b3e4f668bea6e6514cbcb297beb6de2b641e70f09d3d78655f4f44c/lxml-6.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:390240baeb9f415a82eefc2e13285016f9c8b5ad71ec80574ae8fa9605093cd7", size = 5025457, upload-time = "2025-06-26T16:26:15.068Z" }, + { url = "https://files.pythonhosted.org/packages/99/b6/3a7971aa05b7be7dfebc7ab57262ec527775c2c3c5b2f43675cac0458cad/lxml-6.0.0-cp312-cp312-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d6e200909a119626744dd81bae409fc44134389e03fbf1d68ed2a55a2fb10991", size = 5657016, upload-time = "2025-07-03T19:19:06.008Z" }, { url = "https://files.pythonhosted.org/packages/69/f8/693b1a10a891197143c0673fcce5b75fc69132afa81a36e4568c12c8faba/lxml-6.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ca50bd612438258a91b5b3788c6621c1f05c8c478e7951899f492be42defc0da", size = 5257565, upload-time = "2025-06-26T16:26:17.906Z" }, { url = "https://files.pythonhosted.org/packages/a8/96/e08ff98f2c6426c98c8964513c5dab8d6eb81dadcd0af6f0c538ada78d33/lxml-6.0.0-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:c24b8efd9c0f62bad0439283c2c795ef916c5a6b75f03c17799775c7ae3c0c9e", size = 4713390, upload-time = "2025-06-26T16:26:20.292Z" }, { url = "https://files.pythonhosted.org/packages/a8/83/6184aba6cc94d7413959f6f8f54807dc318fdcd4985c347fe3ea6937f772/lxml-6.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:afd27d8629ae94c5d863e32ab0e1d5590371d296b87dae0a751fb22bf3685741", size = 5066103, upload-time = "2025-06-26T16:26:22.765Z" }, { url = "https://files.pythonhosted.org/packages/ee/01/8bf1f4035852d0ff2e36a4d9aacdbcc57e93a6cd35a54e05fa984cdf73ab/lxml-6.0.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:54c4855eabd9fc29707d30141be99e5cd1102e7d2258d2892314cf4c110726c3", size = 4791428, upload-time = "2025-06-26T16:26:26.461Z" }, + { url = "https://files.pythonhosted.org/packages/29/31/c0267d03b16954a85ed6b065116b621d37f559553d9339c7dcc4943a76f1/lxml-6.0.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c907516d49f77f6cd8ead1322198bdfd902003c3c330c77a1c5f3cc32a0e4d16", size = 5678523, upload-time = "2025-07-03T19:19:09.837Z" }, { url = "https://files.pythonhosted.org/packages/5c/f7/5495829a864bc5f8b0798d2b52a807c89966523140f3d6fa3a58ab6720ea/lxml-6.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36531f81c8214e293097cd2b7873f178997dae33d3667caaae8bdfb9666b76c0", size = 5281290, upload-time = "2025-06-26T16:26:29.406Z" }, { url = "https://files.pythonhosted.org/packages/79/56/6b8edb79d9ed294ccc4e881f4db1023af56ba451909b9ce79f2a2cd7c532/lxml-6.0.0-cp312-cp312-win32.whl", hash = "sha256:690b20e3388a7ec98e899fd54c924e50ba6693874aa65ef9cb53de7f7de9d64a", size = 3613495, upload-time = "2025-06-26T16:26:31.588Z" }, { url = "https://files.pythonhosted.org/packages/0b/1e/cc32034b40ad6af80b6fd9b66301fc0f180f300002e5c3eb5a6110a93317/lxml-6.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:310b719b695b3dd442cdfbbe64936b2f2e231bb91d998e99e6f0daf991a3eba3", size = 4014711, upload-time = "2025-06-26T16:26:33.723Z" }, @@ -1067,10 +1058,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/46/3572761efc1bd45fcafb44a63b3b0feeb5b3f0066886821e94b0254f9253/lxml-6.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d18a25b19ca7307045581b18b3ec9ead2b1db5ccd8719c291f0cd0a5cec6cb81", size = 4947559, upload-time = "2025-06-28T18:47:31.091Z" }, { url = 
"https://files.pythonhosted.org/packages/94/8a/5e40de920e67c4f2eef9151097deb9b52d86c95762d8ee238134aff2125d/lxml-6.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d4f0c66df4386b75d2ab1e20a489f30dc7fd9a06a896d64980541506086be1f1", size = 5102143, upload-time = "2025-06-28T18:47:33.612Z" }, { url = "https://files.pythonhosted.org/packages/7c/4b/20555bdd75d57945bdabfbc45fdb1a36a1a0ff9eae4653e951b2b79c9209/lxml-6.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f4b481b6cc3a897adb4279216695150bbe7a44c03daba3c894f49d2037e0a24", size = 5021931, upload-time = "2025-06-26T16:26:47.503Z" }, + { url = "https://files.pythonhosted.org/packages/b6/6e/cf03b412f3763d4ca23b25e70c96a74cfece64cec3addf1c4ec639586b13/lxml-6.0.0-cp313-cp313-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a78d6c9168f5bcb20971bf3329c2b83078611fbe1f807baadc64afc70523b3a", size = 5645469, upload-time = "2025-07-03T19:19:13.32Z" }, { url = "https://files.pythonhosted.org/packages/d4/dd/39c8507c16db6031f8c1ddf70ed95dbb0a6d466a40002a3522c128aba472/lxml-6.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ae06fbab4f1bb7db4f7c8ca9897dc8db4447d1a2b9bee78474ad403437bcc29", size = 5247467, upload-time = "2025-06-26T16:26:49.998Z" }, { url = "https://files.pythonhosted.org/packages/4d/56/732d49def0631ad633844cfb2664563c830173a98d5efd9b172e89a4800d/lxml-6.0.0-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:1fa377b827ca2023244a06554c6e7dc6828a10aaf74ca41965c5d8a4925aebb4", size = 4720601, upload-time = "2025-06-26T16:26:52.564Z" }, { url = "https://files.pythonhosted.org/packages/8f/7f/6b956fab95fa73462bca25d1ea7fc8274ddf68fb8e60b78d56c03b65278e/lxml-6.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1676b56d48048a62ef77a250428d1f31f610763636e0784ba67a9740823988ca", size = 5060227, upload-time = "2025-06-26T16:26:55.054Z" }, { url = "https://files.pythonhosted.org/packages/97/06/e851ac2924447e8b15a294855caf3d543424364a143c001014d22c8ca94c/lxml-6.0.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:0e32698462aacc5c1cf6bdfebc9c781821b7e74c79f13e5ffc8bfe27c42b1abf", size = 4790637, upload-time = "2025-06-26T16:26:57.384Z" }, + { url = "https://files.pythonhosted.org/packages/06/d4/fd216f3cd6625022c25b336c7570d11f4a43adbaf0a56106d3d496f727a7/lxml-6.0.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4d6036c3a296707357efb375cfc24bb64cd955b9ec731abf11ebb1e40063949f", size = 5662049, upload-time = "2025-07-03T19:19:16.409Z" }, { url = "https://files.pythonhosted.org/packages/52/03/0e764ce00b95e008d76b99d432f1807f3574fb2945b496a17807a1645dbd/lxml-6.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7488a43033c958637b1a08cddc9188eb06d3ad36582cebc7d4815980b47e27ef", size = 5272430, upload-time = "2025-06-26T16:27:00.031Z" }, { url = "https://files.pythonhosted.org/packages/5f/01/d48cc141bc47bc1644d20fe97bbd5e8afb30415ec94f146f2f76d0d9d098/lxml-6.0.0-cp313-cp313-win32.whl", hash = "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181", size = 3612896, upload-time = "2025-06-26T16:27:04.251Z" }, { url = "https://files.pythonhosted.org/packages/f4/87/6456b9541d186ee7d4cb53bf1b9a0d7f3b1068532676940fdd594ac90865/lxml-6.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e", size = 4013132, upload-time = "2025-06-26T16:27:06.415Z" }, From e5b2bc41719c97b9024129749c5a0bf398c3baa3 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 
9 Jul 2025 19:52:10 +0200 Subject: [PATCH 23/44] fix encrypted secrets test --- pyproject.toml | 2 +- tests/unit/actor/test_actor_key_value_store.py | 7 ++++--- uv.lock | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d0b864a4..7fdd66ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.12.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@master", + "crawlee@git+https://github.com/apify/crawlee-python.git@0c4cfc9ada06e35f63213e6a937c4e85defcbecf", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 5229deb2..66d4a6e7 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -3,6 +3,7 @@ import pytest from apify_shared.consts import ApifyEnvVars +from crawlee._utils.file import json_dumps from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY from apify import Actor @@ -69,9 +70,9 @@ async def test_get_input_with_encrypted_secrets(monkeypatch: pytest.MonkeyPatch) # and includes schemahash. We are testing both formats to ensure backward compatibility. encrypted_string_legacy = public_encrypt(secret_string_legacy, public_key=PUBLIC_KEY) - encrypted_string = public_encrypt(json_dumps(secret_string), public_key=PUBLIC_KEY) - encrypted_object = public_encrypt(json_dumps(secret_object), public_key=PUBLIC_KEY) - encrypted_array = public_encrypt(json_dumps(secret_array), public_key=PUBLIC_KEY) + encrypted_string = public_encrypt(await json_dumps(secret_string), public_key=PUBLIC_KEY) + encrypted_object = public_encrypt(await json_dumps(secret_object), public_key=PUBLIC_KEY) + encrypted_array = public_encrypt(await json_dumps(secret_array), public_key=PUBLIC_KEY) input_with_secret = { 'foo': 'bar', diff --git a/uv.lock b/uv.lock index 2fde32a7..6926e937 100644 --- a/uv.lock +++ b/uv.lock @@ -72,7 +72,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=master" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=0c4cfc9ada06e35f63213e6a937c4e85defcbecf" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -550,7 +550,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.12" -source = { git = "https://github.com/apify/crawlee-python.git?rev=master#0debe1df6ae0dcea296e0d8d6ce09637ead5a4f3" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=0c4cfc9ada06e35f63213e6a937c4e85defcbecf#0c4cfc9ada06e35f63213e6a937c4e85defcbecf" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, From 638756f9b3680ee7de609042572c2faeb6d1e7c2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 10 Jul 2025 11:10:07 +0200 Subject: [PATCH 24/44] Add Apify's version of FS client that keeps the INPUT json --- src/apify/storage_clients/__init__.py | 3 +- .../storage_clients/_file_system/__init__.py | 1 + .../_file_system/_key_value_store_client.py | 36 ++++++++++++ .../_file_system/_storage_client.py | 37 +++++++++++++ 
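# A small sketch of the test change above: crawlee's json_dumps is a coroutine in
# this revision, so it must be awaited before its result is passed to synchronous
# helpers such as public_encrypt.
from crawlee._utils.file import json_dumps


async def encode(payload: dict) -> str:
    return await json_dumps(payload)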
tests/unit/storage_clients/__init__.py | 0 .../unit/storage_clients/test_file_system.py | 55 +++++++++++++++++++ 6 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 src/apify/storage_clients/_file_system/__init__.py create mode 100644 src/apify/storage_clients/_file_system/_key_value_store_client.py create mode 100644 src/apify/storage_clients/_file_system/_storage_client.py create mode 100644 tests/unit/storage_clients/__init__.py create mode 100644 tests/unit/storage_clients/test_file_system.py diff --git a/src/apify/storage_clients/__init__.py b/src/apify/storage_clients/__init__.py index ca93ae43..209cfaa4 100644 --- a/src/apify/storage_clients/__init__.py +++ b/src/apify/storage_clients/__init__.py @@ -1,6 +1,7 @@ -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient +from crawlee.storage_clients import MemoryStorageClient from ._apify import ApifyStorageClient +from ._file_system import FileSystemStorageClient __all__ = [ 'ApifyStorageClient', diff --git a/src/apify/storage_clients/_file_system/__init__.py b/src/apify/storage_clients/_file_system/__init__.py new file mode 100644 index 00000000..164e04cc --- /dev/null +++ b/src/apify/storage_clients/_file_system/__init__.py @@ -0,0 +1 @@ +from ._storage_client import ApifyFileSystemStorageClient as FileSystemStorageClient diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py new file mode 100644 index 00000000..d0b882c8 --- /dev/null +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -0,0 +1,36 @@ +import asyncio + +from typing_extensions import override + +from crawlee._consts import METADATA_FILENAME +from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient + +from apify._configuration import Configuration + + +class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient): + """Apify-specific implementation of the `FileSystemKeyValueStoreClient`. + + The only difference is that it overrides the `purge` method to delete all files in the key-value store + directory, except for the metadata file and the `INPUT.json` file. + """ + + @override + async def purge(self) -> None: + """Purges the key-value store by deleting all its contents. + + It deletes all files in the key-value store directory, except for the metadata file and + the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged. 
+ """ + kvs_input_key = Configuration.get_global_configuration().input_key + async with self._lock: + for file_path in self.path_to_kvs.glob('*'): + if file_path.name in {METADATA_FILENAME, f'{kvs_input_key}.json'}: + continue + if file_path.is_file(): + await asyncio.to_thread(file_path.unlink, missing_ok=True) + + await self._update_metadata( + update_accessed_at=True, + update_modified_at=True, + ) diff --git a/src/apify/storage_clients/_file_system/_storage_client.py b/src/apify/storage_clients/_file_system/_storage_client.py new file mode 100644 index 00000000..f0039cc9 --- /dev/null +++ b/src/apify/storage_clients/_file_system/_storage_client.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee._utils.docs import docs_group +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient + +from ._key_value_store_client import ApifyFileSystemKeyValueStoreClient + +if TYPE_CHECKING: + from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient + + +@docs_group('Classes') +class ApifyFileSystemStorageClient(FileSystemStorageClient): + """Apify-specific implementation of the file system storage client. + + The only difference is that it uses `ApifyFileSystemKeyValueStoreClient` for key-value stores, + which overrides the `purge` method to delete all files in the key-value store directory + except for the metadata file and the `INPUT.json` file. + """ + + @override + async def create_kvs_client( + self, + *, + id: str | None = None, + name: str | None = None, + configuration: Configuration | None = None, + ) -> FileSystemKeyValueStoreClient: + configuration = configuration or Configuration.get_global_configuration() + client = await ApifyFileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + await self._purge_if_needed(client, configuration) + return client diff --git a/tests/unit/storage_clients/__init__.py b/tests/unit/storage_clients/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py new file mode 100644 index 00000000..64984e05 --- /dev/null +++ b/tests/unit/storage_clients/test_file_system.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import asyncio + +from crawlee._consts import METADATA_FILENAME + +from apify import Configuration +from apify.storage_clients._file_system._key_value_store_client import ApifyFileSystemKeyValueStoreClient + + +async def test_purge_preserves_input_file_and_metadata() -> None: + """Test that purge() preserves INPUT.json and metadata files but removes other files.""" + # Get the global configuration (storage directory is set by test fixtures) + config = Configuration.get_global_configuration() + + # Create the key-value store client + kvs_client = await ApifyFileSystemKeyValueStoreClient.open( + id=None, + name='test-kvs', + configuration=config, + ) + + # Create some test files in the KVS directory + kvs_path = kvs_client.path_to_kvs + + # Create various files + kvs_input_filename = f'{config.input_key}.json' + input_file = kvs_path / kvs_input_filename + metadata_file = kvs_path / METADATA_FILENAME + regular_file1 = kvs_path / 'regular_file1.json' + regular_file2 = kvs_path / 'another_file.txt' + + # Write content to files + await asyncio.to_thread(input_file.write_text, '{"test": "input"}') + await 
asyncio.to_thread(regular_file1.write_text, '{"test": "data1"}') + await asyncio.to_thread(regular_file2.write_text, 'some text content') + + # Verify all files exist before purge + assert input_file.exists() + assert metadata_file.exists() # Should exist from client creation + assert regular_file1.exists() + assert regular_file2.exists() + + # Purge the key-value store + await kvs_client.purge() # Verify INPUT.json and metadata are preserved + assert input_file.exists(), f'{kvs_input_filename} should be preserved during purge' + assert metadata_file.exists(), f'{METADATA_FILENAME} should be preserved during purge' + + # Verify other files are deleted + assert not regular_file1.exists(), 'Regular files should be deleted during purge' + assert not regular_file2.exists(), 'Regular files should be deleted during purge' + + # Verify INPUT.json content is unchanged + input_content = await asyncio.to_thread(input_file.read_text) + assert input_content == '{"test": "input"}' From 931b0ca1518e3cd8513a79831d1be7f4aa740f41 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 16 Jul 2025 09:31:02 +0200 Subject: [PATCH 25/44] update metadata fixes --- src/apify/storage_clients/__init__.py | 2 +- .../_apify/_key_value_store_client.py | 13 +----------- .../_apify/_request_queue_client.py | 15 -------------- .../storage_clients/_file_system/__init__.py | 3 ++- .../unit/storage_clients/test_file_system.py | 20 +++++++++---------- 5 files changed, 14 insertions(+), 39 deletions(-) diff --git a/src/apify/storage_clients/__init__.py b/src/apify/storage_clients/__init__.py index 209cfaa4..f3e5298c 100644 --- a/src/apify/storage_clients/__init__.py +++ b/src/apify/storage_clients/__init__.py @@ -1,7 +1,7 @@ from crawlee.storage_clients import MemoryStorageClient from ._apify import ApifyStorageClient -from ._file_system import FileSystemStorageClient +from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient __all__ = [ 'ApifyStorageClient', diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 8a1c5433..f203d6f6 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -149,9 +149,7 @@ async def drop(self) -> None: @override async def get_value(self, key: str) -> KeyValueStoreRecord | None: response = await self._api_client.get_record(key) - record = KeyValueStoreRecord.model_validate(response) if response else None - await self._update_metadata() - return record + return KeyValueStoreRecord.model_validate(response) if response else None @override async def set_value(self, key: str, value: Any, content_type: str | None = None) -> None: @@ -161,13 +159,11 @@ async def set_value(self, key: str, value: Any, content_type: str | None = None) value=value, content_type=content_type, ) - await self._update_metadata() @override async def delete_value(self, key: str) -> None: async with self._lock: await self._api_client.delete_record(key=key) - await self._update_metadata() @override async def iterate_keys( @@ -202,8 +198,6 @@ async def iterate_keys( exclusive_start_key = list_key_page.next_exclusive_start_key - await self._update_metadata() - @override async def record_exists(self, key: str) -> bool: return await self._api_client.record_exists(key=key) @@ -231,8 +225,3 @@ async def get_public_url(self, key: str) -> str: public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) return 
str(public_url) - - async def _update_metadata(self) -> None: - """Update the key-value store metadata with current information.""" - metadata = await self._api_client.get() - self._metadata = KeyValueStoreMetadata.model_validate(metadata) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index f24696c3..4896c743 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -202,9 +202,6 @@ async def add_batch_of_requests( # Send requests to API response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) - # Update metadata after adding requests - await self._update_metadata() - return AddRequestsResponse.model_validate(response) @override @@ -218,7 +215,6 @@ async def get_request(self, request_id: str) -> Request | None: The request or None if not found. """ response = await self._api_client.get_request(request_id) - await self._update_metadata() if response is None: return None @@ -295,9 +291,6 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | forefront=False, hydrated_request=request, ) - - # Update metadata after marking request as handled - await self._update_metadata() except Exception as exc: logger.debug(f'Error marking request {request.id} as handled: {exc!s}') return None @@ -346,9 +339,6 @@ async def reclaim_request( await self._delete_request_lock(request.id, forefront=forefront) except Exception as err: logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err) - - # Update metadata after reclaiming request - await self._update_metadata() except Exception as exc: logger.debug(f'Error reclaiming request {request.id}: {exc!s}') return None @@ -648,8 +638,3 @@ def _cache_request( lock_expires_at=None, forefront=forefront, ) - - async def _update_metadata(self) -> None: - """Update the request queue metadata with current information.""" - metadata = await self._api_client.get() - self._metadata = RequestQueueMetadata.model_validate(metadata) diff --git a/src/apify/storage_clients/_file_system/__init__.py b/src/apify/storage_clients/_file_system/__init__.py index 164e04cc..b18af53b 100644 --- a/src/apify/storage_clients/_file_system/__init__.py +++ b/src/apify/storage_clients/_file_system/__init__.py @@ -1 +1,2 @@ -from ._storage_client import ApifyFileSystemStorageClient as FileSystemStorageClient +from ._key_value_store_client import ApifyFileSystemKeyValueStoreClient +from ._storage_client import ApifyFileSystemStorageClient diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py index 64984e05..c14e9813 100644 --- a/tests/unit/storage_clients/test_file_system.py +++ b/tests/unit/storage_clients/test_file_system.py @@ -5,27 +5,25 @@ from crawlee._consts import METADATA_FILENAME from apify import Configuration -from apify.storage_clients._file_system._key_value_store_client import ApifyFileSystemKeyValueStoreClient +from apify.storage_clients._file_system import ApifyFileSystemKeyValueStoreClient async def test_purge_preserves_input_file_and_metadata() -> None: """Test that purge() preserves INPUT.json and metadata files but removes other files.""" # Get the global configuration (storage directory is set by test fixtures) - config = Configuration.get_global_configuration() + configuration = Configuration.get_global_configuration() - # Create the key-value store client - 
kvs_client = await ApifyFileSystemKeyValueStoreClient.open( + kvs_storage_client = await ApifyFileSystemKeyValueStoreClient.open( id=None, name='test-kvs', - configuration=config, + configuration=configuration, ) # Create some test files in the KVS directory - kvs_path = kvs_client.path_to_kvs + kvs_path = kvs_storage_client.path_to_kvs # Create various files - kvs_input_filename = f'{config.input_key}.json' - input_file = kvs_path / kvs_input_filename + input_file = kvs_path / f'{configuration.input_key}.json' metadata_file = kvs_path / METADATA_FILENAME regular_file1 = kvs_path / 'regular_file1.json' regular_file2 = kvs_path / 'another_file.txt' @@ -42,8 +40,10 @@ async def test_purge_preserves_input_file_and_metadata() -> None: assert regular_file2.exists() # Purge the key-value store - await kvs_client.purge() # Verify INPUT.json and metadata are preserved - assert input_file.exists(), f'{kvs_input_filename} should be preserved during purge' + await kvs_storage_client.purge() + + # Verify INPUT.json and metadata are preserved + assert input_file.exists(), f'{configuration.input_key} should be preserved during purge' assert metadata_file.exists(), f'{METADATA_FILENAME} should be preserved during purge' # Verify other files are deleted From 1f3c4810cc9496121cddcf05e970400b3a434387 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 16 Jul 2025 09:58:48 +0200 Subject: [PATCH 26/44] KVS metadata extended model --- .../_apify/_key_value_store_client.py | 16 +++++++--------- src/apify/storage_clients/_apify/_models.py | 12 ++++++++++++ uv.lock | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index f203d6f6..8fab6211 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -9,9 +9,9 @@ from apify_client import ApifyClientAsync from crawlee.storage_clients._base import KeyValueStoreClient -from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata +from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata -from ._models import KeyValueStoreListKeysPage +from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage from apify._crypto import create_hmac_signature if TYPE_CHECKING: @@ -48,9 +48,9 @@ def __init__( """A lock to ensure that only one operation is performed at a time.""" @override - async def get_metadata(self) -> KeyValueStoreMetadata: + async def get_metadata(self) -> ApifyKeyValueStoreMetadata: metadata = await self._api_client.get() - return KeyValueStoreMetadata.model_validate(metadata) + return ApifyKeyValueStoreMetadata.model_validate(metadata) @classmethod async def open( @@ -112,7 +112,7 @@ async def open( # If name is provided, get or create the storage by name. 
if name is not None and id is None: - id = KeyValueStoreMetadata.model_validate( + id = ApifyKeyValueStoreMetadata.model_validate( await apify_kvss_client.get_or_create(name=name), ).id @@ -219,9 +219,7 @@ async def get_public_url(self, key: str) -> str: ) metadata = await self.get_metadata() - if metadata.model_extra is not None: - url_signing_secret_key = metadata.model_extra.get('urlSigningSecretKey') - if url_signing_secret_key is not None: - public_url = public_url.with_query(signature=create_hmac_signature(url_signing_secret_key, key)) + if metadata.url_signing_secret_key is not None: + public_url = public_url.with_query(signature=create_hmac_signature(metadata.url_signing_secret_key, key)) return str(public_url) diff --git a/src/apify/storage_clients/_apify/_models.py b/src/apify/storage_clients/_apify/_models.py index abb7aca1..1c4248c1 100644 --- a/src/apify/storage_clients/_apify/_models.py +++ b/src/apify/storage_clients/_apify/_models.py @@ -6,10 +6,22 @@ from pydantic import BaseModel, ConfigDict, Field from crawlee._utils.docs import docs_group +from crawlee.storage_clients.models import KeyValueStoreMetadata from apify import Request +@docs_group('Data structures') +class ApifyKeyValueStoreMetadata(KeyValueStoreMetadata): + """Extended key-value store metadata model for Apify platform. + + Includes additional Apify-specific fields. + """ + + url_signing_secret_key: Annotated[str | None, Field(alias='urlSigningSecretKey', default=None)] + """The secret key used for signing URLs for secure access to key-value store records.""" + + @docs_group('Data structures') class ProlongRequestLockResponse(BaseModel): """Response to prolong request lock calls.""" diff --git a/uv.lock b/uv.lock index 1011c5f4..193a3dc3 100644 --- a/uv.lock +++ b/uv.lock @@ -28,7 +28,7 @@ wheels = [ [[package]] name = "apify" -version = "2.7.0" +version = "2.7.1" source = { editable = "." 
} dependencies = [ { name = "apify-client" }, From 44d8e099d63f641f3728a41682292b6f16e2b486 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 16 Jul 2025 10:44:57 +0200 Subject: [PATCH 27/44] fix url signing secret key --- tests/integration/test_actor_key_value_store.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 3d0fc22b..799cbea3 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -203,6 +203,7 @@ async def test_generate_public_url_for_kvs_record( ) -> None: async def main() -> None: from apify._crypto import create_hmac_signature + from apify.storage_clients._apify._models import ApifyKeyValueStoreMetadata async with Actor: public_api_url = Actor.config.api_public_base_url @@ -211,15 +212,14 @@ async def main() -> None: kvs = await Actor.open_key_value_store() metadata = await kvs.get_metadata() - assert metadata.model_extra is not None - url_signing_secret_key = metadata.model_extra.get('urlSigningSecretKey') - assert url_signing_secret_key is not None + assert isinstance(metadata, ApifyKeyValueStoreMetadata) + assert metadata.url_signing_secret_key is not None await kvs.set_value(record_key, {'exposedData': 'test'}, 'application/json') record_url = await kvs.get_public_url(record_key) - signature = create_hmac_signature(url_signing_secret_key, record_key) + signature = create_hmac_signature(metadata.url_signing_secret_key, record_key) expected_record_url = ( f'{public_api_url}/v2/key-value-stores/{default_kvs_id}/records/{record_key}?signature={signature}' ) From ca72313dc1001887b4511a51a33114f4d602501b Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sat, 19 Jul 2025 10:53:34 +0200 Subject: [PATCH 28/44] Apify storage client fixes and new docs groups --- pyproject.toml | 2 +- src/apify/_actor.py | 2 +- src/apify/_charging.py | 6 +- src/apify/_configuration.py | 2 +- src/apify/_models.py | 12 ++-- src/apify/_platform_event_manager.py | 20 +++---- src/apify/_proxy_configuration.py | 4 +- src/apify/_utils.py | 13 ++++- .../storage_clients/_apify/_dataset_client.py | 36 ++++++++---- .../_apify/_key_value_store_client.py | 36 ++++++++---- src/apify/storage_clients/_apify/_models.py | 23 +++++--- .../_apify/_request_queue_client.py | 55 ++++++++++++------- .../storage_clients/_apify/_storage_client.py | 2 + .../_file_system/_storage_client.py | 2 - src/apify/storages/_request_list.py | 2 +- uv.lock | 4 +- 16 files changed, 143 insertions(+), 78 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f965219c..fe63bdf7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "apify-client>=1.12.0", "apify-shared>=1.3.0", "cachetools>=5.5.0", - "crawlee@git+https://github.com/apify/crawlee-python.git@0c4cfc9ada06e35f63213e6a937c4e85defcbecf", + "crawlee@git+https://github.com/apify/crawlee-python.git@master", "cryptography>=42.0.0", "httpx>=0.27.0", # TODO: ensure compatibility with the latest version of lazy-object-proxy diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 78e17bc5..37f462d4 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -54,7 +54,7 @@ @docs_name('Actor') -@docs_group('Classes') +@docs_group('Actor') class _ActorType: """The class of `Actor`. 
Only make a new instance if you're absolutely sure you need to.""" diff --git a/src/apify/_charging.py b/src/apify/_charging.py index 3aee2777..c16f4cb7 100644 --- a/src/apify/_charging.py +++ b/src/apify/_charging.py @@ -26,7 +26,7 @@ run_validator = TypeAdapter[ActorRun | None](ActorRun | None) -@docs_group('Interfaces') +@docs_group('Charging') class ChargingManager(Protocol): """Provides fine-grained access to pay-per-event functionality.""" @@ -57,7 +57,7 @@ def get_pricing_info(self) -> ActorPricingInfo: """ -@docs_group('Data structures') +@docs_group('Charging') @dataclass(frozen=True) class ChargeResult: """Result of the `ChargingManager.charge` method.""" @@ -72,7 +72,7 @@ class ChargeResult: """How many events of each known type can still be charged within the limit.""" -@docs_group('Data structures') +@docs_group('Charging') @dataclass class ActorPricingInfo: """Result of the `ChargingManager.get_pricing_info` method.""" diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index aa584055..187a98b9 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -25,7 +25,7 @@ def _transform_to_list(value: Any) -> list[str] | None: return value if isinstance(value, list) else str(value).split(',') -@docs_group('Classes') +@docs_group('Configuration') class Configuration(CrawleeConfiguration): """A class for specifying the configuration of an Actor. diff --git a/src/apify/_models.py b/src/apify/_models.py index 5898a3ee..82fa9912 100644 --- a/src/apify/_models.py +++ b/src/apify/_models.py @@ -16,7 +16,7 @@ from typing import TypeAlias -@docs_group('Data structures') +@docs_group('Other') class Webhook(BaseModel): __model_config__ = ConfigDict(populate_by_name=True) @@ -35,14 +35,14 @@ class Webhook(BaseModel): ] = None -@docs_group('Data structures') +@docs_group('Actor') class ActorRunMeta(BaseModel): __model_config__ = ConfigDict(populate_by_name=True) origin: Annotated[MetaOrigin, Field()] -@docs_group('Data structures') +@docs_group('Actor') class ActorRunStats(BaseModel): __model_config__ = ConfigDict(populate_by_name=True) @@ -63,7 +63,7 @@ class ActorRunStats(BaseModel): compute_units: Annotated[float, Field(alias='computeUnits')] -@docs_group('Data structures') +@docs_group('Actor') class ActorRunOptions(BaseModel): __model_config__ = ConfigDict(populate_by_name=True) @@ -74,7 +74,7 @@ class ActorRunOptions(BaseModel): max_total_charge_usd: Annotated[Decimal | None, Field(alias='maxTotalChargeUsd')] = None -@docs_group('Data structures') +@docs_group('Actor') class ActorRunUsage(BaseModel): __model_config__ = ConfigDict(populate_by_name=True) @@ -92,7 +92,7 @@ class ActorRunUsage(BaseModel): proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None -@docs_group('Data structures') +@docs_group('Actor') class ActorRun(BaseModel): __model_config__ = ConfigDict(populate_by_name=True) diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py index 65540a85..7ae78562 100644 --- a/src/apify/_platform_event_manager.py +++ b/src/apify/_platform_event_manager.py @@ -31,13 +31,13 @@ __all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager'] -@docs_group('Data structures') +@docs_group('Event data') class PersistStateEvent(BaseModel): name: Literal[Event.PERSIST_STATE] data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))] -@docs_group('Data structures') +@docs_group('Event data') class SystemInfoEventData(BaseModel): mem_avg_bytes: 
Annotated[float, Field(alias='memAvgBytes')] mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')] @@ -64,31 +64,31 @@ def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData: ) -@docs_group('Data structures') +@docs_group('Event data') class SystemInfoEvent(BaseModel): name: Literal[Event.SYSTEM_INFO] data: SystemInfoEventData -@docs_group('Data structures') +@docs_group('Event data') class MigratingEvent(BaseModel): name: Literal[Event.MIGRATING] data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)] -@docs_group('Data structures') +@docs_group('Event data') class AbortingEvent(BaseModel): name: Literal[Event.ABORTING] data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)] -@docs_group('Data structures') +@docs_group('Event data') class ExitEvent(BaseModel): name: Literal[Event.EXIT] data: Annotated[EventExitData, Field(default_factory=EventExitData)] -@docs_group('Data structures') +@docs_group('Event data') class EventWithoutData(BaseModel): name: Literal[ Event.SESSION_RETIRED, @@ -101,13 +101,13 @@ class EventWithoutData(BaseModel): data: Any = None -@docs_group('Data structures') +@docs_group('Event data') class DeprecatedEvent(BaseModel): name: Literal['cpuInfo'] data: Annotated[dict[str, Any], Field(default_factory=dict)] -@docs_group('Data structures') +@docs_group('Event data') class UnknownEvent(BaseModel): name: str data: Annotated[dict[str, Any], Field(default_factory=dict)] @@ -120,7 +120,7 @@ class UnknownEvent(BaseModel): ) -@docs_group('Classes') +@docs_group('Event managers') class PlatformEventManager(EventManager): """A class for managing Actor events. diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index f56cb2a1..2b0e60da 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -70,7 +70,7 @@ def _check( raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}') -@docs_group('Classes') +@docs_group('Configuration') @dataclass class ProxyInfo(CrawleeProxyInfo): """Provides information about a proxy connection that is used for requests.""" @@ -90,7 +90,7 @@ class ProxyInfo(CrawleeProxyInfo): """ -@docs_group('Classes') +@docs_group('Configuration') class ProxyConfiguration(CrawleeProxyConfiguration): """Configures a connection to a proxy server with the provided options. diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 8686d5c1..3f253795 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -30,7 +30,18 @@ def is_running_in_ipython() -> bool: return getattr(builtins, '__IPYTHON__', False) -GroupName = Literal['Classes', 'Abstract classes', 'Interfaces', 'Data structures', 'Errors', 'Functions'] +# The order of the rendered API groups is defined in the docusaurus-plugin-typedoc-api. +GroupName = Literal[ + 'Actor', + 'Charging', + 'Configuration', + 'Event managers', + 'Event data', + 'Storage clients', + 'Storage data', + 'Storages', + 'Other', +] def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001 diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index f9bf3d6a..7a57e45e 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -105,9 +105,6 @@ async def open( f'(api_public_base_url={api_public_base_url}).' 
) - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - # Create Apify client with the provided token and API URL. apify_client_async = ApifyClientAsync( token=token, @@ -118,23 +115,40 @@ async def open( ) apify_datasets_client = apify_client_async.datasets() + # If both id and name are provided, raise an error. + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # If id is provided, get the storage by ID. + if id and name is None: + apify_dataset_client = apify_client_async.dataset(dataset_id=id) + # If name is provided, get or create the storage by name. - if name is not None and id is None: + if name and id is None: id = DatasetMetadata.model_validate( await apify_datasets_client.get_or_create(name=name), ).id + apify_dataset_client = apify_client_async.dataset(dataset_id=id) # If both id and name are None, try to get the default storage ID from environment variables. if id is None and name is None: - id = getattr(configuration, 'default_dataset_id', None) + id = configuration.default_dataset_id + apify_dataset_client = apify_client_async.dataset(dataset_id=id) - if id is None: - raise ValueError( - 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' - ) + # Fetch its metadata. + metadata = await apify_dataset_client.get() + + # If metadata is None, it means the storage does not exist, so we create it. + if metadata is None: + id = DatasetMetadata.model_validate( + await apify_datasets_client.get_or_create(), + ).id + apify_dataset_client = apify_client_async.dataset(dataset_id=id) - # Get the client for the specific storage by ID. - apify_dataset_client = apify_client_async.dataset(dataset_id=id) + # Verify that the storage exists by fetching its metadata again. + metadata = await apify_dataset_client.get() + if metadata is None: + raise ValueError(f'Opening dataset with id={id} and name={name} failed.') return cls( api_client=apify_dataset_client, diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 8fab6211..3900ec58 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -97,9 +97,6 @@ async def open( f'(api_public_base_url={api_public_base_url}).' ) - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - # Create Apify client with the provided token and API URL. apify_client_async = ApifyClientAsync( token=token, @@ -110,23 +107,40 @@ async def open( ) apify_kvss_client = apify_client_async.key_value_stores() + # If both id and name are provided, raise an error. + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # If id is provided, get the storage by ID. + if id and name is None: + apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) + # If name is provided, get or create the storage by name. - if name is not None and id is None: + if name and id is None: id = ApifyKeyValueStoreMetadata.model_validate( await apify_kvss_client.get_or_create(name=name), ).id + apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) # If both id and name are None, try to get the default storage ID from environment variables. 
if id is None and name is None: - id = getattr(configuration, 'default_key_value_store_id', None) + id = configuration.default_key_value_store_id + apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) - if id is None: - raise ValueError( - 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' - ) + # Fetch its metadata. + metadata = await apify_kvs_client.get() + + # If metadata is None, it means the storage does not exist, so we create it. + if metadata is None: + id = ApifyKeyValueStoreMetadata.model_validate( + await apify_kvss_client.get_or_create(), + ).id + apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) - # Get the client for the specific storage by ID. - apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) + # Verify that the storage exists by fetching its metadata again. + metadata = await apify_kvs_client.get() + if metadata is None: + raise ValueError(f'Opening key-value store with id={id} and name={name} failed.') return cls( api_client=apify_kvs_client, diff --git a/src/apify/storage_clients/_apify/_models.py b/src/apify/storage_clients/_apify/_models.py index 1c4248c1..d41e33b2 100644 --- a/src/apify/storage_clients/_apify/_models.py +++ b/src/apify/storage_clients/_apify/_models.py @@ -5,13 +5,13 @@ from pydantic import BaseModel, ConfigDict, Field -from crawlee._utils.docs import docs_group from crawlee.storage_clients.models import KeyValueStoreMetadata from apify import Request +from apify._utils import docs_group -@docs_group('Data structures') +@docs_group('Storage data') class ApifyKeyValueStoreMetadata(KeyValueStoreMetadata): """Extended key-value store metadata model for Apify platform. @@ -22,7 +22,7 @@ class ApifyKeyValueStoreMetadata(KeyValueStoreMetadata): """The secret key used for signing URLs for secure access to key-value store records.""" -@docs_group('Data structures') +@docs_group('Storage data') class ProlongRequestLockResponse(BaseModel): """Response to prolong request lock calls.""" @@ -31,7 +31,7 @@ class ProlongRequestLockResponse(BaseModel): lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] -@docs_group('Data structures') +@docs_group('Storage data') class RequestQueueHead(BaseModel): """Model for request queue head. @@ -61,7 +61,10 @@ class RequestQueueHead(BaseModel): class KeyValueStoreKeyInfo(BaseModel): - """Model for a key-value store key info.""" + """Model for a key-value store key info. + + Only internal structure. + """ model_config = ConfigDict(populate_by_name=True) @@ -70,7 +73,10 @@ class KeyValueStoreKeyInfo(BaseModel): class KeyValueStoreListKeysPage(BaseModel): - """Model for listing keys in the key-value store.""" + """Model for listing keys in the key-value store. + + Only internal structure. + """ model_config = ConfigDict(populate_by_name=True) @@ -83,7 +89,10 @@ class KeyValueStoreListKeysPage(BaseModel): class CachedRequest(BaseModel): - """Pydantic model for cached request information.""" + """Pydantic model for cached request information. + + Only internal structure. 
+ """ id: str """The ID of the request.""" diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 4896c743..f4c8fed8 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -39,7 +39,6 @@ class ApifyRequestQueueClient(RequestQueueClient): def __init__( self, *, - metadata: RequestQueueMetadata, api_client: RequestQueueClientAsync, api_public_base_url: str, lock: asyncio.Lock, @@ -48,8 +47,6 @@ def __init__( Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. """ - self._metadata = metadata - self._api_client = api_client """The Apify request queue client for API operations.""" @@ -122,9 +119,6 @@ async def open( f'(api_public_base_url={api_public_base_url}).' ) - if id and name: - raise ValueError('Only one of "id" or "name" can be specified, not both.') - # Create Apify client with the provided token and API URL. apify_client_async = ApifyClientAsync( token=token, @@ -135,29 +129,42 @@ async def open( ) apify_rqs_client = apify_client_async.request_queues() + # If both id and name are provided, raise an error. + if id and name: + raise ValueError('Only one of "id" or "name" can be specified, not both.') + + # If id is provided, get the storage by ID. + if id and name is None: + apify_rq_client = apify_client_async.request_queue(request_queue_id=id) + # If name is provided, get or create the storage by name. - if name is not None and id is None: + if name and id is None: id = RequestQueueMetadata.model_validate( await apify_rqs_client.get_or_create(name=name), ).id + apify_rq_client = apify_client_async.request_queue(request_queue_id=id) # If both id and name are None, try to get the default storage ID from environment variables. if id is None and name is None: - id = getattr(configuration, 'default_request_queue_id', None) + id = configuration.default_request_queue_id + apify_rq_client = apify_client_async.request_queue(request_queue_id=id) - if id is None: - raise ValueError( - 'Either "id" or "name" must be provided, or the storage ID must be set in environment variable.' - ) + # Fetch its metadata. + metadata = await apify_rq_client.get() - # Get the client for the specific storage by ID. - apify_rq_client = apify_client_async.request_queue(request_queue_id=id) + # If metadata is None, it means the storage does not exist, so we create it. + if metadata is None: + id = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(), + ).id + apify_rq_client = apify_client_async.request_queue(request_queue_id=id) - # Fetch its metadata. - metadata = RequestQueueMetadata.model_validate(await apify_rq_client.get()) + # Verify that the storage exists by fetching its metadata again. + metadata = await apify_rq_client.get() + if metadata is None: + raise ValueError(f'Opening request queue with id={id} and name={name} failed.') return cls( - metadata=metadata, api_client=apify_rq_client, api_public_base_url=api_public_base_url, lock=asyncio.Lock(), @@ -353,6 +360,14 @@ async def is_empty(self) -> bool: True if the queue is empty, False otherwise. """ head = await self._list_head(limit=1, lock_time=None) + + # This if condition is necessary for proper functioning of the queue. + # Investigate why it is needed and if it can be removed. 
+ if len(head.items) == 0: + logger.warning('I am giving up, but I will sleep for a while before checking again.') + await asyncio.sleep(10) + head = await self._list_head(limit=1, lock_time=None) + return len(head.items) == 0 async def _ensure_head_is_non_empty(self) -> None: @@ -477,10 +492,12 @@ async def _list_head( if cached_request and cached_request.hydrated: items.append(cached_request.hydrated) + metadata = await self.get_metadata() + return RequestQueueHead( limit=limit, - had_multiple_clients=self._metadata.had_multiple_clients, - queue_modified_at=self._metadata.modified_at, + had_multiple_clients=metadata.had_multiple_clients, + queue_modified_at=metadata.modified_at, items=items, queue_has_locked_requests=self._queue_has_locked_requests, lock_time=lock_time, diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 9d43b983..689e2c77 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -9,11 +9,13 @@ from ._dataset_client import ApifyDatasetClient from ._key_value_store_client import ApifyKeyValueStoreClient from ._request_queue_client import ApifyRequestQueueClient +from apify._utils import docs_group if TYPE_CHECKING: from crawlee.configuration import Configuration +@docs_group('Storage clients') class ApifyStorageClient(StorageClient): """Apify storage client.""" diff --git a/src/apify/storage_clients/_file_system/_storage_client.py b/src/apify/storage_clients/_file_system/_storage_client.py index f0039cc9..403943e3 100644 --- a/src/apify/storage_clients/_file_system/_storage_client.py +++ b/src/apify/storage_clients/_file_system/_storage_client.py @@ -4,7 +4,6 @@ from typing_extensions import override -from crawlee._utils.docs import docs_group from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient @@ -14,7 +13,6 @@ from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient -@docs_group('Classes') class ApifyFileSystemStorageClient(FileSystemStorageClient): """Apify-specific implementation of the file system storage client. diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index 3e784064..b7e79f73 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -38,7 +38,7 @@ class _SimpleUrlInput(_RequestDetails): url_input_adapter = TypeAdapter(list[_RequestsFromUrlInput | _SimpleUrlInput]) -@docs_group('Classes') +@docs_group('Storages') class RequestList(CrawleeRequestList): """Extends crawlee RequestList. 
diff --git a/uv.lock b/uv.lock index 193a3dc3..d805faae 100644 --- a/uv.lock +++ b/uv.lock @@ -72,7 +72,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=1.12.0" }, { name = "apify-shared", specifier = ">=1.3.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=0c4cfc9ada06e35f63213e6a937c4e85defcbecf" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=master" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "lazy-object-proxy", specifier = "<1.11.0" }, @@ -559,7 +559,7 @@ toml = [ [[package]] name = "crawlee" version = "0.6.12" -source = { git = "https://github.com/apify/crawlee-python.git?rev=0c4cfc9ada06e35f63213e6a937c4e85defcbecf#0c4cfc9ada06e35f63213e6a937c4e85defcbecf" } +source = { git = "https://github.com/apify/crawlee-python.git?rev=master#c56085070a4c56d77ac926f8486c162b69235735" } dependencies = [ { name = "apify-fingerprint-datapoints" }, { name = "browserforge" }, From bc61feee081a250c4ad142f28faba5e651b615e8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 21 Jul 2025 12:37:18 +0200 Subject: [PATCH 29/44] Add test for `RequestQueue.is_finished` --- tests/integration/test_actor_request_queue.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 9689367a..d4730b00 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -86,3 +86,27 @@ async def test_force_cloud( assert request_queue_request['url'] == 'http://example.com' finally: await request_queue_client.delete() + + +async def test_request_queue_is_finished( + apify_client_async: ApifyClientAsync, + monkeypatch: pytest.MonkeyPatch, +) -> None: + assert apify_client_async.token is not None + monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_client_async.token) + + request_queue_name = generate_unique_resource_name('request_queue') + + async with Actor: + request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + await request_queue.add_request(Request.from_url('http://example.com')) + assert not await request_queue.is_finished() + + request = await request_queue.fetch_next_request() + assert request is not None + assert not await request_queue.is_finished(), ( + 'RequestQueue should not be finished unless the request is marked as handled.' + ) + + await request_queue.mark_request_as_handled(request) + assert await request_queue.is_finished() From 16b76dd9a62d524927536a0380bf9491e180e959 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 21 Jul 2025 13:30:13 +0200 Subject: [PATCH 30/44] Check `_queue_has_locked_requests` in `is_empty` --- .../storage_clients/_apify/_request_queue_client.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index f4c8fed8..d7a19837 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -361,14 +361,7 @@ async def is_empty(self) -> bool: """ head = await self._list_head(limit=1, lock_time=None) - # This if condition is necessary for proper functioning of the queue. - # Investigate why it is needed and if it can be removed. 
- if len(head.items) == 0: - logger.warning('I am giving up, but I will sleep for a while before checking again.') - await asyncio.sleep(10) - head = await self._list_head(limit=1, lock_time=None) - - return len(head.items) == 0 + return len(head.items) == 0 and not self._queue_has_locked_requests async def _ensure_head_is_non_empty(self) -> None: """Ensure that the queue head has requests if they are available in the queue.""" From a3f8c6edb83afab5a3c30b4b8074582c1806bb13 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 22 Jul 2025 15:21:07 +0200 Subject: [PATCH 31/44] Package structure update --- src/apify/_actor.py | 4 +- src/apify/events/__init__.py | 5 + .../_apify_event_manager.py} | 108 ++---------------- src/apify/events/_types.py | 102 +++++++++++++++++ src/apify/events/py.typed | 0 src/apify/request_loaders/__init__.py | 18 +++ .../_apify_request_list.py} | 18 +-- src/apify/request_loaders/py.typed | 0 src/apify/storages/__init__.py | 4 +- tests/unit/actor/test_request_list.py | 13 ++- tests/unit/events/__init__.py | 0 .../test_apify_event_manager.py} | 15 +-- 12 files changed, 161 insertions(+), 126 deletions(-) create mode 100644 src/apify/events/__init__.py rename src/apify/{_platform_event_manager.py => events/_apify_event_manager.py} (58%) create mode 100644 src/apify/events/_types.py create mode 100644 src/apify/events/py.typed create mode 100644 src/apify/request_loaders/__init__.py rename src/apify/{storages/_request_list.py => request_loaders/_apify_request_list.py} (90%) create mode 100644 src/apify/request_loaders/py.typed create mode 100644 tests/unit/events/__init__.py rename tests/unit/{test_platform_event_manager.py => events/test_apify_event_manager.py} (93%) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 22f71225..f2ec00ac 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -30,9 +30,9 @@ from apify._consts import EVENT_LISTENERS_TIMEOUT from apify._crypto import decrypt_input_secrets, load_private_key from apify._models import ActorRun -from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager from apify._proxy_configuration import ProxyConfiguration from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython +from apify.events import ApifyEventManager, EventManager, LocalEventManager from apify.log import _configure_logging, logger from apify.storage_clients import ApifyStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue @@ -130,7 +130,7 @@ def __init__( # Set the event manager based on whether the Actor is running on the platform or locally. 
self._event_manager = ( - PlatformEventManager( + ApifyEventManager( config=self._configuration, persist_state_interval=self._configuration.persist_state_interval, ) diff --git a/src/apify/events/__init__.py b/src/apify/events/__init__.py new file mode 100644 index 00000000..c50c4ab8 --- /dev/null +++ b/src/apify/events/__init__.py @@ -0,0 +1,5 @@ +from crawlee.events import EventManager, LocalEventManager + +from ._apify_event_manager import ApifyEventManager + +__all__ = ['ApifyEventManager', 'EventManager', 'LocalEventManager'] diff --git a/src/apify/_platform_event_manager.py b/src/apify/events/_apify_event_manager.py similarity index 58% rename from src/apify/_platform_event_manager.py rename to src/apify/events/_apify_event_manager.py index 41d9379e..5b6e6f55 100644 --- a/src/apify/_platform_event_manager.py +++ b/src/apify/events/_apify_event_manager.py @@ -1,118 +1,26 @@ from __future__ import annotations import asyncio -from datetime import datetime -from typing import TYPE_CHECKING, Annotated, Any, Literal +from typing import TYPE_CHECKING, Annotated import websockets.asyncio.client -from pydantic import BaseModel, Discriminator, Field, TypeAdapter +from pydantic import Discriminator, TypeAdapter from typing_extensions import Self, Unpack, override -from crawlee.events._event_manager import EventManager, EventManagerOptions -from crawlee.events._local_event_manager import LocalEventManager -from crawlee.events._types import ( - Event, - EventAbortingData, - EventExitData, - EventMigratingData, - EventPersistStateData, - EventSystemInfoData, -) +from crawlee.events import EventManager +from crawlee.events._types import Event, EventPersistStateData from apify._utils import docs_group +from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent from apify.log import logger if TYPE_CHECKING: from types import TracebackType - from apify._configuration import Configuration - -__all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager'] - - -@docs_group('Event data') -class SystemInfoEventData(BaseModel): - mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')] - mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')] - mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')] - cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')] - cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')] - cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')] - is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')] - created_at: Annotated[datetime, Field(alias='createdAt')] - - def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData: - return EventSystemInfoData.model_validate( - { - 'cpu_info': { - 'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus, - 'created_at': self.created_at, - }, - 'memory_info': { - 'total_size': self.mem_max_bytes, - 'current_size': self.mem_current_bytes, - 'created_at': self.created_at, - }, - } - ) - - -@docs_group('Events') -class PersistStateEvent(BaseModel): - name: Literal[Event.PERSIST_STATE] - data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))] - - -@docs_group('Events') -class SystemInfoEvent(BaseModel): - name: Literal[Event.SYSTEM_INFO] - data: SystemInfoEventData - - -@docs_group('Events') -class MigratingEvent(BaseModel): - name: Literal[Event.MIGRATING] - data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)] - - -@docs_group('Events') -class 
AbortingEvent(BaseModel): - name: Literal[Event.ABORTING] - data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)] - - -@docs_group('Events') -class ExitEvent(BaseModel): - name: Literal[Event.EXIT] - data: Annotated[EventExitData, Field(default_factory=EventExitData)] - - -@docs_group('Events') -class EventWithoutData(BaseModel): - name: Literal[ - Event.SESSION_RETIRED, - Event.BROWSER_LAUNCHED, - Event.BROWSER_RETIRED, - Event.BROWSER_CLOSED, - Event.PAGE_CREATED, - Event.PAGE_CLOSED, - ] - data: Any = None - - -@docs_group('Events') -class DeprecatedEvent(BaseModel): - name: Literal['cpuInfo'] - data: Annotated[dict[str, Any], Field(default_factory=dict)] - - -@docs_group('Events') -class UnknownEvent(BaseModel): - name: str - data: Annotated[dict[str, Any], Field(default_factory=dict)] + from crawlee.events._event_manager import EventManagerOptions + from apify._configuration import Configuration -EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent]( Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent @@ -120,7 +28,7 @@ class UnknownEvent(BaseModel): @docs_group('Event managers') -class PlatformEventManager(EventManager): +class ApifyEventManager(EventManager): """A class for managing Actor events. You shouldn't use this class directly, diff --git a/src/apify/events/_types.py b/src/apify/events/_types.py new file mode 100644 index 00000000..f6ff3ee6 --- /dev/null +++ b/src/apify/events/_types.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from datetime import datetime +from typing import Annotated, Any, Literal + +from pydantic import BaseModel, Field + +from crawlee.events._types import ( + Event, + EventAbortingData, + EventExitData, + EventMigratingData, + EventPersistStateData, + EventSystemInfoData, +) + +from apify._utils import docs_group + + +@docs_group('Event data') +class SystemInfoEventData(BaseModel): + mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')] + mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')] + mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')] + cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')] + cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')] + cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')] + is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')] + created_at: Annotated[datetime, Field(alias='createdAt')] + + def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData: + return EventSystemInfoData.model_validate( + { + 'cpu_info': { + 'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus, + 'created_at': self.created_at, + }, + 'memory_info': { + 'total_size': self.mem_max_bytes, + 'current_size': self.mem_current_bytes, + 'created_at': self.created_at, + }, + } + ) + + +@docs_group('Events') +class PersistStateEvent(BaseModel): + name: Literal[Event.PERSIST_STATE] + data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))] + + +@docs_group('Events') +class SystemInfoEvent(BaseModel): + name: Literal[Event.SYSTEM_INFO] + data: SystemInfoEventData + + +@docs_group('Events') +class MigratingEvent(BaseModel): + name: Literal[Event.MIGRATING] + data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)] + + +@docs_group('Events') +class AbortingEvent(BaseModel): + name: 
Literal[Event.ABORTING] + data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)] + + +@docs_group('Events') +class ExitEvent(BaseModel): + name: Literal[Event.EXIT] + data: Annotated[EventExitData, Field(default_factory=EventExitData)] + + +@docs_group('Events') +class EventWithoutData(BaseModel): + name: Literal[ + Event.SESSION_RETIRED, + Event.BROWSER_LAUNCHED, + Event.BROWSER_RETIRED, + Event.BROWSER_CLOSED, + Event.PAGE_CREATED, + Event.PAGE_CLOSED, + ] + data: Any = None + + +@docs_group('Events') +class DeprecatedEvent(BaseModel): + name: Literal['cpuInfo'] + data: Annotated[dict[str, Any], Field(default_factory=dict)] + + +@docs_group('Events') +class UnknownEvent(BaseModel): + name: str + data: Annotated[dict[str, Any], Field(default_factory=dict)] + + +EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData diff --git a/src/apify/events/py.typed b/src/apify/events/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/apify/request_loaders/__init__.py b/src/apify/request_loaders/__init__.py new file mode 100644 index 00000000..faf48e1d --- /dev/null +++ b/src/apify/request_loaders/__init__.py @@ -0,0 +1,18 @@ +from crawlee.request_loaders import ( + RequestList, + RequestLoader, + RequestManager, + RequestManagerTandem, + SitemapRequestLoader, +) + +from ._apify_request_list import ApifyRequestList + +__all__ = [ + 'ApifyRequestList', + 'RequestList', + 'RequestLoader', + 'RequestManager', + 'RequestManagerTandem', + 'SitemapRequestLoader', +] diff --git a/src/apify/storages/_request_list.py b/src/apify/request_loaders/_apify_request_list.py similarity index 90% rename from src/apify/storages/_request_list.py rename to src/apify/request_loaders/_apify_request_list.py index 6ffc0ae6..7065f3dd 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/request_loaders/_apify_request_list.py @@ -10,7 +10,7 @@ from crawlee._types import HttpMethod from crawlee.http_clients import HttpClient, HttpxHttpClient -from crawlee.request_loaders import RequestList as CrawleeRequestList +from crawlee.request_loaders import RequestList from apify import Request from apify._utils import docs_group @@ -39,7 +39,7 @@ class _SimpleUrlInput(_RequestDetails): @docs_group('Request loaders') -class RequestList(CrawleeRequestList): +class ApifyRequestList(RequestList): """Extends crawlee RequestList. Method open is used to create RequestList from actor's requestListSources input. @@ -50,7 +50,7 @@ async def open( name: str | None = None, request_list_sources_input: list[dict[str, Any]] | None = None, http_client: HttpClient | None = None, - ) -> RequestList: + ) -> ApifyRequestList: """Initialize a new instance from request list source input. 
Args: @@ -74,12 +74,12 @@ async def open( ``` """ request_list_sources_input = request_list_sources_input or [] - return await RequestList._create_request_list(name, request_list_sources_input, http_client) + return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client) @staticmethod async def _create_request_list( name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None - ) -> RequestList: + ) -> ApifyRequestList: if not http_client: http_client = HttpxHttpClient() @@ -88,10 +88,12 @@ async def _create_request_list( simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)] remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)] - simple_url_requests = RequestList._create_requests_from_input(simple_url_inputs) - remote_url_requests = await RequestList._fetch_requests_from_url(remote_url_inputs, http_client=http_client) + simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs) + remote_url_requests = await ApifyRequestList._fetch_requests_from_url( + remote_url_inputs, http_client=http_client + ) - return RequestList(name=name, requests=simple_url_requests + remote_url_requests) + return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests) @staticmethod def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]: diff --git a/src/apify/request_loaders/py.typed b/src/apify/request_loaders/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index 3cd0dfe8..2ed85e84 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,5 +1,3 @@ from crawlee.storages import Dataset, KeyValueStore, RequestQueue -from ._request_list import RequestList - -__all__ = ['Dataset', 'KeyValueStore', 'RequestList', 'RequestQueue'] +__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue'] diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index 9efcdce7..42f6717e 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -11,7 +11,8 @@ from crawlee._request import UserData from crawlee._types import HttpMethod -from apify.storages._request_list import URL_NO_COMMAS_REGEX, RequestList +from apify.request_loaders import ApifyRequestList +from apify.request_loaders._apify_request_list import URL_NO_COMMAS_REGEX @pytest.mark.parametrize( @@ -49,7 +50,7 @@ async def test_request_list_open_request_types( } request_dict_input = {**minimal_request_dict_input, **optional_input} - request_list = await RequestList.open(request_list_sources_input=[request_dict_input]) + request_list = await ApifyRequestList.open(request_list_sources_input=[request_dict_input]) assert not await request_list.is_empty() request = await request_list.fetch_next_request() @@ -90,7 +91,7 @@ async def test_request_list_open_from_url_correctly_send_requests() -> None: routes = [respx.get(entry['requestsFromUrl']) for entry in request_list_sources_input] - await RequestList.open(request_list_sources_input=request_list_sources_input) + await ApifyRequestList.open(request_list_sources_input=request_list_sources_input) for route in routes: assert route.called @@ -134,7 +135,7 @@ class MockedUrlInfo: for mocked_url in mocked_urls: respx.get(mocked_url.url).mock(return_value=Response(200, text=mocked_url.response_text)) 
- request_list = await RequestList.open(request_list_sources_input=request_list_sources_input) + request_list = await ApifyRequestList.open(request_list_sources_input=request_list_sources_input) generated_requests = [] while request := await request_list.fetch_next_request(): generated_requests.append(request) @@ -157,7 +158,7 @@ async def test_request_list_open_from_url_additional_inputs() -> None: respx.get(example_start_url_input['requestsFromUrl']).mock(return_value=Response(200, text=expected_url)) - request_list = await RequestList.open(request_list_sources_input=[example_start_url_input]) + request_list = await ApifyRequestList.open(request_list_sources_input=[example_start_url_input]) request = await request_list.fetch_next_request() # Check all properties correctly created for request @@ -174,7 +175,7 @@ async def test_request_list_open_from_url_additional_inputs() -> None: async def test_request_list_open_name() -> None: name = 'some_name' - request_list = await RequestList.open(name=name) + request_list = await ApifyRequestList.open(name=name) assert request_list.name == name diff --git a/tests/unit/events/__init__.py b/tests/unit/events/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/test_platform_event_manager.py b/tests/unit/events/test_apify_event_manager.py similarity index 93% rename from tests/unit/test_platform_event_manager.py rename to tests/unit/events/test_apify_event_manager.py index 7389d4da..410a577a 100644 --- a/tests/unit/test_platform_event_manager.py +++ b/tests/unit/events/test_apify_event_manager.py @@ -15,7 +15,8 @@ from crawlee.events._types import Event from apify import Configuration -from apify._platform_event_manager import PlatformEventManager, SystemInfoEventData +from apify.events import ApifyEventManager +from apify.events._types import SystemInfoEventData if TYPE_CHECKING: from collections.abc import Callable @@ -26,7 +27,7 @@ async def test_lifecycle_local(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.DEBUG, logger='apify') config = Configuration.get_global_configuration() - async with PlatformEventManager(config): + async with ApifyEventManager(config): pass assert len(caplog.records) == 1 @@ -40,7 +41,7 @@ async def test_lifecycle_local(caplog: pytest.LogCaptureFixture) -> None: async def test_event_handling_local() -> None: config = Configuration.get_global_configuration() - async with PlatformEventManager(config) as event_manager: + async with ApifyEventManager(config) as event_manager: event_calls = defaultdict(list) def on_event(event: Event, id: int | None = None) -> Callable: @@ -110,7 +111,7 @@ async def test_event_async_handling_local() -> None: dummy_system_info = Mock() config = Configuration.get_global_configuration() - async with PlatformEventManager(config) as event_manager: + async with ApifyEventManager(config) as event_manager: event_calls = [] async def event_handler(data: Any) -> None: @@ -129,7 +130,7 @@ async def event_handler(data: Any) -> None: async def test_lifecycle_on_platform_without_websocket(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, 'ws://localhost:56565') - event_manager = PlatformEventManager(Configuration.get_global_configuration()) + event_manager = ApifyEventManager(Configuration.get_global_configuration()) with pytest.raises(RuntimeError, match='Error connecting to platform events websocket!'): async with event_manager: @@ -152,7 +153,7 @@ async def handler(websocket: 
websockets.asyncio.server.ServerConnection) -> None port: int = ws_server.sockets[0].getsockname()[1] # type: ignore[index] monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, f'ws://localhost:{port}') - async with PlatformEventManager(Configuration.get_global_configuration()): + async with ApifyEventManager(Configuration.get_global_configuration()): assert len(connected_ws_clients) == 1 @@ -191,7 +192,7 @@ async def send_platform_event(event_name: Event, data: Any = None) -> None: } SystemInfoEventData.model_validate(dummy_system_info) - async with PlatformEventManager(Configuration.get_global_configuration()) as event_manager: + async with ApifyEventManager(Configuration.get_global_configuration()) as event_manager: event_calls = [] def listener(data: Any) -> None: From 594a8e556703b0aa0b2df2ca13624dc6a7110051 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 22 Jul 2025 16:27:48 +0200 Subject: [PATCH 32/44] Fix request list (HttpResponse.read is now async) --- src/apify/request_loaders/_apify_request_list.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/apify/request_loaders/_apify_request_list.py b/src/apify/request_loaders/_apify_request_list.py index 7065f3dd..3524153e 100644 --- a/src/apify/request_loaders/_apify_request_list.py +++ b/src/apify/request_loaders/_apify_request_list.py @@ -3,7 +3,6 @@ import asyncio import re from asyncio import Task -from functools import partial from typing import Annotated, Any from pydantic import BaseModel, Field, TypeAdapter @@ -121,13 +120,15 @@ async def _fetch_requests_from_url( """ created_requests: list[Request] = [] - def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: + async def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: """Extract links from response body and use them to create `Request` objects. Use the regular expression to find all matching links in the response body, then create `Request` objects from these links and the provided input attributes. """ - matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) + response = await (task.result()).read() + matches = re.finditer(URL_NO_COMMAS_REGEX, response.decode('utf-8')) + created_requests.extend( [ Request.from_url( @@ -150,7 +151,11 @@ def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Ta ) ) - get_response_task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) + get_response_task.add_done_callback( + lambda task, inp=remote_url_requests_input: asyncio.create_task( # type: ignore[misc] + create_requests_from_response(inp, task) + ) + ) remote_url_requests.append(get_response_task) await asyncio.gather(*remote_url_requests) From e1afe2d7dd99fd304c4c1d08bc27080cb60687f9 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 24 Jul 2025 16:09:13 +0200 Subject: [PATCH 33/44] init upgrading guide to v3 --- docs/04_upgrading/upgrading_to_v2.md | 4 ++-- docs/04_upgrading/upgrading_to_v3.md | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 docs/04_upgrading/upgrading_to_v3.md diff --git a/docs/04_upgrading/upgrading_to_v2.md b/docs/04_upgrading/upgrading_to_v2.md index 90062305..1fd1d111 100644 --- a/docs/04_upgrading/upgrading_to_v2.md +++ b/docs/04_upgrading/upgrading_to_v2.md @@ -3,7 +3,7 @@ id: upgrading-to-v2 title: Upgrading to v2 --- -This page summarizes most of the breaking changes between Apify Python SDK v1.x and v2.0. 
+This page summarizes the breaking changes between Apify Python SDK v1.x and v2.0. ## Python version support @@ -12,7 +12,7 @@ Support for Python 3.8 has been dropped. The Apify Python SDK v2.x now requires ## Storages - The SDK now uses [crawlee](https://github.com/apify/crawlee-python) for local storage emulation. This change should not affect intended usage (working with `Dataset`, `KeyValueStore` and `RequestQueue` classes from the `apify.storages` module or using the shortcuts exposed by the `Actor` class) in any way. -- There is a difference in the `RequestQueue.add_request` method: it accepts an `apify.Request` object instead of a free-form dictionary. +- There is a difference in the `RequestQueue.add_request` method: it accepts an `apify.Request` object instead of a free-form dictionary. - A quick way to migrate from dict-based arguments is to wrap it with a `Request.model_validate()` call. - The preferred way is using the `Request.from_url` helper which prefills the `unique_key` and `id` attributes, or instantiating it directly, e.g., `Request(url='https://example.tld', ...)`. - For simple use cases, `add_request` also accepts plain strings that contain an URL, e.g. `queue.add_request('https://example.tld')`. diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md new file mode 100644 index 00000000..eba1f2d4 --- /dev/null +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -0,0 +1,18 @@ +--- +id: upgrading-to-v2 +title: Upgrading to v2 +--- + +This page summarizes the breaking changes between Apify Python SDK v2.x and v3.0. + +## Python version support + +Support for Python 3.9 has been dropped. The Apify Python SDK v3.x now requires Python 3.10 or later. Make sure your environment is running a compatible version before upgrading. + +## Storages + + + +## Storage clients + + From 8ce69020916267f4c372f7aa8b68b1c5919c2eb5 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 25 Jul 2025 13:19:16 +0200 Subject: [PATCH 34/44] addres RQ feedback from Pepa --- .../_apify/_request_queue_client.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index d7a19837..a3af7842 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -40,7 +40,6 @@ def __init__( self, *, api_client: RequestQueueClientAsync, - api_public_base_url: str, lock: asyncio.Lock, ) -> None: """Initialize a new instance. @@ -50,9 +49,6 @@ def __init__( self._api_client = api_client """The Apify request queue client for API operations.""" - self._api_public_base_url = api_public_base_url - """The public base URL for accessing the key-value store records.""" - self._lock = lock """A lock to ensure that only one operation is performed at a time.""" @@ -166,7 +162,6 @@ async def open( return cls( api_client=apify_rq_client, - api_public_base_url=api_public_base_url, lock=asyncio.Lock(), ) @@ -198,13 +193,14 @@ async def add_batch_of_requests( Returns: Response containing information about the added requests. """ - # Prepare requests for API by converting to dictionaries - requests_dict = [request.model_dump(by_alias=True) for request in requests] - - # Remove 'id' fields from requests as the API doesn't accept them - for request_dict in requests_dict: - if 'id' in request_dict: - del request_dict['id'] + # Prepare requests for API by converting to dictionaries. 
+ requests_dict = [ + request.model_dump( + by_alias=True, + exclude={'id'}, # Exclude ID fields from requests since the API doesn't accept them. + ) + for request in requests + ] # Send requests to API response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) From 42810f072256d1bcdaab919d4343252d97eea05a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 25 Jul 2025 13:39:47 +0200 Subject: [PATCH 35/44] minor RQ client update --- .../storage_clients/_apify/_request_queue_client.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index a3af7842..9cf44d4e 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -1,6 +1,5 @@ from __future__ import annotations -import asyncio from collections import deque from datetime import datetime, timedelta, timezone from logging import getLogger @@ -40,7 +39,6 @@ def __init__( self, *, api_client: RequestQueueClientAsync, - lock: asyncio.Lock, ) -> None: """Initialize a new instance. @@ -49,14 +47,11 @@ def __init__( self._api_client = api_client """The Apify request queue client for API operations.""" - self._lock = lock - """A lock to ensure that only one operation is performed at a time.""" - self._queue_head = deque[str]() """A deque to store request IDs in the queue head.""" self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) - """A cache to store request objects.""" + """A cache to store request objects. Request ID is used as the cache key.""" self._queue_has_locked_requests: bool | None = None """Whether the queue has requests locked by another client.""" @@ -162,7 +157,6 @@ async def open( return cls( api_client=apify_rq_client, - lock=asyncio.Lock(), ) @override @@ -174,8 +168,7 @@ async def purge(self) -> None: @override async def drop(self) -> None: - async with self._lock: - await self._api_client.delete() + await self._api_client.delete() @override async def add_batch_of_requests( @@ -632,7 +625,7 @@ def _cache_request( """Cache a request for future use. Args: - cache_key: The key to use for caching the request. + cache_key: The key to use for caching the request. It should be request ID. processed_request: The processed request information. forefront: Whether the request was added to the forefront of the queue. hydrated_request: The hydrated request object, if available. 
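The two request-queue client patches above tighten how requests are serialized for the batch-add API call: a single `model_dump(by_alias=True, exclude={'id'})` replaces dumping followed by manual deletion of the `id` key, and the request cache is now documented as being keyed by request ID. Below is a minimal sketch of that serialization step, assuming only the `apify.Request` model and the pydantic `model_dump` call shown in the diff; it is illustrative and not part of the patches themselves.

```python
from apify import Request

# Build a request the same way the SDK helpers do elsewhere in this series.
request = Request.from_url('https://example.com', method='GET')

# Serialize for the Apify API: emit aliased field names and drop the client-side
# 'id' in one step instead of deleting it from the resulting dict afterwards.
payload = request.model_dump(by_alias=True, exclude={'id'})

assert 'id' not in payload  # the API does not accept an 'id' field
```
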
From ec2a9f0c24e8d4f6e1db8a130d82c4f1326f8bda Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 29 Jul 2025 12:07:32 +0200 Subject: [PATCH 36/44] Fix 2 tests in RQ Apify storage client --- .../storage_clients/_apify/_request_queue_client.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 9cf44d4e..faa4ab87 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -215,7 +215,7 @@ async def get_request(self, request_id: str) -> Request | None: if response is None: return None - return Request.model_validate(**response) + return Request.model_validate(response) @override async def fetch_next_request(self) -> Request | None: @@ -256,6 +256,15 @@ async def fetch_next_request(self) -> Request | None: ) return None + # Use get request to ensure we have the full request object. + request = await self.get_request(request.id) + if request is None: + logger.debug( + 'Request fetched from the beginning of queue was not found in the RQ', + extra={'nextRequestId': next_request_id}, + ) + return None + return request @override From 71ac38d90528052d7a8dc98469fa4dfd25196f72 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sun, 3 Aug 2025 09:49:27 +0200 Subject: [PATCH 37/44] Update request queue to use manual request tracking --- .../storage_clients/_apify/_dataset_client.py | 2 + .../_apify/_key_value_store_client.py | 2 + .../_apify/_request_queue_client.py | 85 +++++++++++++++++-- 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index 7a57e45e..385d6522 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -131,6 +131,8 @@ async def open( apify_dataset_client = apify_client_async.dataset(dataset_id=id) # If both id and name are None, try to get the default storage ID from environment variables. + # The default storage ID environment variable is set by the Apify platform. It also contains + # a new storage ID after Actor's reboot or migration. if id is None and name is None: id = configuration.default_dataset_id apify_dataset_client = apify_client_async.dataset(dataset_id=id) diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 3900ec58..fb841320 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -123,6 +123,8 @@ async def open( apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) # If both id and name are None, try to get the default storage ID from environment variables. + # The default storage ID environment variable is set by the Apify platform. It also contains + # a new storage ID after Actor's reboot or migration. 
if id is None and name is None: id = configuration.default_key_value_store_id apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index faa4ab87..8a6dfa89 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -39,6 +39,10 @@ def __init__( self, *, api_client: RequestQueueClientAsync, + id: str, + name: str | None, + total_request_count: int, + handled_request_count: int, ) -> None: """Initialize a new instance. @@ -47,6 +51,12 @@ def __init__( self._api_client = api_client """The Apify request queue client for API operations.""" + self._id = id + """The ID of the request queue.""" + + self._name = name + """The name of the request queue.""" + self._queue_head = deque[str]() """A deque to store request IDs in the queue head.""" @@ -59,10 +69,38 @@ def __init__( self._should_check_for_forefront_requests = False """Whether to check for forefront requests in the next list_head call.""" + self._had_multiple_clients = False + """Whether the request queue has been accessed by multiple clients.""" + + self._initial_total_count = total_request_count + """The initial total request count (from the API) when the queue was opened.""" + + self._initial_handled_count = handled_request_count + """The initial handled request count (from the API) when the queue was opened.""" + + self._assumed_total_count = 0 + """The number of requests we assume are in the queue (tracked manually for this instance).""" + + self._assumed_handled_count = 0 + """The number of requests we assume have been handled (tracked manually for this instance).""" + @override async def get_metadata(self) -> RequestQueueMetadata: - metadata = await self._api_client.get() - return RequestQueueMetadata.model_validate(metadata) + total_count = self._initial_total_count + self._assumed_total_count + handled_count = self._initial_handled_count + self._assumed_handled_count + pending_count = total_count - handled_count + + return RequestQueueMetadata( + id=self._id, + name=self._name, + total_request_count=total_count, + handled_request_count=handled_count, + pending_request_count=pending_count, + created_at=datetime.now(timezone.utc), + modified_at=datetime.now(timezone.utc), + accessed_at=datetime.now(timezone.utc), + had_multiple_clients=self._had_multiple_clients, + ) @classmethod async def open( @@ -136,6 +174,8 @@ async def open( apify_rq_client = apify_client_async.request_queue(request_queue_id=id) # If both id and name are None, try to get the default storage ID from environment variables. + # The default storage ID environment variable is set by the Apify platform. It also contains + # a new storage ID after Actor's reboot or migration. if id is None and name is None: id = configuration.default_request_queue_id apify_rq_client = apify_client_async.request_queue(request_queue_id=id) @@ -155,8 +195,20 @@ async def open( if metadata is None: raise ValueError(f'Opening request queue with id={id} and name={name} failed.') + metadata_model = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(), + ) + + # Ensure we have a valid ID. 
+ if id is None: + raise ValueError('Request queue ID cannot be None.') + return cls( api_client=apify_rq_client, + id=id, + name=name, + total_request_count=metadata_model.total_request_count, + handled_request_count=metadata_model.handled_request_count, ) @override @@ -195,10 +247,19 @@ async def add_batch_of_requests( for request in requests ] - # Send requests to API + # Send requests to API. response = await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) - return AddRequestsResponse.model_validate(response) + # Update assumed total count for newly added requests. + api_response = AddRequestsResponse.model_validate(response) + new_request_count = 0 + for processed_request in api_response.processed_requests: + if not processed_request.was_already_present and not processed_request.was_already_handled: + new_request_count += 1 + + self._assumed_total_count += new_request_count + + return api_response @override async def get_request(self, request_id: str) -> Request | None: @@ -288,6 +349,10 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | processed_request = await self._update_request(request) processed_request.unique_key = request.unique_key + # Update assumed handled count if this wasn't already handled + if not processed_request.was_already_handled: + self._assumed_handled_count += 1 + # Update the cache with the handled request cache_key = unique_key_to_request_id(request.unique_key) self._cache_request( @@ -320,11 +385,21 @@ async def reclaim_request( Returns: Information about the queue operation. `None` if the given request was not in progress. """ + # Check if the request was marked as handled and clear it. When reclaiming, + # we want to put the request back for processing. + if request.was_already_handled: + request.handled_at = None + try: - # Update the request in the API + # Update the request in the API. processed_request = await self._update_request(request, forefront=forefront) processed_request.unique_key = request.unique_key + # If the request was previously handled, decrement our handled count since + # we're putting it back for processing. 
+ if request.was_already_handled and not processed_request.was_already_handled: + self._assumed_handled_count -= 1 + # Update the cache cache_key = unique_key_to_request_id(request.unique_key) self._cache_request( From a8881dd3684a185c836d89bd9b29da10a3f9c33c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Sun, 3 Aug 2025 11:16:42 +0200 Subject: [PATCH 38/44] httpx vs impit --- .github/workflows/run_code_checks.yaml | 1 - src/apify/request_loaders/_apify_request_list.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run_code_checks.yaml b/.github/workflows/run_code_checks.yaml index 2fe95637..4323b479 100644 --- a/.github/workflows/run_code_checks.yaml +++ b/.github/workflows/run_code_checks.yaml @@ -36,7 +36,6 @@ jobs: integration_tests: name: Integration tests - needs: [lint_check, type_check, unit_tests] uses: apify/workflows/.github/workflows/python_integration_tests.yaml@main secrets: inherit with: diff --git a/src/apify/request_loaders/_apify_request_list.py b/src/apify/request_loaders/_apify_request_list.py index 3524153e..272defed 100644 --- a/src/apify/request_loaders/_apify_request_list.py +++ b/src/apify/request_loaders/_apify_request_list.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, Field, TypeAdapter from crawlee._types import HttpMethod -from crawlee.http_clients import HttpClient, HttpxHttpClient +from crawlee.http_clients import HttpClient, ImpitHttpClient from crawlee.request_loaders import RequestList from apify import Request @@ -80,7 +80,7 @@ async def _create_request_list( name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None ) -> ApifyRequestList: if not http_client: - http_client = HttpxHttpClient() + http_client = ImpitHttpClient() url_inputs = url_input_adapter.validate_python(request_list_sources_input) From 89e572eed60b99b23ba9154ef7f0835076b6c83f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 5 Aug 2025 11:55:43 +0200 Subject: [PATCH 39/44] rm broken crawlers integration tests --- .../integration/actor_source_base/Dockerfile | 2 +- .../actor_source_base/requirements.txt | 2 - .../test_crawlers_with_storages.py | 111 ------------------ 3 files changed, 1 insertion(+), 114 deletions(-) delete mode 100644 tests/integration/test_crawlers_with_storages.py diff --git a/tests/integration/actor_source_base/Dockerfile b/tests/integration/actor_source_base/Dockerfile index 1e5df612..194a712a 100644 --- a/tests/integration/actor_source_base/Dockerfile +++ b/tests/integration/actor_source_base/Dockerfile @@ -16,4 +16,4 @@ RUN echo "Python version:" \ && echo "All installed Python packages:" \ && pip freeze -CMD ["sh", "-c", "python server.py & python -m src"] +CMD ["sh", "-c", "python -m src"] diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index fe77c2dc..0df1ff38 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,4 +1,2 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER -uvicorn[standard] -crawlee[parsel] diff --git a/tests/integration/test_crawlers_with_storages.py b/tests/integration/test_crawlers_with_storages.py deleted file mode 100644 index cb1f7e2b..00000000 --- a/tests/integration/test_crawlers_with_storages.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from .conftest import 
MakeActorFunction, RunActorFunction - - -async def test_actor_on_platform_max_crawl_depth( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: - """Test that the actor respects max_crawl_depth.""" - - async def main() -> None: - """The crawler entry point.""" - import re - - from crawlee.crawlers import ParselCrawler, ParselCrawlingContext - - from apify import Actor - - async with Actor: - crawler = ParselCrawler(max_crawl_depth=2) - finished = [] - enqueue_pattern = re.compile(r'http://localhost:8080/2+$') - - @crawler.router.default_handler - async def default_handler(context: ParselCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - await context.enqueue_links(include=[enqueue_pattern]) - finished.append(context.request.url) - - await crawler.run(['http://localhost:8080/']) - assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22'] - - actor = await make_actor(label='crawler-max-depth', main_func=main) - run_result = await run_actor(actor) - - assert run_result.status == 'SUCCEEDED' - - -async def test_actor_on_platform_max_requests_per_crawl( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: - """Test that the actor respects max_requests_per_crawl.""" - - async def main() -> None: - """The crawler entry point.""" - from crawlee import ConcurrencySettings - from crawlee.crawlers import ParselCrawler, ParselCrawlingContext - - from apify import Actor - - async with Actor: - crawler = ParselCrawler( - max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1) - ) - finished = [] - - @crawler.router.default_handler - async def default_handler(context: ParselCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - await context.enqueue_links() - finished.append(context.request.url) - - await crawler.run(['http://localhost:8080/']) - assert len(finished) == 3 - - actor = await make_actor(label='crawler-max-requests', main_func=main) - run_result = await run_actor(actor) - - assert run_result.status == 'SUCCEEDED' - - -async def test_actor_on_platform_max_request_retries( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: - """Test that the actor respects max_request_retries.""" - - async def main() -> None: - """The crawler entry point.""" - from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext - - from apify import Actor - - async with Actor: - max_retries = 3 - crawler = ParselCrawler(max_request_retries=max_retries) - failed_counter = 0 - - @crawler.error_handler - async def error_handler(_: BasicCrawlingContext, __: Exception) -> None: - nonlocal failed_counter - failed_counter += 1 - - @crawler.router.default_handler - async def default_handler(_: ParselCrawlingContext) -> None: - raise RuntimeError('Some error') - - await crawler.run(['http://localhost:8080/']) - assert failed_counter == max_retries, f'{failed_counter=}' - - actor = await make_actor(label='crawler-max-retries', main_func=main) - run_result = await run_actor(actor) - - assert run_result.status == 'SUCCEEDED' From ae3044e3e7d6dcdd05ee7600bb98ff26b20ef08c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 5 Aug 2025 15:11:02 +0200 Subject: [PATCH 40/44] Try to patch the integration tests for the crawlee branch --- .../integration/actor_source_base/Dockerfile | 2 +- .../actor_source_base/requirements.txt | 2 
+ .../test_crawlers_with_storages.py | 111 ++++++++++++++++++ 3 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_crawlers_with_storages.py diff --git a/tests/integration/actor_source_base/Dockerfile b/tests/integration/actor_source_base/Dockerfile index 194a712a..1e5df612 100644 --- a/tests/integration/actor_source_base/Dockerfile +++ b/tests/integration/actor_source_base/Dockerfile @@ -16,4 +16,4 @@ RUN echo "Python version:" \ && echo "All installed Python packages:" \ && pip freeze -CMD ["sh", "-c", "python -m src"] +CMD ["sh", "-c", "python server.py & python -m src"] diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index 0df1ff38..66a782ba 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,2 +1,4 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER +uvicorn[standard] +crawlee[parsel] @ git+https://github.com/apify/crawlee-python.git@master diff --git a/tests/integration/test_crawlers_with_storages.py b/tests/integration/test_crawlers_with_storages.py new file mode 100644 index 00000000..cb1f7e2b --- /dev/null +++ b/tests/integration/test_crawlers_with_storages.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .conftest import MakeActorFunction, RunActorFunction + + +async def test_actor_on_platform_max_crawl_depth( + make_actor: MakeActorFunction, + run_actor: RunActorFunction, +) -> None: + """Test that the actor respects max_crawl_depth.""" + + async def main() -> None: + """The crawler entry point.""" + import re + + from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + from apify import Actor + + async with Actor: + crawler = ParselCrawler(max_crawl_depth=2) + finished = [] + enqueue_pattern = re.compile(r'http://localhost:8080/2+$') + + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + await context.enqueue_links(include=[enqueue_pattern]) + finished.append(context.request.url) + + await crawler.run(['http://localhost:8080/']) + assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22'] + + actor = await make_actor(label='crawler-max-depth', main_func=main) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' + + +async def test_actor_on_platform_max_requests_per_crawl( + make_actor: MakeActorFunction, + run_actor: RunActorFunction, +) -> None: + """Test that the actor respects max_requests_per_crawl.""" + + async def main() -> None: + """The crawler entry point.""" + from crawlee import ConcurrencySettings + from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + from apify import Actor + + async with Actor: + crawler = ParselCrawler( + max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1) + ) + finished = [] + + @crawler.router.default_handler + async def default_handler(context: ParselCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + await context.enqueue_links() + finished.append(context.request.url) + + await crawler.run(['http://localhost:8080/']) + assert len(finished) == 3 + + actor = await 
make_actor(label='crawler-max-requests', main_func=main) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' + + +async def test_actor_on_platform_max_request_retries( + make_actor: MakeActorFunction, + run_actor: RunActorFunction, +) -> None: + """Test that the actor respects max_request_retries.""" + + async def main() -> None: + """The crawler entry point.""" + from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext + + from apify import Actor + + async with Actor: + max_retries = 3 + crawler = ParselCrawler(max_request_retries=max_retries) + failed_counter = 0 + + @crawler.error_handler + async def error_handler(_: BasicCrawlingContext, __: Exception) -> None: + nonlocal failed_counter + failed_counter += 1 + + @crawler.router.default_handler + async def default_handler(_: ParselCrawlingContext) -> None: + raise RuntimeError('Some error') + + await crawler.run(['http://localhost:8080/']) + assert failed_counter == max_retries, f'{failed_counter=}' + + actor = await make_actor(label='crawler-max-retries', main_func=main) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' From 6c959572b63b32fc74eaf239fbfa869764207656 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 11 Aug 2025 16:37:06 +0200 Subject: [PATCH 41/44] Resolve #540 integration test --- tests/integration/test_crawlers_with_storages.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/test_crawlers_with_storages.py b/tests/integration/test_crawlers_with_storages.py index 3dd32707..cb1f7e2b 100644 --- a/tests/integration/test_crawlers_with_storages.py +++ b/tests/integration/test_crawlers_with_storages.py @@ -2,8 +2,6 @@ from typing import TYPE_CHECKING -import pytest - if TYPE_CHECKING: from .conftest import MakeActorFunction, RunActorFunction @@ -78,7 +76,6 @@ async def default_handler(context: ParselCrawlingContext) -> None: assert run_result.status == 'SUCCEEDED' -@pytest.mark.skip(reason='Sometimes crawler does not respect max_request_retries argument, see issue #540') async def test_actor_on_platform_max_request_retries( make_actor: MakeActorFunction, run_actor: RunActorFunction, From 8b7fc771aebb18608224e6021ac9ce16c65f2612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Tue, 12 Aug 2025 08:35:33 +0200 Subject: [PATCH 42/44] fix: Suggest solving the forefront handling of reclaimed requests (#537) Drop forefront info from local chache as it is unreliable and complicates the flow. Only the platform knows the real forefront, do not try to keep imperfect copy of it locally and rather design the system to work without being aware of the cached forefront. 
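In practice this means the reclaim path leans entirely on the platform for ordering. The following usage-level sketch assumes only the public `Actor.open_request_queue`, `fetch_next_request`, `reclaim_request` and `mark_request_as_handled` calls exercised by the integration tests in this series; it illustrates the intended behaviour and is not part of the patch.

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        rq = await Actor.open_request_queue()

        while request := await rq.fetch_next_request():
            try:
                ...  # process the request; may raise
            except Exception:
                # Return the request to the head of the queue. Its position is
                # resolved by the platform; the client no longer keeps a local
                # 'forefront' flag for it in the cache.
                await rq.reclaim_request(request, forefront=True)
            else:
                await rq.mark_request_as_handled(request)
```
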
--- src/apify/storage_clients/_apify/_models.py | 3 - .../_apify/_request_queue_client.py | 55 ++++++------------- 2 files changed, 16 insertions(+), 42 deletions(-) diff --git a/src/apify/storage_clients/_apify/_models.py b/src/apify/storage_clients/_apify/_models.py index d41e33b2..df981121 100644 --- a/src/apify/storage_clients/_apify/_models.py +++ b/src/apify/storage_clients/_apify/_models.py @@ -105,6 +105,3 @@ class CachedRequest(BaseModel): lock_expires_at: datetime | None = None """The expiration time of the lock on the request.""" - - forefront: bool = False - """Whether the request was added to the forefront of the queue.""" diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 8a6dfa89..95af78a1 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -358,7 +358,6 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | self._cache_request( cache_key, processed_request, - forefront=False, hydrated_request=request, ) except Exception as exc: @@ -405,7 +404,6 @@ async def reclaim_request( self._cache_request( cache_key, processed_request, - forefront=forefront, hydrated_request=request, ) @@ -463,9 +461,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: # Try to prolong the lock if it's expired try: lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) - response = await self._prolong_request_lock( - request_id, forefront=cached_entry.forefront, lock_secs=lock_secs - ) + response = await self._prolong_request_lock(request_id, lock_secs=lock_secs) cached_entry.lock_expires_at = response.lock_expires_at except Exception: # If prolonging the lock fails, we lost the request @@ -478,7 +474,7 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: try: # Try to acquire or prolong the lock lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) - await self._prolong_request_lock(request_id, forefront=False, lock_secs=lock_secs) + await self._prolong_request_lock(request_id, lock_secs=lock_secs) # Fetch the request data request = await self.get_request(request_id) @@ -498,7 +494,6 @@ async def _get_or_hydrate_request(self, request_id: str) -> Request | None: was_already_present=True, was_already_handled=request.handled_at is not None, ), - forefront=False, hydrated_request=request, ) except Exception as exc: @@ -569,6 +564,12 @@ async def _list_head( lock_time=lock_time, ) + leftover_buffer = list[str]() + if self._should_check_for_forefront_requests: + leftover_buffer = list(self._queue_head) + self._queue_head.clear() + self._should_check_for_forefront_requests = False + # Otherwise fetch from API lock_time = lock_time or self._DEFAULT_LOCK_TIME lock_secs = int(lock_time.total_seconds()) @@ -581,15 +582,6 @@ async def _list_head( # Update the queue head cache self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) - # Clear current queue head if we're checking for forefront requests - if self._should_check_for_forefront_requests: - self._queue_head.clear() - self._should_check_for_forefront_requests = False - - # Process and cache the requests - head_id_buffer = list[str]() - forefront_head_id_buffer = list[str]() - for request_data in response.get('items', []): request = Request.model_validate(request_data) @@ -604,36 +596,23 @@ async def _list_head( ) continue - # Check if this request was already cached and if it 
was added to forefront - cache_key = unique_key_to_request_id(request.unique_key) - cached_request = self._requests_cache.get(cache_key) - forefront = cached_request.forefront if cached_request else False - - # Add to appropriate buffer based on forefront flag - if forefront: - forefront_head_id_buffer.insert(0, request.id) - else: - head_id_buffer.append(request.id) - # Cache the request self._cache_request( - cache_key, + unique_key_to_request_id(request.unique_key), ProcessedRequest( id=request.id, unique_key=request.unique_key, was_already_present=True, was_already_handled=False, ), - forefront=forefront, hydrated_request=request, ) - # Update the queue head deque - for request_id in head_id_buffer: - self._queue_head.append(request_id) + self._queue_head.append(request.id) - for request_id in forefront_head_id_buffer: - self._queue_head.appendleft(request_id) + for leftover_request_id in leftover_buffer: + # After adding new requests to the forefront, any existing leftover locked request is kept in the end. + self._queue_head.append(leftover_request_id) return RequestQueueHead.model_validate(response) @@ -641,14 +620,12 @@ async def _prolong_request_lock( self, request_id: str, *, - forefront: bool = False, lock_secs: int, ) -> ProlongRequestLockResponse: """Prolong the lock on a specific request in the queue. Args: request_id: The identifier of the request whose lock is to be prolonged. - forefront: Whether to put the request in the beginning or the end of the queue after lock expires. lock_secs: The additional amount of time, in seconds, that the request will remain locked. Returns: @@ -656,7 +633,9 @@ async def _prolong_request_lock( """ response = await self._api_client.prolong_request_lock( request_id=request_id, - forefront=forefront, + # All requests reaching this code were the tip of the queue at the moment when they were fetched, + # so if their lock expires, they should be put back to the forefront as their handling is long overdue. + forefront=True, lock_secs=lock_secs, ) @@ -703,7 +682,6 @@ def _cache_request( cache_key: str, processed_request: ProcessedRequest, *, - forefront: bool, hydrated_request: Request | None = None, ) -> None: """Cache a request for future use. 
@@ -719,5 +697,4 @@ def _cache_request( was_already_handled=processed_request.was_already_handled, hydrated=hydrated_request, lock_expires_at=None, - forefront=forefront, ) From 76eea4715f1b341638af4806ea87bebdf7a8adcf Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 12 Aug 2025 08:55:55 +0200 Subject: [PATCH 43/44] test_actor_on_platform_max_request_retries is still flaky --- tests/integration/test_crawlers_with_storages.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/test_crawlers_with_storages.py b/tests/integration/test_crawlers_with_storages.py index cb1f7e2b..3dd32707 100644 --- a/tests/integration/test_crawlers_with_storages.py +++ b/tests/integration/test_crawlers_with_storages.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING +import pytest + if TYPE_CHECKING: from .conftest import MakeActorFunction, RunActorFunction @@ -76,6 +78,7 @@ async def default_handler(context: ParselCrawlingContext) -> None: assert run_result.status == 'SUCCEEDED' +@pytest.mark.skip(reason='Sometimes crawler does not respect max_request_retries argument, see issue #540') async def test_actor_on_platform_max_request_retries( make_actor: MakeActorFunction, run_actor: RunActorFunction, From 9d5e86e4a9a92c4456c06cd7acf6a475abc6f598 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Tue, 12 Aug 2025 16:53:15 +0200 Subject: [PATCH 44/44] fix: Try to solve race conditions (#544) - Closes: #540 --- .../_apify/_request_queue_client.py | 98 ++++++++++--------- tests/integration/test_actor_request_queue.py | 27 ++--- .../test_crawlers_with_storages.py | 4 +- 3 files changed, 69 insertions(+), 60 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 95af78a1..519cd95a 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio from collections import deque from datetime import datetime, timedelta, timezone from logging import getLogger @@ -84,6 +85,9 @@ def __init__( self._assumed_handled_count = 0 """The number of requests we assume have been handled (tracked manually for this instance).""" + self._fetch_lock = asyncio.Lock() + """Fetch lock to minimize race conditions when communicating with API.""" + @override async def get_metadata(self) -> RequestQueueMetadata: total_count = self._initial_total_count + self._assumed_total_count @@ -290,15 +294,17 @@ async def fetch_next_request(self) -> Request | None: Returns: The request or `None` if there are no more pending requests. """ - # Ensure the queue head has requests if available - await self._ensure_head_is_non_empty() + # Ensure the queue head has requests if available. Fetching the head with lock to prevent race conditions. 
+ async with self._fetch_lock: + await self._ensure_head_is_non_empty() - # If queue head is empty after ensuring, there are no requests - if not self._queue_head: - return None + # If queue head is empty after ensuring, there are no requests + if not self._queue_head: + return None + + # Get the next request ID from the queue head + next_request_id = self._queue_head.popleft() - # Get the next request ID from the queue head - next_request_id = self._queue_head.popleft() request = await self._get_or_hydrate_request(next_request_id) # Handle potential inconsistency where request might not be in the main table yet @@ -344,6 +350,8 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | if request.handled_at is None: request.handled_at = datetime.now(tz=timezone.utc) + if cached_request := self._requests_cache[request.id]: + cached_request.was_already_handled = request.was_already_handled try: # Update the request in the API processed_request = await self._update_request(request) @@ -389,39 +397,41 @@ async def reclaim_request( if request.was_already_handled: request.handled_at = None - try: - # Update the request in the API. - processed_request = await self._update_request(request, forefront=forefront) - processed_request.unique_key = request.unique_key - - # If the request was previously handled, decrement our handled count since - # we're putting it back for processing. - if request.was_already_handled and not processed_request.was_already_handled: - self._assumed_handled_count -= 1 - - # Update the cache - cache_key = unique_key_to_request_id(request.unique_key) - self._cache_request( - cache_key, - processed_request, - hydrated_request=request, - ) + # Reclaim with lock to prevent race conditions that could lead to double processing of the same request. + async with self._fetch_lock: + try: + # Update the request in the API. + processed_request = await self._update_request(request, forefront=forefront) + processed_request.unique_key = request.unique_key + + # If the request was previously handled, decrement our handled count since + # we're putting it back for processing. 
+ if request.was_already_handled and not processed_request.was_already_handled: + self._assumed_handled_count -= 1 + + # Update the cache + cache_key = unique_key_to_request_id(request.unique_key) + self._cache_request( + cache_key, + processed_request, + hydrated_request=request, + ) - # If we're adding to the forefront, we need to check for forefront requests - # in the next list_head call - if forefront: - self._should_check_for_forefront_requests = True + # If we're adding to the forefront, we need to check for forefront requests + # in the next list_head call + if forefront: + self._should_check_for_forefront_requests = True - # Try to release the lock on the request - try: - await self._delete_request_lock(request.id, forefront=forefront) - except Exception as err: - logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err) - except Exception as exc: - logger.debug(f'Error reclaiming request {request.id}: {exc!s}') - return None - else: - return processed_request + # Try to release the lock on the request + try: + await self._delete_request_lock(request.id, forefront=forefront) + except Exception as err: + logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err) + except Exception as exc: + logger.debug(f'Error reclaiming request {request.id}: {exc!s}') + return None + else: + return processed_request @override async def is_empty(self) -> bool: @@ -430,9 +440,11 @@ async def is_empty(self) -> bool: Returns: True if the queue is empty, False otherwise. """ - head = await self._list_head(limit=1, lock_time=None) - - return len(head.items) == 0 and not self._queue_has_locked_requests + # Check _list_head and self._queue_has_locked_requests with lock to make sure they are consistent. + # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition. + async with self._fetch_lock: + head = await self._list_head(limit=1, lock_time=None) + return len(head.items) == 0 and not self._queue_has_locked_requests async def _ensure_head_is_non_empty(self) -> None: """Ensure that the queue head has requests if they are available in the queue.""" @@ -545,7 +557,6 @@ async def _list_head( # Return from cache if available and we're not checking for new forefront requests if self._queue_head and not self._should_check_for_forefront_requests: logger.debug(f'Using cached queue head with {len(self._queue_head)} requests') - # Create a list of requests from the cached queue head items = [] for request_id in list(self._queue_head)[:limit]: @@ -563,7 +574,6 @@ async def _list_head( queue_has_locked_requests=self._queue_has_locked_requests, lock_time=lock_time, ) - leftover_buffer = list[str]() if self._should_check_for_forefront_requests: leftover_buffer = list(self._queue_head) @@ -607,13 +617,11 @@ async def _list_head( ), hydrated_request=request, ) - self._queue_head.append(request.id) for leftover_request_id in leftover_buffer: # After adding new requests to the forefront, any existing leftover locked request is kept in the end. 
self._queue_head.append(leftover_request_id) - return RequestQueueHead.model_validate(response) async def _prolong_request_lock( diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index d4730b00..64a846b5 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -98,15 +98,18 @@ async def test_request_queue_is_finished( request_queue_name = generate_unique_resource_name('request_queue') async with Actor: - request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) - await request_queue.add_request(Request.from_url('http://example.com')) - assert not await request_queue.is_finished() - - request = await request_queue.fetch_next_request() - assert request is not None - assert not await request_queue.is_finished(), ( - 'RequestQueue should not be finished unless the request is marked as handled.' - ) - - await request_queue.mark_request_as_handled(request) - assert await request_queue.is_finished() + try: + request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + await request_queue.add_request(Request.from_url('http://example.com')) + assert not await request_queue.is_finished() + + request = await request_queue.fetch_next_request() + assert request is not None + assert not await request_queue.is_finished(), ( + 'RequestQueue should not be finished unless the request is marked as handled.' + ) + + await request_queue.mark_request_as_handled(request) + assert await request_queue.is_finished() + finally: + await request_queue.drop() diff --git a/tests/integration/test_crawlers_with_storages.py b/tests/integration/test_crawlers_with_storages.py index 3dd32707..a2ba1e4d 100644 --- a/tests/integration/test_crawlers_with_storages.py +++ b/tests/integration/test_crawlers_with_storages.py @@ -2,8 +2,6 @@ from typing import TYPE_CHECKING -import pytest - if TYPE_CHECKING: from .conftest import MakeActorFunction, RunActorFunction @@ -78,7 +76,6 @@ async def default_handler(context: ParselCrawlingContext) -> None: assert run_result.status == 'SUCCEEDED' -@pytest.mark.skip(reason='Sometimes crawler does not respect max_request_retries argument, see issue #540') async def test_actor_on_platform_max_request_retries( make_actor: MakeActorFunction, run_actor: RunActorFunction, @@ -87,6 +84,7 @@ async def test_actor_on_platform_max_request_retries( async def main() -> None: """The crawler entry point.""" + from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext from apify import Actor