From 70ff5cf71c89c6ee2b18e806833b179b5450818c Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 4 Jun 2024 16:13:40 +0200 Subject: [PATCH 01/68] Update pyproject.toml --- pyproject.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4787e547..d5314d95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "apify" -version = "1.7.2" +version = "2.0.0" description = "Apify SDK for Python" readme = "README.md" license = { text = "Apache Software License" } @@ -20,7 +20,7 @@ classifiers = [ "Topic :: Software Development :: Libraries", ] -requires-python = ">=3.8" +requires-python = ">=3.9" # We use inclusive ordered comparison clause for non-Apify packages intentionally in order to enhance the Apify SDK's # compatibility with a wide range of external packages. This decision was discussed in detail in the following PR: @@ -31,6 +31,7 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", + "crawlee >= 0.0.5", "cryptography >= 39.0.0", "httpx >= 0.24.0", "psutil >= 5.9.0", @@ -153,3 +154,6 @@ known-local-folder = ["apify"] [tool.ruff.lint.pydocstyle] convention = "google" + +[tool.basedpyright] +typeCheckingMode = "standard" From 12d3a09b907f7f04f8d227e3881ede24c79d458d Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 4 Jun 2024 22:24:13 +0200 Subject: [PATCH 02/68] Use storage code from crawlee, add ApifyStorageClient --- pyproject.toml | 2 + src/apify/_memory_storage/__init__.py | 3 - .../_memory_storage/file_storage_utils.py | 71 --- .../_memory_storage/memory_storage_client.py | 219 ------- .../resource_clients/__init__.py | 19 - .../resource_clients/base_resource_client.py | 141 ---- .../base_resource_collection_client.py | 114 ---- .../resource_clients/dataset.py | 452 ------------- .../resource_clients/dataset_collection.py | 48 -- .../resource_clients/key_value_store.py | 533 ---------------- .../key_value_store_collection.py | 48 -- .../resource_clients/request_queue.py | 466 -------------- .../request_queue_collection.py | 48 -- src/apify/_utils.py | 36 +- src/apify/actor.py | 72 +-- src/apify/apify_storage_client/__init__.py | 0 .../apify_storage_client.py | 52 ++ .../apify_storage_client/dataset_client.py | 183 ++++++ .../dataset_collection_client.py | 50 ++ .../key_value_store_client.py | 74 +++ .../key_value_store_collection_client.py | 43 ++ .../request_queue_client.py | 146 +++++ .../request_queue_collection_client.py | 49 ++ src/apify/storages/__init__.py | 15 +- src/apify/storages/base_storage.py | 181 ------ src/apify/storages/dataset.py | 494 -------------- src/apify/storages/key_value_store.py | 257 -------- src/apify/storages/request_queue.py | 602 ------------------ src/apify/storages/storage_client_manager.py | 72 --- tests/integration/conftest.py | 3 +- .../resource_clients/test_key_value_store.py | 2 +- website/generate_module_shortcuts.py | 2 +- 32 files changed, 654 insertions(+), 3843 deletions(-) delete mode 100644 src/apify/_memory_storage/__init__.py delete mode 100644 src/apify/_memory_storage/file_storage_utils.py delete mode 100644 src/apify/_memory_storage/memory_storage_client.py delete mode 100644 src/apify/_memory_storage/resource_clients/__init__.py delete mode 100644 src/apify/_memory_storage/resource_clients/base_resource_client.py delete mode 100644 src/apify/_memory_storage/resource_clients/base_resource_collection_client.py delete mode 100644 src/apify/_memory_storage/resource_clients/dataset.py delete mode 100644 
src/apify/_memory_storage/resource_clients/dataset_collection.py delete mode 100644 src/apify/_memory_storage/resource_clients/key_value_store.py delete mode 100644 src/apify/_memory_storage/resource_clients/key_value_store_collection.py delete mode 100644 src/apify/_memory_storage/resource_clients/request_queue.py delete mode 100644 src/apify/_memory_storage/resource_clients/request_queue_collection.py create mode 100644 src/apify/apify_storage_client/__init__.py create mode 100644 src/apify/apify_storage_client/apify_storage_client.py create mode 100644 src/apify/apify_storage_client/dataset_client.py create mode 100644 src/apify/apify_storage_client/dataset_collection_client.py create mode 100644 src/apify/apify_storage_client/key_value_store_client.py create mode 100644 src/apify/apify_storage_client/key_value_store_collection_client.py create mode 100644 src/apify/apify_storage_client/request_queue_client.py create mode 100644 src/apify/apify_storage_client/request_queue_collection_client.py delete mode 100644 src/apify/storages/base_storage.py delete mode 100644 src/apify/storages/dataset.py delete mode 100644 src/apify/storages/key_value_store.py delete mode 100644 src/apify/storages/request_queue.py delete mode 100644 src/apify/storages/storage_client_manager.py diff --git a/pyproject.toml b/pyproject.toml index d5314d95..307174de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,12 +89,14 @@ apify = ["py.typed"] line-length = 150 select = ["ALL"] ignore = [ + "ANN101", # Missing type annotation for `self` in method "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename} "BLE001", # Do not catch blind exception "C901", # `{name}` is too complex "COM812", # This rule may cause conflicts when used with the formatter "D100", # Missing docstring in public module "D104", # Missing docstring in public package + "D107", # Missing docstring in `__init__` "EM", # flake8-errmsg "G004", # Logging statement uses f-string "ISC001", # This rule may cause conflicts when used with the formatter diff --git a/src/apify/_memory_storage/__init__.py b/src/apify/_memory_storage/__init__.py deleted file mode 100644 index 6b51836d..00000000 --- a/src/apify/_memory_storage/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .memory_storage_client import MemoryStorageClient - -__all__ = ['MemoryStorageClient'] diff --git a/src/apify/_memory_storage/file_storage_utils.py b/src/apify/_memory_storage/file_storage_utils.py deleted file mode 100644 index 64645001..00000000 --- a/src/apify/_memory_storage/file_storage_utils.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import annotations - -import os - -import aiofiles -from aiofiles.os import makedirs -from apify_shared.utils import json_dumps - -from apify._utils import force_remove - - -async def update_metadata(*, data: dict, entity_directory: str, write_metadata: bool) -> None: - # Skip writing the actual metadata file. 
This is done after ensuring the directory exists so we have the directory present - if not write_metadata: - return - - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - # Write the metadata to the file - file_path = os.path.join(entity_directory, '__metadata__.json') - async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json_dumps(data).encode('utf-8')) - - -async def _update_dataset_items( - *, - data: list[tuple[str, dict]], - entity_directory: str, - persist_storage: bool, -) -> None: - # Skip writing files to the disk if the client has the option set to false - if not persist_storage: - return - - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - # Save all the new items to the disk - for idx, item in data: - file_path = os.path.join(entity_directory, f'{idx}.json') - async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json_dumps(item).encode('utf-8')) - - -async def update_request_queue_item( - *, - request_id: str, - request: dict, - entity_directory: str, - persist_storage: bool, -) -> None: - # Skip writing files to the disk if the client has the option set to false - if not persist_storage: - return - - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - # Write the request to the file - file_path = os.path.join(entity_directory, f'{request_id}.json') - async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json_dumps(request).encode('utf-8')) - - -async def delete_request(*, request_id: str, entity_directory: str) -> None: - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - file_path = os.path.join(entity_directory, f'{request_id}.json') - await force_remove(file_path) diff --git a/src/apify/_memory_storage/memory_storage_client.py b/src/apify/_memory_storage/memory_storage_client.py deleted file mode 100644 index ed55cc46..00000000 --- a/src/apify/_memory_storage/memory_storage_client.py +++ /dev/null @@ -1,219 +0,0 @@ -from __future__ import annotations - -import asyncio -import contextlib -import os -from pathlib import Path - -import aioshutil -from aiofiles import ospath -from aiofiles.os import rename, scandir -from apify_shared.consts import ApifyEnvVars -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.dataset import DatasetClient -from apify._memory_storage.resource_clients.dataset_collection import DatasetCollectionClient -from apify._memory_storage.resource_clients.key_value_store import KeyValueStoreClient -from apify._memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient -from apify._memory_storage.resource_clients.request_queue import RequestQueueClient -from apify._memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient -from apify._utils import maybe_parse_bool - -""" -Memory storage emulates data storages that are available on the Apify platform. -Specifically, it emulates clients for datasets, key-value stores and request queues. -The data are held in-memory and persisted locally if `persist_storage` is True. -The metadata of the storages is also persisted if `write_metadata` is True. 
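For orientation, a minimal sketch of how this emulated storage client was constructed and used before its removal, based only on the constructor and accessor methods shown further down in this diff; the directory path and flag values are illustrative assumptions:

import asyncio

# This import reflects the pre-patch layout (the module deleted by this commit).
from apify._memory_storage import MemoryStorageClient

storage_client = MemoryStorageClient(
    local_data_directory='./storage',  # where data lands when persist_storage is True
    persist_storage=True,              # keep data on disk instead of memory only
    write_metadata=True,               # also write __metadata__.json for each storage
)

default_dataset = storage_client.dataset('default')        # sub-client for a single dataset
default_kvs = storage_client.key_value_store('default')    # sub-client for a single key-value store
default_queue = storage_client.request_queue('default')    # sub-client for a single request queue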
-""" - - -@ignore_docs -class MemoryStorageClient: - """Class representing an in-memory storage.""" - - _local_data_directory: str - _datasets_directory: str - _key_value_stores_directory: str - _request_queues_directory: str - _write_metadata: bool - _persist_storage: bool - _datasets_handled: list[DatasetClient] - _key_value_stores_handled: list[KeyValueStoreClient] - _request_queues_handled: list[RequestQueueClient] - - _purged_on_start: bool = False - _purge_lock: asyncio.Lock - - """Indicates whether a purge was already performed on this instance""" - - def __init__( - self: MemoryStorageClient, - *, - local_data_directory: str | None = None, - write_metadata: bool | None = None, - persist_storage: bool | None = None, - ) -> None: - """Initialize the MemoryStorageClient. - - Args: - local_data_directory (str, optional): A local directory where all data will be persisted - persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory - write_metadata (bool, optional): Whether to persist metadata of the storages as well - """ - self._local_data_directory = local_data_directory or os.getenv(ApifyEnvVars.LOCAL_STORAGE_DIR) or './storage' - self._datasets_directory = os.path.join(self._local_data_directory, 'datasets') - self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores') - self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues') - self._write_metadata = write_metadata if write_metadata is not None else '*' in os.getenv('DEBUG', '') - self._persist_storage = persist_storage if persist_storage is not None else maybe_parse_bool(os.getenv(ApifyEnvVars.PERSIST_STORAGE, 'true')) - self._datasets_handled = [] - self._key_value_stores_handled = [] - self._request_queues_handled = [] - self._purge_lock = asyncio.Lock() - - def datasets(self: MemoryStorageClient) -> DatasetCollectionClient: - """Retrieve the sub-client for manipulating datasets.""" - return DatasetCollectionClient(base_storage_directory=self._datasets_directory, memory_storage_client=self) - - def dataset(self: MemoryStorageClient, dataset_id: str) -> DatasetClient: - """Retrieve the sub-client for manipulating a single dataset. - - Args: - dataset_id (str): ID of the dataset to be manipulated - """ - return DatasetClient(base_storage_directory=self._datasets_directory, memory_storage_client=self, id=dataset_id) - - def key_value_stores(self: MemoryStorageClient) -> KeyValueStoreCollectionClient: - """Retrieve the sub-client for manipulating key-value stores.""" - return KeyValueStoreCollectionClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self) - - def key_value_store(self: MemoryStorageClient, key_value_store_id: str) -> KeyValueStoreClient: - """Retrieve the sub-client for manipulating a single key-value store. 
- - Args: - key_value_store_id (str): ID of the key-value store to be manipulated - """ - return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self, id=key_value_store_id) - - def request_queues(self: MemoryStorageClient) -> RequestQueueCollectionClient: - """Retrieve the sub-client for manipulating request queues.""" - return RequestQueueCollectionClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self) - - def request_queue( - self: MemoryStorageClient, - request_queue_id: str, - *, - client_key: str | None = None, # noqa: ARG002 - ) -> RequestQueueClient: - """Retrieve the sub-client for manipulating a single request queue. - - Args: - request_queue_id (str): ID of the request queue to be manipulated - client_key (str): A unique identifier of the client accessing the request queue - """ - return RequestQueueClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self, id=request_queue_id) - - async def _purge_on_start(self: MemoryStorageClient) -> None: - # Optimistic, non-blocking check - if self._purged_on_start is True: - return - - async with self._purge_lock: - # Another check under the lock just to be sure - if self._purged_on_start is True: - return # type: ignore[unreachable] # Mypy doesn't understand that the _purged_on_start can change while we're getting the async lock - - await self._purge() - self._purged_on_start = True - - async def _purge(self: MemoryStorageClient) -> None: - """Clean up the default storage directories before the run starts. - - Specifically, `purge` cleans up: - - local directory containing the default dataset; - - all records from the default key-value store in the local directory, except for the "INPUT" key; - - local directory containing the default request queue. 
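The purge described here was driven through the internal _purge_on_start hook; a minimal sketch of that call, assuming the client instance from the earlier sketch (the directory path is an assumption):

import asyncio
from apify._memory_storage import MemoryStorageClient

async def main() -> None:
    storage_client = MemoryStorageClient(local_data_directory='./storage')
    # Clears the default dataset and request queue directories and removes all
    # default key-value store records except the INPUT* keys; guarded by an
    # asyncio lock, so repeated calls purge at most once per client instance.
    await storage_client._purge_on_start()

asyncio.run(main())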
- """ - # Key-value stores - if await ospath.exists(self._key_value_stores_directory): - key_value_store_folders = await scandir(self._key_value_stores_directory) - for key_value_store_folder in key_value_store_folders: - if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'): - await self._batch_remove_files(key_value_store_folder.path) - elif key_value_store_folder.name == 'default': - await self._handle_default_key_value_store(key_value_store_folder.path) - - # Datasets - if await ospath.exists(self._datasets_directory): - dataset_folders = await scandir(self._datasets_directory) - for dataset_folder in dataset_folders: - if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'): - await self._batch_remove_files(dataset_folder.path) - # Request queues - if await ospath.exists(self._request_queues_directory): - request_queue_folders = await scandir(self._request_queues_directory) - for request_queue_folder in request_queue_folders: - if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'): - await self._batch_remove_files(request_queue_folder.path) - - async def _handle_default_key_value_store(self: MemoryStorageClient, folder: str) -> None: - """Remove everything from the default key-value store folder except `possible_input_keys`.""" - folder_exists = await ospath.exists(folder) - temporary_path = os.path.normpath(os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__')) - - # For optimization, we want to only attempt to copy a few files from the default key-value store - possible_input_keys = [ - 'INPUT', - 'INPUT.json', - 'INPUT.bin', - 'INPUT.txt', - ] - - if folder_exists: - # Create a temporary folder to save important files in - Path(temporary_path).mkdir(parents=True, exist_ok=True) - - # Go through each file and save the ones that are important - for entity in possible_input_keys: - original_file_path = os.path.join(folder, entity) - temp_file_path = os.path.join(temporary_path, entity) - with contextlib.suppress(Exception): - await rename(original_file_path, temp_file_path) - - # Remove the original folder and all its content - counter = 0 - temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) - done = False - try: - while not done: - await rename(folder, temp_path_for_old_folder) - done = True - except Exception: - counter += 1 - temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) - - # Replace the temporary folder with the original folder - await rename(temporary_path, folder) - - # Remove the old folder - await self._batch_remove_files(temp_path_for_old_folder) - - async def _batch_remove_files(self: MemoryStorageClient, folder: str, counter: int = 0) -> None: - folder_exists = await ospath.exists(folder) - - if folder_exists: - temporary_folder = ( - folder - if os.path.basename(folder).startswith('__APIFY_TEMPORARY_') - else os.path.normpath(os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__')) - ) - - try: - # Rename the old folder to the new one to allow background deletions - await rename(folder, temporary_folder) - except Exception: - # Folder exists already, try again with an incremented counter - return await self._batch_remove_files(folder, counter + 1) - - await aioshutil.rmtree(temporary_folder, ignore_errors=True) - return None diff --git a/src/apify/_memory_storage/resource_clients/__init__.py 
b/src/apify/_memory_storage/resource_clients/__init__.py deleted file mode 100644 index 0a79ebe3..00000000 --- a/src/apify/_memory_storage/resource_clients/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from .base_resource_client import BaseResourceClient -from .base_resource_collection_client import BaseResourceCollectionClient -from .dataset import DatasetClient -from .dataset_collection import DatasetCollectionClient -from .key_value_store import KeyValueStoreClient -from .key_value_store_collection import KeyValueStoreCollectionClient -from .request_queue import RequestQueueClient -from .request_queue_collection import RequestQueueCollectionClient - -__all__ = [ - 'BaseResourceClient', - 'BaseResourceCollectionClient', - 'DatasetClient', - 'DatasetCollectionClient', - 'KeyValueStoreClient', - 'KeyValueStoreCollectionClient', - 'RequestQueueClient', - 'RequestQueueCollectionClient', -] diff --git a/src/apify/_memory_storage/resource_clients/base_resource_client.py b/src/apify/_memory_storage/resource_clients/base_resource_client.py deleted file mode 100644 index 73dcf052..00000000 --- a/src/apify/_memory_storage/resource_clients/base_resource_client.py +++ /dev/null @@ -1,141 +0,0 @@ -from __future__ import annotations - -import json -import os -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -if TYPE_CHECKING: - from typing_extensions import Self - - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -@ignore_docs -class BaseResourceClient(ABC): - """Base class for resource clients.""" - - _id: str - _name: str | None - _resource_directory: str - - @abstractmethod - def __init__( - self: BaseResourceClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the BaseResourceClient.""" - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - async def get(self: BaseResourceClient) -> dict | None: - """Retrieve the storage. 
- - Returns: - dict, optional: The retrieved storage, or None, if it does not exist - """ - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_storages_dir(cls: type[BaseResourceClient], memory_storage_client: MemoryStorageClient) -> str: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_storage_client_cache( - cls, # noqa: ANN102 # type annotated cls does not work with Self as a return type - memory_storage_client: MemoryStorageClient, - ) -> list[Self]: - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - def _to_resource_info(self: BaseResourceClient) -> dict: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _create_from_directory( - cls, # noqa: ANN102 # type annotated cls does not work with Self as a return type - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> Self: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - def _find_or_create_client_by_id_or_name( - cls, # noqa: ANN102 # type annotated cls does not work with Self as a return type - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> Self | None: - assert id is not None or name is not None # noqa: S101 - - storage_client_cache = cls._get_storage_client_cache(memory_storage_client) - storages_dir = cls._get_storages_dir(memory_storage_client) - - # First check memory cache - found = next( - ( - storage_client - for storage_client in storage_client_cache - if storage_client._id == id or (storage_client._name and name and storage_client._name.lower() == name.lower()) - ), - None, - ) - - if found is not None: - return found - - storage_path = None - - # First try to find the storage by looking up the directory by name - if name: - possible_storage_path = os.path.join(storages_dir, name) - if os.access(possible_storage_path, os.F_OK): - storage_path = possible_storage_path - - # If it's not found, try going through the storages dir and finding it by metadata - if not storage_path and os.access(storages_dir, os.F_OK): - for entry in os.scandir(storages_dir): - if not entry.is_dir(): - continue - metadata_path = os.path.join(entry.path, '__metadata__.json') - if not os.access(metadata_path, os.F_OK): - continue - with open(metadata_path, encoding='utf-8') as metadata_file: - metadata = json.load(metadata_file) - if id and id == metadata.get('id'): - storage_path = entry.path - name = metadata.get(name) - break - if name and name == metadata.get('name'): - storage_path = entry.path - id = metadata.get(id) # noqa: A001 - break - - # As a last resort, try to check if the accessed storage is the default one, - # and the folder has no metadata - # TODO: make this respect the APIFY_DEFAULT_XXX_ID env var - # https://github.com/apify/apify-sdk-python/issues/149 - if id == 'default': - possible_storage_path = os.path.join(storages_dir, id) - if os.access(possible_storage_path, os.F_OK): - storage_path = possible_storage_path - - if not storage_path: - return None - - resource_client = cls._create_from_directory(storage_path, memory_storage_client, id, name) - - storage_client_cache.append(resource_client) - - return resource_client diff --git 
a/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py b/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py deleted file mode 100644 index 2f41876a..00000000 --- a/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +++ /dev/null @@ -1,114 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from operator import itemgetter -from typing import TYPE_CHECKING, Generic, TypeVar, cast - -from apify_shared.models import ListPage -from apify_shared.utils import ignore_docs - -from apify._memory_storage.file_storage_utils import update_metadata -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient - -if TYPE_CHECKING: - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -ResourceClientType = TypeVar('ResourceClientType', bound=BaseResourceClient, contravariant=True) # noqa: PLC0105 - - -@ignore_docs -class BaseResourceCollectionClient(ABC, Generic[ResourceClientType]): - """Base class for resource collection clients.""" - - _base_storage_directory: str - _memory_storage_client: MemoryStorageClient - - def __init__( - self: BaseResourceCollectionClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - ) -> None: - """Initialize the DatasetCollectionClient with the passed arguments.""" - self._base_storage_directory = base_storage_directory - self._memory_storage_client = memory_storage_client - - @abstractmethod - def _get_storage_client_cache(self: BaseResourceCollectionClient) -> list[ResourceClientType]: - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - def _get_resource_client_class(self: BaseResourceCollectionClient) -> type[ResourceClientType]: - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - async def list(self: BaseResourceCollectionClient) -> ListPage: - """List the available storages. - - Returns: - ListPage: The list of available storages matching the specified filters. - """ - storage_client_cache = self._get_storage_client_cache() - - items = [storage._to_resource_info() for storage in storage_client_cache] - - return ListPage( - { - 'total': len(items), - 'count': len(items), - 'offset': 0, - 'limit': len(items), - 'desc': False, - 'items': sorted(items, key=itemgetter('createdAt')), - } - ) - - @abstractmethod - async def get_or_create( - self: BaseResourceCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named storage, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the storage to retrieve or create. - schema (Dict, optional): The schema of the storage - - Returns: - dict: The retrieved or newly-created storage. 
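A short usage sketch of the collection-client pattern documented above, using only method and field names from the deleted code; the storage directory and dataset name are assumptions:

import asyncio
from apify._memory_storage import MemoryStorageClient

async def main() -> None:
    storage_client = MemoryStorageClient(local_data_directory='./storage')

    # get_or_create returns the plain resource-info dict ('id', 'name', 'createdAt', ...).
    info = await storage_client.datasets().get_or_create(name='my-results')
    print(info['id'], info['name'])

    # list() wraps the cached clients in a ListPage, sorted by 'createdAt'.
    page = await storage_client.datasets().list()
    for dataset_info in page.items:
        print(dataset_info['id'])

asyncio.run(main())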
- """ - resource_client_class = self._get_resource_client_class() - storage_client_cache = self._get_storage_client_cache() - - if name or _id: - found = resource_client_class._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, - name=name, - id=_id, - ) - if found: - resource_info = found._to_resource_info() - return cast(dict, resource_info) - - new_resource = resource_client_class( - id=_id, - name=name, - base_storage_directory=self._base_storage_directory, - memory_storage_client=self._memory_storage_client, - ) - storage_client_cache.append(new_resource) - - resource_info = new_resource._to_resource_info() - - # Write to the disk - await update_metadata( - data=resource_info, - entity_directory=new_resource._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - return cast(dict, resource_info) diff --git a/src/apify/_memory_storage/resource_clients/dataset.py b/src/apify/_memory_storage/resource_clients/dataset.py deleted file mode 100644 index f8c82655..00000000 --- a/src/apify/_memory_storage/resource_clients/dataset.py +++ /dev/null @@ -1,452 +0,0 @@ -from __future__ import annotations - -import asyncio -import json -import os -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, AsyncIterator - -import aioshutil -from apify_shared.models import ListPage -from apify_shared.utils import ignore_docs - -from apify._crypto import crypto_random_object_id -from apify._memory_storage.file_storage_utils import _update_dataset_items, update_metadata -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient -from apify._utils import force_rename, raise_on_duplicate_storage, raise_on_non_existing_storage -from apify.consts import StorageTypes - -if TYPE_CHECKING: - from apify_shared.types import JSONSerializable - - from apify._memory_storage.memory_storage_client import MemoryStorageClient - -# This is what API returns in the x-apify-pagination-limit -# header when no limit query parameter is used. -LIST_ITEMS_LIMIT = 999_999_999_999 - -# Number of characters of the dataset item file names. -# E.g.: 000000019.json - 9 digits -LOCAL_ENTRY_NAME_DIGITS = 9 - - -@ignore_docs -class DatasetClient(BaseResourceClient): - """Sub-client for manipulating a single dataset.""" - - _id: str - _resource_directory: str - _memory_storage_client: MemoryStorageClient - _name: str | None - _dataset_entries: dict[str, dict] - _created_at: datetime - _accessed_at: datetime - _modified_at: datetime - _item_count = 0 - _file_operation_lock: asyncio.Lock - - def __init__( - self: DatasetClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the DatasetClient.""" - self._id = id or crypto_random_object_id() - self._resource_directory = os.path.join(base_storage_directory, name or self._id) - self._memory_storage_client = memory_storage_client - self._name = name - self._dataset_entries = {} - self._created_at = datetime.now(timezone.utc) - self._accessed_at = datetime.now(timezone.utc) - self._modified_at = datetime.now(timezone.utc) - self._file_operation_lock = asyncio.Lock() - - async def get(self: DatasetClient) -> dict | None: - """Retrieve the dataset. 
- - Returns: - dict, optional: The retrieved dataset, or None, if it does not exist - """ - found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name) - - if found: - async with found._file_operation_lock: - await found._update_timestamps(has_been_modified=False) - return found._to_resource_info() - - return None - - async def update(self: DatasetClient, *, name: str | None = None) -> dict: - """Update the dataset with specified fields. - - Args: - name (str, optional): The new name for the dataset - - Returns: - dict: The updated dataset - """ - # Check by id - existing_dataset_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, - id=self._id, - name=self._name, - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self._id) - - # Skip if no changes - if name is None: - return existing_dataset_by_id._to_resource_info() - - async with existing_dataset_by_id._file_operation_lock: - # Check that name is not in use already - existing_dataset_by_name = next( - (dataset for dataset in self._memory_storage_client._datasets_handled if dataset._name and dataset._name.lower() == name.lower()), - None, - ) - - if existing_dataset_by_name is not None: - raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name) - - existing_dataset_by_id._name = name - - previous_dir = existing_dataset_by_id._resource_directory - - existing_dataset_by_id._resource_directory = os.path.join(self._memory_storage_client._datasets_directory, name) - - await force_rename(previous_dir, existing_dataset_by_id._resource_directory) - - # Update timestamps - await existing_dataset_by_id._update_timestamps(has_been_modified=True) - - return existing_dataset_by_id._to_resource_info() - - async def delete(self: DatasetClient) -> None: - """Delete the dataset.""" - dataset = next((dataset for dataset in self._memory_storage_client._datasets_handled if dataset._id == self._id), None) - - if dataset is not None: - async with dataset._file_operation_lock: - self._memory_storage_client._datasets_handled.remove(dataset) - dataset._item_count = 0 - dataset._dataset_entries.clear() - - if os.path.exists(dataset._resource_directory): - await aioshutil.rmtree(dataset._resource_directory) - - async def list_items( - self: DatasetClient, - *, - offset: int | None = 0, - limit: int | None = LIST_ITEMS_LIMIT, - clean: bool | None = None, # noqa: ARG002 - desc: bool | None = None, - fields: list[str] | None = None, # noqa: ARG002 - omit: list[str] | None = None, # noqa: ARG002 - unwind: str | None = None, # noqa: ARG002 - skip_empty: bool | None = None, # noqa: ARG002 - skip_hidden: bool | None = None, # noqa: ARG002 - flatten: list[str] | None = None, # noqa: ARG002 - view: str | None = None, # noqa: ARG002 - ) -> ListPage: - """List the items of the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. - clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. 
- Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. - Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - flatten (list of str, optional): A list of fields that should be flattened - view (str, optional): Name of the dataset view to be used - - Returns: - ListPage: A page of the list of dataset items according to the specified filters. - """ - # Check by id - existing_dataset_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, - id=self._id, - name=self._name, - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self._id) - - async with existing_dataset_by_id._file_operation_lock: - start, end = existing_dataset_by_id._get_start_and_end_indexes( - max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0, - limit, - ) - - items = [] - - for idx in range(start, end): - entry_number = self._generate_local_entry_name(idx) - items.append(existing_dataset_by_id._dataset_entries[entry_number]) - - await existing_dataset_by_id._update_timestamps(has_been_modified=False) - - if desc: - items.reverse() - - return ListPage( - { - 'count': len(items), - 'desc': desc or False, - 'items': items, - 'limit': limit or LIST_ITEMS_LIMIT, - 'offset': offset or 0, - 'total': existing_dataset_by_id._item_count, - } - ) - - async def iterate_items( - self: DatasetClient, - *, - offset: int = 0, - limit: int | None = None, - clean: bool | None = None, # noqa: ARG002 - desc: bool | None = None, - fields: list[str] | None = None, # noqa: ARG002 - omit: list[str] | None = None, # noqa: ARG002 - unwind: str | None = None, # noqa: ARG002 - skip_empty: bool | None = None, # noqa: ARG002 - skip_hidden: bool | None = None, # noqa: ARG002 - ) -> AsyncIterator[dict]: - """Iterate over the items in the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. - clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. 
fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. - Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. - Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - - Yields: - dict: An item from the dataset - """ - cache_size = 1000 - first_item = offset - - # If there is no limit, set last_item to None until we get the total from the first API response - last_item = None if limit is None else offset + limit - - current_offset = first_item - while last_item is None or current_offset < last_item: - current_limit = cache_size if last_item is None else min(cache_size, last_item - current_offset) - - current_items_page = await self.list_items( - offset=current_offset, - limit=current_limit, - desc=desc, - ) - - current_offset += current_items_page.count - if last_item is None or current_items_page.total < last_item: - last_item = current_items_page.total - - for item in current_items_page.items: - yield item - - async def get_items_as_bytes(self: DatasetClient, *_args: Any, **_kwargs: Any) -> bytes: - raise NotImplementedError('This method is not supported in local memory storage.') - - async def stream_items(self: DatasetClient, *_args: Any, **_kwargs: Any) -> AsyncIterator: - raise NotImplementedError('This method is not supported in local memory storage.') - - async def push_items(self: DatasetClient, items: JSONSerializable) -> None: - """Push items to the dataset. - - Args: - items: The items which to push in the dataset. Either a stringified JSON, a dictionary, or a list of strings or dictionaries. 
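As a usage illustration of the dataset sub-client methods documented above (push_items, list_items, iterate_items), a minimal sketch; the dataset name and item payloads are assumptions:

import asyncio
from apify._memory_storage import MemoryStorageClient

async def main() -> None:
    client = MemoryStorageClient(local_data_directory='./storage')
    info = await client.datasets().get_or_create(name='my-results')  # ensure the dataset exists
    dataset = client.dataset(info['id'])

    await dataset.push_items({'url': 'https://example.com', 'status': 200})  # a single JSON object
    await dataset.push_items([{'url': 'https://example.com/a'}, {'url': 'https://example.com/b'}])

    page = await dataset.list_items(offset=0, limit=100, desc=True)  # ListPage, newest items first
    print(page.count, page.total)

    async for item in dataset.iterate_items():  # yields every item, fetching in chunks of 1,000
        print(item)

asyncio.run(main())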
- """ - # Check by id - existing_dataset_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self._id) - - normalized = self._normalize_items(items) - - added_ids: list[str] = [] - for entry in normalized: - existing_dataset_by_id._item_count += 1 - idx = self._generate_local_entry_name(existing_dataset_by_id._item_count) - - existing_dataset_by_id._dataset_entries[idx] = entry - added_ids.append(idx) - - data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids] # noqa: A001 - - async with existing_dataset_by_id._file_operation_lock: - await existing_dataset_by_id._update_timestamps(has_been_modified=True) - - await _update_dataset_items( - data=data_entries, - entity_directory=existing_dataset_by_id._resource_directory, - persist_storage=self._memory_storage_client._persist_storage, - ) - - def _to_resource_info(self: DatasetClient) -> dict: - """Retrieve the dataset info.""" - return { - 'id': self._id, - 'name': self._name, - 'itemCount': self._item_count, - 'accessedAt': self._accessed_at, - 'createdAt': self._created_at, - 'modifiedAt': self._modified_at, - } - - async def _update_timestamps(self: DatasetClient, has_been_modified: bool) -> None: # noqa: FBT001 - """Update the timestamps of the dataset.""" - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - dataset_info = self._to_resource_info() - await update_metadata( - data=dataset_info, - entity_directory=self._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - def _get_start_and_end_indexes(self: DatasetClient, offset: int, limit: int | None = None) -> tuple[int, int]: - actual_limit = limit or self._item_count - start = offset + 1 - end = min(offset + actual_limit, self._item_count) + 1 - return (start, end) - - def _generate_local_entry_name(self: DatasetClient, idx: int) -> str: - return str(idx).zfill(LOCAL_ENTRY_NAME_DIGITS) - - def _normalize_items(self: DatasetClient, items: JSONSerializable) -> list[dict]: - def normalize_item(item: Any) -> dict | None: - if isinstance(item, str): - item = json.loads(item) - - if isinstance(item, list): - received = ',\n'.join(item) - raise TypeError(f'Each dataset item can only be a single JSON object, not an array. Received: [{received}]') - - if (not isinstance(item, dict)) and item is not None: - raise TypeError(f'Each dataset item must be a JSON object. Received: {item}') - - return item - - if isinstance(items, str): - items = json.loads(items) - - result = list(map(normalize_item, items)) if isinstance(items, list) else [normalize_item(items)] - # filter(None, ..) 
returns items that are True - return list(filter(None, result)) - - @classmethod - def _get_storages_dir(cls: type[DatasetClient], memory_storage_client: MemoryStorageClient) -> str: - return memory_storage_client._datasets_directory - - @classmethod - def _get_storage_client_cache( - cls: type[DatasetClient], - memory_storage_client: MemoryStorageClient, - ) -> list[DatasetClient]: - return memory_storage_client._datasets_handled - - @classmethod - def _create_from_directory( - cls: type[DatasetClient], - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> DatasetClient: - item_count = 0 - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - entries: dict[str, dict] = {} - - has_seen_metadata_file = False - - # Access the dataset folder - for entry in os.scandir(storage_directory): - if entry.is_file(): - if entry.name == '__metadata__.json': - has_seen_metadata_file = True - - # We have found the dataset's metadata file, build out information based on it - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - metadata = json.load(f) - id = metadata['id'] # noqa: A001 - name = metadata['name'] - item_count = metadata['itemCount'] - created_at = datetime.fromisoformat(metadata['createdAt']) - accessed_at = datetime.fromisoformat(metadata['accessedAt']) - modified_at = datetime.fromisoformat(metadata['modifiedAt']) - - continue - - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - entry_content = json.load(f) - entry_name = entry.name.split('.')[0] - - entries[entry_name] = entry_content - - if not has_seen_metadata_file: - item_count += 1 - - new_client = DatasetClient( - base_storage_directory=memory_storage_client._datasets_directory, - memory_storage_client=memory_storage_client, - id=id, - name=name, - ) - - # Overwrite properties - new_client._accessed_at = accessed_at - new_client._created_at = created_at - new_client._modified_at = modified_at - new_client._item_count = item_count - - for entry_id, content in entries.items(): - new_client._dataset_entries[entry_id] = content - - return new_client diff --git a/src/apify/_memory_storage/resource_clients/dataset_collection.py b/src/apify/_memory_storage/resource_clients/dataset_collection.py deleted file mode 100644 index 0ef7b3f0..00000000 --- a/src/apify/_memory_storage/resource_clients/dataset_collection.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient -from apify._memory_storage.resource_clients.dataset import DatasetClient - -if TYPE_CHECKING: - from apify_shared.models import ListPage - - -@ignore_docs -class DatasetCollectionClient(BaseResourceCollectionClient): - """Sub-client for manipulating datasets.""" - - def _get_storage_client_cache(self: DatasetCollectionClient) -> list[DatasetClient]: - return self._memory_storage_client._datasets_handled - - def _get_resource_client_class(self: DatasetCollectionClient) -> type[DatasetClient]: - return DatasetClient - - async def list(self: DatasetCollectionClient) -> ListPage: - """List the available datasets. - - Returns: - ListPage: The list of available datasets matching the specified filters. 
- """ - return await super().list() - - async def get_or_create( - self: DatasetCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named dataset, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the dataset to retrieve or create. - schema (dict, optional): The schema of the dataset - - Returns: - dict: The retrieved or newly-created dataset. - """ - return await super().get_or_create(name=name, schema=schema, _id=_id) diff --git a/src/apify/_memory_storage/resource_clients/key_value_store.py b/src/apify/_memory_storage/resource_clients/key_value_store.py deleted file mode 100644 index 2920089d..00000000 --- a/src/apify/_memory_storage/resource_clients/key_value_store.py +++ /dev/null @@ -1,533 +0,0 @@ -from __future__ import annotations - -import asyncio -import io -import json -import mimetypes -import os -import pathlib -from datetime import datetime, timezone -from operator import itemgetter -from typing import TYPE_CHECKING, Any, AsyncIterator, TypedDict - -import aiofiles -import aioshutil -from aiofiles.os import makedirs -from apify_shared.utils import ignore_docs, is_file_or_bytes, json_dumps - -from apify._crypto import crypto_random_object_id -from apify._memory_storage.file_storage_utils import update_metadata -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient -from apify._utils import ( - force_remove, - force_rename, - guess_file_extension, - maybe_parse_body, - raise_on_duplicate_storage, - raise_on_non_existing_storage, -) -from apify.consts import DEFAULT_API_PARAM_LIMIT, StorageTypes -from apify.log import logger - -if TYPE_CHECKING: - from typing_extensions import NotRequired - - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -class KeyValueStoreRecord(TypedDict): - key: str - value: Any - contentType: str | None - filename: NotRequired[str] - - -def _filename_from_record(record: KeyValueStoreRecord) -> str: - if record.get('filename') is not None: - return record['filename'] - - content_type = record.get('contentType') - if not content_type or content_type == 'application/octet-stream': - return record['key'] - - extension = guess_file_extension(content_type) - if record['key'].endswith(f'.{extension}'): - return record['key'] - - return f'{record["key"]}.{extension}' - - -@ignore_docs -class KeyValueStoreClient(BaseResourceClient): - """Sub-client for manipulating a single key-value store.""" - - _id: str - _resource_directory: str - _memory_storage_client: MemoryStorageClient - _name: str | None - _records: dict[str, KeyValueStoreRecord] - _created_at: datetime - _accessed_at: datetime - _modified_at: datetime - _file_operation_lock: asyncio.Lock - - def __init__( - self: KeyValueStoreClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the KeyValueStoreClient.""" - self._id = id or crypto_random_object_id() - self._resource_directory = os.path.join(base_storage_directory, name or self._id) - self._memory_storage_client = memory_storage_client - self._name = name - self._records = {} - self._created_at = datetime.now(timezone.utc) - self._accessed_at = datetime.now(timezone.utc) - self._modified_at = datetime.now(timezone.utc) - self._file_operation_lock = asyncio.Lock() - - async def get(self: KeyValueStoreClient) -> dict | None: - """Retrieve 
the key-value store. - - Returns: - dict, optional: The retrieved key-value store, or None if it does not exist - """ - found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name) - - if found: - async with found._file_operation_lock: - await found._update_timestamps(has_been_modified=False) - return found._to_resource_info() - - return None - - async def update(self: KeyValueStoreClient, *, name: str | None = None) -> dict: - """Update the key-value store with specified fields. - - Args: - name (str, optional): The new name for key-value store - - Returns: - dict: The updated key-value store - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - # Skip if no changes - if name is None: - return existing_store_by_id._to_resource_info() - - async with existing_store_by_id._file_operation_lock: - # Check that name is not in use already - existing_store_by_name = next( - (store for store in self._memory_storage_client._key_value_stores_handled if store._name and store._name.lower() == name.lower()), - None, - ) - - if existing_store_by_name is not None: - raise_on_duplicate_storage(StorageTypes.KEY_VALUE_STORE, 'name', name) - - existing_store_by_id._name = name - - previous_dir = existing_store_by_id._resource_directory - - existing_store_by_id._resource_directory = os.path.join(self._memory_storage_client._key_value_stores_directory, name) - - await force_rename(previous_dir, existing_store_by_id._resource_directory) - - # Update timestamps - await existing_store_by_id._update_timestamps(has_been_modified=True) - - return existing_store_by_id._to_resource_info() - - async def delete(self: KeyValueStoreClient) -> None: - """Delete the key-value store.""" - store = next((store for store in self._memory_storage_client._key_value_stores_handled if store._id == self._id), None) - - if store is not None: - async with store._file_operation_lock: - self._memory_storage_client._key_value_stores_handled.remove(store) - store._records.clear() - - if os.path.exists(store._resource_directory): - await aioshutil.rmtree(store._resource_directory) - - async def list_keys( - self: KeyValueStoreClient, - *, - limit: int = DEFAULT_API_PARAM_LIMIT, - exclusive_start_key: str | None = None, - ) -> dict: - """List the keys in the key-value store. - - Args: - limit (int, optional): Number of keys to be returned. 
Maximum value is 1000 - exclusive_start_key (str, optional): All keys up to this one (including) are skipped from the result - - Returns: - dict: The list of keys in the key-value store matching the given arguments - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - items = [] - - for record in existing_store_by_id._records.values(): - size = len(record['value']) - items.append( - { - 'key': record['key'], - 'size': size, - } - ) - - if len(items) == 0: - return { - 'count': len(items), - 'limit': limit, - 'exclusiveStartKey': exclusive_start_key, - 'isTruncated': False, - 'nextExclusiveStartKey': None, - 'items': items, - } - - # Lexically sort to emulate the API - items = sorted(items, key=itemgetter('key')) - - truncated_items = items - if exclusive_start_key is not None: - key_pos = next((idx for idx, i in enumerate(items) if i['key'] == exclusive_start_key), None) - if key_pos is not None: - truncated_items = items[(key_pos + 1) :] - - limited_items = truncated_items[:limit] - - last_item_in_store = items[-1] - last_selected_item = limited_items[-1] - is_last_selected_item_absolutely_last = last_item_in_store == last_selected_item - next_exclusive_start_key = None if is_last_selected_item_absolutely_last else last_selected_item['key'] - - async with existing_store_by_id._file_operation_lock: - await existing_store_by_id._update_timestamps(has_been_modified=False) - - return { - 'count': len(items), - 'limit': limit, - 'exclusiveStartKey': exclusive_start_key, - 'isTruncated': not is_last_selected_item_absolutely_last, - 'nextExclusiveStartKey': next_exclusive_start_key, - 'items': limited_items, - } - - async def _get_record_internal( - self: KeyValueStoreClient, - key: str, - as_bytes: bool = False, # noqa: FBT001, FBT002 - ) -> dict | None: - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - stored_record = existing_store_by_id._records.get(key) - - if stored_record is None: - return None - - record = { - 'key': stored_record['key'], - 'value': stored_record['value'], - 'contentType': stored_record.get('contentType'), - } - - if not as_bytes: - try: - record['value'] = maybe_parse_body(record['value'], record['contentType']) - except ValueError: - logger.exception('Error parsing key-value store record') - - async with existing_store_by_id._file_operation_lock: - await existing_store_by_id._update_timestamps(has_been_modified=False) - - return record - - async def get_record(self: KeyValueStoreClient, key: str) -> dict | None: - """Retrieve the given record from the key-value store. - - Args: - key (str): Key of the record to retrieve - - Returns: - dict, optional: The requested record, or None, if the record does not exist - """ - return await self._get_record_internal(key) - - async def get_record_as_bytes(self: KeyValueStoreClient, key: str) -> dict | None: - """Retrieve the given record from the key-value store, without parsing it. 
- - Args: - key (str): Key of the record to retrieve - - Returns: - dict, optional: The requested record, or None, if the record does not exist - """ - return await self._get_record_internal(key, as_bytes=True) - - async def stream_record(self: KeyValueStoreClient, _key: str) -> AsyncIterator[dict | None]: - raise NotImplementedError('This method is not supported in local memory storage.') - - async def set_record(self: KeyValueStoreClient, key: str, value: Any, content_type: str | None = None) -> None: - """Set a value to the given record in the key-value store. - - Args: - key (str): The key of the record to save the value to - value (Any): The value to save into the record - content_type (str, optional): The content type of the saved value - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - if isinstance(value, io.IOBase): - raise NotImplementedError('File-like values are not supported in local memory storage') - - if content_type is None: - if is_file_or_bytes(value): - content_type = 'application/octet-stream' - elif isinstance(value, str): - content_type = 'text/plain; charset=utf-8' - else: - content_type = 'application/json; charset=utf-8' - - if 'application/json' in content_type and not is_file_or_bytes(value) and not isinstance(value, str): - value = json_dumps(value).encode('utf-8') - - async with existing_store_by_id._file_operation_lock: - await existing_store_by_id._update_timestamps(has_been_modified=True) - record: KeyValueStoreRecord = { - 'key': key, - 'value': value, - 'contentType': content_type, - } - - old_record = existing_store_by_id._records.get(key) - existing_store_by_id._records[key] = record - - if self._memory_storage_client._persist_storage: - if old_record is not None and _filename_from_record(old_record) != _filename_from_record(record): - await existing_store_by_id._delete_persisted_record(old_record) - - await existing_store_by_id._persist_record(record) - - async def _persist_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None: - store_directory = self._resource_directory - record_filename = _filename_from_record(record) - record['filename'] = record_filename - - # Ensure the directory for the entity exists - await makedirs(store_directory, exist_ok=True) - - # Create files for the record - record_path = os.path.join(store_directory, record_filename) - record_metadata_path = os.path.join(store_directory, record_filename + '.__metadata__.json') - - # Convert to bytes if string - if isinstance(record['value'], str): - record['value'] = record['value'].encode('utf-8') - - async with aiofiles.open(record_path, mode='wb') as f: - await f.write(record['value']) - - if self._memory_storage_client._write_metadata: - async with aiofiles.open(record_metadata_path, mode='wb') as f: - await f.write( - json_dumps( - { - 'key': record['key'], - 'contentType': record['contentType'], - } - ).encode('utf-8') - ) - - async def delete_record(self: KeyValueStoreClient, key: str) -> None: - """Delete the specified record from the key-value store. 
- - Args: - key (str): The key of the record which to delete - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - record = existing_store_by_id._records.get(key) - - if record is not None: - async with existing_store_by_id._file_operation_lock: - del existing_store_by_id._records[key] - await existing_store_by_id._update_timestamps(has_been_modified=True) - if self._memory_storage_client._persist_storage: - await existing_store_by_id._delete_persisted_record(record) - - async def _delete_persisted_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None: - store_directory = self._resource_directory - record_filename = _filename_from_record(record) - - # Ensure the directory for the entity exists - await makedirs(store_directory, exist_ok=True) - - # Create files for the record - record_path = os.path.join(store_directory, record_filename) - record_metadata_path = os.path.join(store_directory, record_filename + '.__metadata__.json') - - await force_remove(record_path) - await force_remove(record_metadata_path) - - def _to_resource_info(self: KeyValueStoreClient) -> dict: - """Retrieve the key-value store info.""" - return { - 'id': self._id, - 'name': self._name, - 'accessedAt': self._accessed_at, - 'createdAt': self._created_at, - 'modifiedAt': self._modified_at, - 'userId': '1', - } - - async def _update_timestamps(self: KeyValueStoreClient, has_been_modified: bool) -> None: # noqa: FBT001 - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - kv_store_info = self._to_resource_info() - await update_metadata( - data=kv_store_info, - entity_directory=self._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - @classmethod - def _get_storages_dir(cls: type[KeyValueStoreClient], memory_storage_client: MemoryStorageClient) -> str: - return memory_storage_client._key_value_stores_directory - - @classmethod - def _get_storage_client_cache( - cls: type[KeyValueStoreClient], - memory_storage_client: MemoryStorageClient, - ) -> list[KeyValueStoreClient]: - return memory_storage_client._key_value_stores_handled - - @classmethod - def _create_from_directory( - cls: type[KeyValueStoreClient], - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> KeyValueStoreClient: - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - - store_metadata_path = os.path.join(storage_directory, '__metadata__.json') - if os.path.exists(store_metadata_path): - with open(store_metadata_path, encoding='utf-8') as f: - metadata = json.load(f) - id = metadata['id'] # noqa: A001 - name = metadata['name'] - created_at = datetime.fromisoformat(metadata['createdAt']) - accessed_at = datetime.fromisoformat(metadata['accessedAt']) - modified_at = datetime.fromisoformat(metadata['modifiedAt']) - - new_client = KeyValueStoreClient( - base_storage_directory=memory_storage_client._key_value_stores_directory, - memory_storage_client=memory_storage_client, - id=id, - name=name, - ) - - # Overwrite internal properties - new_client._accessed_at = accessed_at - new_client._created_at = created_at - new_client._modified_at = 
modified_at - - # Scan the key value store folder, check each entry in there and parse it as a store record - for entry in os.scandir(storage_directory): - if not entry.is_file(): - continue - - # Ignore metadata files on their own - if entry.name.endswith('__metadata__.json'): - continue - - with open(os.path.join(storage_directory, entry.name), 'rb') as f: - file_content = f.read() - - # Try checking if this file has a metadata file associated with it - metadata = None - if os.path.exists(os.path.join(storage_directory, entry.name + '.__metadata__.json')): - with open(os.path.join(storage_directory, entry.name + '.__metadata__.json'), encoding='utf-8') as metadata_file: - try: - metadata = json.load(metadata_file) - assert metadata.get('key') is not None # noqa: S101 - assert metadata.get('contentType') is not None # noqa: S101 - except Exception: - logger.warning( - f"""Metadata of key-value store entry "{entry.name}" for store {name or id} could not be parsed.""" - 'The metadata file will be ignored.', - exc_info=True, - ) - - if not metadata: - content_type, _ = mimetypes.guess_type(entry.name) - if content_type is None: - content_type = 'application/octet-stream' - - metadata = { - 'key': pathlib.Path(entry.name).stem, - 'contentType': content_type, - } - - try: - maybe_parse_body(file_content, metadata['contentType']) - except Exception: - metadata['contentType'] = 'application/octet-stream' - logger.warning( - f"""Key-value store entry "{metadata['key']}" for store {name or id} could not be parsed.""" - 'The entry will be assumed as binary.', - exc_info=True, - ) - - new_client._records[metadata['key']] = { - 'key': metadata['key'], - 'contentType': metadata['contentType'], - 'filename': entry.name, - 'value': file_content, - } - - return new_client diff --git a/src/apify/_memory_storage/resource_clients/key_value_store_collection.py b/src/apify/_memory_storage/resource_clients/key_value_store_collection.py deleted file mode 100644 index 9acb156e..00000000 --- a/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient -from apify._memory_storage.resource_clients.key_value_store import KeyValueStoreClient - -if TYPE_CHECKING: - from apify_shared.models import ListPage - - -@ignore_docs -class KeyValueStoreCollectionClient(BaseResourceCollectionClient): - """Sub-client for manipulating key-value stores.""" - - def _get_storage_client_cache(self: KeyValueStoreCollectionClient) -> list[KeyValueStoreClient]: - return self._memory_storage_client._key_value_stores_handled - - def _get_resource_client_class(self: KeyValueStoreCollectionClient) -> type[KeyValueStoreClient]: - return KeyValueStoreClient - - async def list(self: KeyValueStoreCollectionClient) -> ListPage: - """List the available key-value stores. - - Returns: - ListPage: The list of available key-value stores matching the specified filters. - """ - return await super().list() - - async def get_or_create( - self: KeyValueStoreCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named key-value store, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the key-value store to retrieve or create. 
- schema (Dict, optional): The schema of the key-value store - - Returns: - dict: The retrieved or newly-created key-value store. - """ - return await super().get_or_create(name=name, schema=schema, _id=_id) diff --git a/src/apify/_memory_storage/resource_clients/request_queue.py b/src/apify/_memory_storage/resource_clients/request_queue.py deleted file mode 100644 index 1798c586..00000000 --- a/src/apify/_memory_storage/resource_clients/request_queue.py +++ /dev/null @@ -1,466 +0,0 @@ -from __future__ import annotations - -import asyncio -import json -import os -from datetime import datetime, timezone -from decimal import Decimal -from typing import TYPE_CHECKING - -import aioshutil -from apify_shared.utils import filter_out_none_values_recursively, ignore_docs, json_dumps -from sortedcollections import ValueSortedDict - -from apify._crypto import crypto_random_object_id -from apify._memory_storage.file_storage_utils import delete_request, update_metadata, update_request_queue_item -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient -from apify._utils import force_rename, raise_on_duplicate_storage, raise_on_non_existing_storage, unique_key_to_request_id -from apify.consts import StorageTypes - -if TYPE_CHECKING: - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -@ignore_docs -class RequestQueueClient(BaseResourceClient): - """Sub-client for manipulating a single request queue.""" - - _id: str - _resource_directory: str - _memory_storage_client: MemoryStorageClient - _name: str | None - _requests: ValueSortedDict - _created_at: datetime - _accessed_at: datetime - _modified_at: datetime - _handled_request_count = 0 - _pending_request_count = 0 - _last_used_timestamp = Decimal(0.0) - _file_operation_lock: asyncio.Lock - - def __init__( - self: RequestQueueClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the RequestQueueClient.""" - self._id = id or crypto_random_object_id() - self._resource_directory = os.path.join(base_storage_directory, name or self._id) - self._memory_storage_client = memory_storage_client - self._name = name - self._requests = ValueSortedDict(lambda req: req.get('orderNo') or -float('inf')) - self._created_at = datetime.now(timezone.utc) - self._accessed_at = datetime.now(timezone.utc) - self._modified_at = datetime.now(timezone.utc) - self._file_operation_lock = asyncio.Lock() - - async def get(self: RequestQueueClient) -> dict | None: - """Retrieve the request queue. - - Returns: - dict, optional: The retrieved request queue, or None, if it does not exist - """ - found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name) - - if found: - async with found._file_operation_lock: - await found._update_timestamps(has_been_modified=False) - return found._to_resource_info() - - return None - - async def update(self: RequestQueueClient, *, name: str | None = None) -> dict: - """Update the request queue with specified fields. 
- - Args: - name (str, optional): The new name for the request queue - - Returns: - dict: The updated request queue - """ - # Check by id - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - # Skip if no changes - if name is None: - return existing_queue_by_id._to_resource_info() - - async with existing_queue_by_id._file_operation_lock: - # Check that name is not in use already - existing_queue_by_name = next( - (queue for queue in self._memory_storage_client._request_queues_handled if queue._name and queue._name.lower() == name.lower()), None - ) - - if existing_queue_by_name is not None: - raise_on_duplicate_storage(StorageTypes.REQUEST_QUEUE, 'name', name) - - existing_queue_by_id._name = name - - previous_dir = existing_queue_by_id._resource_directory - - existing_queue_by_id._resource_directory = os.path.join(self._memory_storage_client._request_queues_directory, name) - - await force_rename(previous_dir, existing_queue_by_id._resource_directory) - - # Update timestamps - await existing_queue_by_id._update_timestamps(has_been_modified=True) - - return existing_queue_by_id._to_resource_info() - - async def delete(self: RequestQueueClient) -> None: - """Delete the request queue.""" - queue = next((queue for queue in self._memory_storage_client._request_queues_handled if queue._id == self._id), None) - - if queue is not None: - async with queue._file_operation_lock: - self._memory_storage_client._request_queues_handled.remove(queue) - queue._pending_request_count = 0 - queue._handled_request_count = 0 - queue._requests.clear() - - if os.path.exists(queue._resource_directory): - await aioshutil.rmtree(queue._resource_directory) - - async def list_head(self: RequestQueueClient, *, limit: int | None = None) -> dict: - """Retrieve a given number of requests from the beginning of the queue. - - Args: - limit (int, optional): How many requests to retrieve - - Returns: - dict: The desired number of requests from the beginning of the queue. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - async with existing_queue_by_id._file_operation_lock: - await existing_queue_by_id._update_timestamps(has_been_modified=False) - - items: list[dict] = [] - - # Iterate all requests in the queue which have sorted key larger than infinity, which means `orderNo` is not `None` - # This will iterate them in order of `orderNo` - for request_key in existing_queue_by_id._requests.irange_key(min_key=-float('inf'), inclusive=(False, True)): - if len(items) == limit: - break - - request = existing_queue_by_id._requests.get(request_key) - - # Check that the request still exists and was not handled, - # in case something deleted it or marked it as handled concurrenctly - if request and request['orderNo']: - items.append(request) - - return { - 'limit': limit, - 'hadMultipleClients': False, - 'queueModifiedAt': existing_queue_by_id._modified_at, - 'items': [self._json_to_request(item['json']) for item in items], - } - - async def add_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict: - """Add a request to the queue. 
- - Args: - request (dict): The request to add to the queue - forefront (bool, optional): Whether to add the request to the head or the end of the queue - - Returns: - dict: The added request. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - request_model = self._create_internal_request(request, forefront) - - async with existing_queue_by_id._file_operation_lock: - existing_request_with_id = existing_queue_by_id._requests.get(request_model['id']) - - # We already have the request present, so we return information about it - if existing_request_with_id is not None: - await existing_queue_by_id._update_timestamps(has_been_modified=False) - - return { - 'requestId': existing_request_with_id['id'], - 'wasAlreadyHandled': existing_request_with_id['orderNo'] is None, - 'wasAlreadyPresent': True, - } - - existing_queue_by_id._requests[request_model['id']] = request_model - if request_model['orderNo'] is None: - existing_queue_by_id._handled_request_count += 1 - else: - existing_queue_by_id._pending_request_count += 1 - await existing_queue_by_id._update_timestamps(has_been_modified=True) - await update_request_queue_item( - request=request_model, - request_id=request_model['id'], - entity_directory=existing_queue_by_id._resource_directory, - persist_storage=self._memory_storage_client._persist_storage, - ) - - return { - 'requestId': request_model['id'], - # We return wasAlreadyHandled: false even though the request may - # have been added as handled, because that's how API behaves. - 'wasAlreadyHandled': False, - 'wasAlreadyPresent': False, - } - - async def get_request(self: RequestQueueClient, request_id: str) -> dict | None: - """Retrieve a request from the queue. - - Args: - request_id (str): ID of the request to retrieve - - Returns: - dict, optional: The retrieved request, or None, if it did not exist. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - async with existing_queue_by_id._file_operation_lock: - await existing_queue_by_id._update_timestamps(has_been_modified=False) - - request = existing_queue_by_id._requests.get(request_id) - return self._json_to_request(request['json'] if request is not None else None) - - async def update_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict: - """Update a request in the queue. - - Args: - request (dict): The updated request - forefront (bool, optional): Whether to put the updated request in the beginning or the end of the queue - - Returns: - dict: The updated request - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - request_model = self._create_internal_request(request, forefront) - - # First we need to check the existing request to be - # able to return information about its handled state. - - existing_request = existing_queue_by_id._requests.get(request_model['id']) - - # Undefined means that the request is not present in the queue. 
- # We need to insert it, to behave the same as API. - if existing_request is None: - return await self.add_request(request, forefront=forefront) - - async with existing_queue_by_id._file_operation_lock: - # When updating the request, we need to make sure that - # the handled counts are updated correctly in all cases. - existing_queue_by_id._requests[request_model['id']] = request_model - - pending_count_adjustment = 0 - is_request_handled_state_changing = not isinstance(existing_request['orderNo'], type(request_model['orderNo'])) - request_was_handled_before_update = existing_request['orderNo'] is None - - # We add 1 pending request if previous state was handled - if is_request_handled_state_changing: - pending_count_adjustment = 1 if request_was_handled_before_update else -1 - - existing_queue_by_id._pending_request_count += pending_count_adjustment - existing_queue_by_id._handled_request_count -= pending_count_adjustment - await existing_queue_by_id._update_timestamps(has_been_modified=True) - await update_request_queue_item( - request=request_model, - request_id=request_model['id'], - entity_directory=existing_queue_by_id._resource_directory, - persist_storage=self._memory_storage_client._persist_storage, - ) - - return { - 'requestId': request_model['id'], - 'wasAlreadyHandled': request_was_handled_before_update, - 'wasAlreadyPresent': True, - } - - async def delete_request(self: RequestQueueClient, request_id: str) -> None: - """Delete a request from the queue. - - Args: - request_id (str): ID of the request to delete. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - async with existing_queue_by_id._file_operation_lock: - request = existing_queue_by_id._requests.get(request_id) - - if request: - del existing_queue_by_id._requests[request_id] - if request['orderNo'] is None: - existing_queue_by_id._handled_request_count -= 1 - else: - existing_queue_by_id._pending_request_count -= 1 - await existing_queue_by_id._update_timestamps(has_been_modified=True) - await delete_request(entity_directory=existing_queue_by_id._resource_directory, request_id=request_id) - - def _to_resource_info(self: RequestQueueClient) -> dict: - """Retrieve the request queue store info.""" - return { - 'accessedAt': self._accessed_at, - 'createdAt': self._created_at, - 'hadMultipleClients': False, - 'handledRequestCount': self._handled_request_count, - 'id': self._id, - 'modifiedAt': self._modified_at, - 'name': self._name, - 'pendingRequestCount': self._pending_request_count, - 'stats': {}, - 'totalRequestCount': len(self._requests), - 'userId': '1', - } - - async def _update_timestamps(self: RequestQueueClient, has_been_modified: bool) -> None: # noqa: FBT001 - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - request_queue_info = self._to_resource_info() - await update_metadata( - data=request_queue_info, - entity_directory=self._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - def _json_to_request(self: RequestQueueClient, request_json: str | None) -> dict | None: - if request_json is None: - return None - request = json.loads(request_json) - return filter_out_none_values_recursively(request) - - def _create_internal_request(self: RequestQueueClient, request: dict, forefront: 
bool | None) -> dict: - order_no = self._calculate_order_no(request, forefront) - id = unique_key_to_request_id(request['uniqueKey']) # noqa: A001 - - if request.get('id') is not None and request['id'] != id: - raise ValueError('Request ID does not match its unique_key.') - - json_request = json_dumps({**request, 'id': id}) - return { - 'id': id, - 'json': json_request, - 'method': request.get('method'), - 'orderNo': order_no, - 'retryCount': request.get('retryCount', 0), - 'uniqueKey': request['uniqueKey'], - 'url': request['url'], - } - - def _calculate_order_no(self: RequestQueueClient, request: dict, forefront: bool | None) -> Decimal | None: - if request.get('handledAt') is not None: - return None - - # Get the current timestamp in milliseconds - timestamp = Decimal(datetime.now(timezone.utc).timestamp()) * 1000 - timestamp = round(timestamp, 6) - - # Make sure that this timestamp was not used yet, so that we have unique orderNos - if timestamp <= self._last_used_timestamp: - timestamp = self._last_used_timestamp + Decimal(0.000001) - - self._last_used_timestamp = timestamp - - return -timestamp if forefront else timestamp - - @classmethod - def _get_storages_dir(cls: type[RequestQueueClient], memory_storage_client: MemoryStorageClient) -> str: - return memory_storage_client._request_queues_directory - - @classmethod - def _get_storage_client_cache( - cls: type[RequestQueueClient], - memory_storage_client: MemoryStorageClient, - ) -> list[RequestQueueClient]: - return memory_storage_client._request_queues_handled - - @classmethod - def _create_from_directory( - cls: type[RequestQueueClient], - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> RequestQueueClient: - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - handled_request_count = 0 - pending_request_count = 0 - entries: list[dict] = [] - - # Access the request queue folder - for entry in os.scandir(storage_directory): - if entry.is_file(): - if entry.name == '__metadata__.json': - # We have found the queue's metadata file, build out information based on it - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - metadata = json.load(f) - id = metadata['id'] # noqa: A001 - name = metadata['name'] - created_at = datetime.fromisoformat(metadata['createdAt']) - accessed_at = datetime.fromisoformat(metadata['accessedAt']) - modified_at = datetime.fromisoformat(metadata['modifiedAt']) - handled_request_count = metadata['handledRequestCount'] - pending_request_count = metadata['pendingRequestCount'] - - continue - - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - request = json.load(f) - if request.get('orderNo'): - request['orderNo'] = Decimal(request.get('orderNo')) - entries.append(request) - - new_client = cls( - base_storage_directory=memory_storage_client._request_queues_directory, - memory_storage_client=memory_storage_client, - id=id, - name=name, - ) - - # Overwrite properties - new_client._accessed_at = accessed_at - new_client._created_at = created_at - new_client._modified_at = modified_at - new_client._handled_request_count = handled_request_count - new_client._pending_request_count = pending_request_count - - for request in entries: - new_client._requests[request['id']] = request - - return new_client diff --git a/src/apify/_memory_storage/resource_clients/request_queue_collection.py 
b/src/apify/_memory_storage/resource_clients/request_queue_collection.py deleted file mode 100644 index dd69c918..00000000 --- a/src/apify/_memory_storage/resource_clients/request_queue_collection.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient -from apify._memory_storage.resource_clients.request_queue import RequestQueueClient - -if TYPE_CHECKING: - from apify_shared.models import ListPage - - -@ignore_docs -class RequestQueueCollectionClient(BaseResourceCollectionClient): - """Sub-client for manipulating request queues.""" - - def _get_storage_client_cache(self: RequestQueueCollectionClient) -> list[RequestQueueClient]: - return self._memory_storage_client._request_queues_handled - - def _get_resource_client_class(self: RequestQueueCollectionClient) -> type[RequestQueueClient]: - return RequestQueueClient - - async def list(self: RequestQueueCollectionClient) -> ListPage: - """List the available request queues. - - Returns: - ListPage: The list of available request queues matching the specified filters. - """ - return await super().list() - - async def get_or_create( - self: RequestQueueCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named request queue, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the request queue to retrieve or create. - schema (dict, optional): The schema of the request queue - - Returns: - dict: The retrieved or newly-created request queue. - """ - return await super().get_or_create(name=name, schema=schema, _id=_id) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 6ebcff20..14c05e67 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -116,8 +116,7 @@ def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualP @overload -def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE) -> bool | None: - ... +def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE) -> bool | None: ... @overload @@ -126,48 +125,39 @@ def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE, default: bool) -> bool: @overload -def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE) -> datetime | str | None: - ... +def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE) -> datetime | str | None: ... @overload -def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE, default: datetime) -> datetime | str: - ... +def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE, default: datetime) -> datetime | str: ... @overload -def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE) -> float | None: - ... +def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE) -> float | None: ... @overload -def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE, default: float) -> float: - ... +def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE, default: float) -> float: ... @overload -def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE) -> int | None: - ... +def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE) -> int | None: ... @overload -def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE, default: int) -> int: - ... +def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE, default: int) -> int: ... 
@overload -def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE, default: str) -> str: - ... +def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE, default: str) -> str: ... @overload -def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE) -> str | None: - ... +def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE) -> str | None: ... @overload -def fetch_and_parse_env_var(env_var: ActorEnvVars | ApifyEnvVars) -> Any: - ... +def fetch_and_parse_env_var(env_var: ActorEnvVars | ApifyEnvVars) -> Any: ... def fetch_and_parse_env_var(env_var: Any, default: Any = None) -> Any: @@ -374,13 +364,11 @@ def is_running_in_ipython() -> bool: @overload -def budget_ow(value: str | float | bool, predicate: tuple[type, bool], value_name: str) -> None: - ... +def budget_ow(value: str | float | bool, predicate: tuple[type, bool], value_name: str) -> None: ... @overload -def budget_ow(value: dict, predicate: dict[str, tuple[type, bool]]) -> None: - ... +def budget_ow(value: dict, predicate: dict[str, tuple[type, bool]]) -> None: ... def budget_ow( diff --git a/src/apify/actor.py b/src/apify/actor.py index 2c0b2239..2b14325c 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -11,6 +11,7 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorEventTypes, ActorExitCodes, ApifyEnvVars, WebhookEventType from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value +from crawlee.storage_client_manager import StorageClientManager from apify._crypto import decrypt_input_secrets, load_private_key from apify._utils import ( @@ -23,19 +24,18 @@ run_func_at_interval_async, wrap_internal, ) +from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration from apify.consts import EVENT_LISTENERS_TIMEOUT_SECS from apify.event_manager import EventManager from apify.log import logger from apify.proxy_configuration import ProxyConfiguration -from apify.storages import Dataset, KeyValueStore, RequestQueue, StorageClientManager +from apify.storages import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: import logging from types import TracebackType - from apify._memory_storage import MemoryStorageClient - T = TypeVar('T') MainReturnType = TypeVar('MainReturnType') @@ -70,8 +70,7 @@ class Actor(metaclass=_ActorContextManager): _default_instance: Actor | None = None _apify_client: ApifyClientAsync - _memory_storage_client: MemoryStorageClient - _config: Configuration + _configuration: Configuration _event_manager: EventManager _send_system_info_interval_task: asyncio.Task | None = None _send_persist_state_interval_task: asyncio.Task | None = None @@ -122,9 +121,9 @@ def __init__(self: Actor, config: Configuration | None = None) -> None: self.set_status_message = wrap_internal(self._set_status_message_internal, self.set_status_message) # type: ignore self.create_proxy_configuration = wrap_internal(self._create_proxy_configuration_internal, self.create_proxy_configuration) # type: ignore - self._config: Configuration = config or Configuration() + self._configuration = config or Configuration() self._apify_client = self.new_client() - self._event_manager = EventManager(config=self._config) + self._event_manager = EventManager(config=self._configuration) self._is_initialized = False @@ -181,8 +180,8 @@ def apify_client(self_or_cls: type[Actor] | Actor) -> ApifyClientAsync: # noqa: def config(self_or_cls: type[Actor] | Actor) -> Configuration: # noqa: N805 """The Configuration instance the Actor 
instance uses.""" if isinstance(self_or_cls, type): - return self_or_cls._get_default_instance()._config - return self_or_cls._config + return self_or_cls._get_default_instance()._configuration + return self_or_cls._configuration @dualproperty def event_manager(self_or_cls: type[Actor] | Actor) -> EventManager: # noqa: N805 @@ -229,16 +228,15 @@ async def _init_internal(self: Actor) -> None: # TODO: Print outdated SDK version warning (we need a new env var for this) # https://github.com/apify/apify-sdk-python/issues/146 - StorageClientManager.set_config(self._config) - if self._config.token: - StorageClientManager.set_cloud_client(self._apify_client) + if self._configuration.token: + StorageClientManager.set_cloud_client(ApifyStorageClient(configuration=self._configuration)) await self._event_manager.init() self._send_persist_state_interval_task = asyncio.create_task( run_func_at_interval_async( lambda: self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': False}), - self._config.persist_state_interval_millis / 1000, + self._configuration.persist_state_interval_millis / 1000, ), ) @@ -246,7 +244,7 @@ async def _init_internal(self: Actor) -> None: self._send_system_info_interval_task = asyncio.create_task( run_func_at_interval_async( lambda: self._event_manager.emit(ActorEventTypes.SYSTEM_INFO, self.get_system_info()), - self._config.system_info_interval_millis / 1000, + self._configuration.system_info_interval_millis / 1000, ), ) @@ -268,8 +266,8 @@ def get_system_info(self: Actor) -> dict: 'cpuCurrentUsage': cpu_usage_percent, 'memCurrentBytes': memory_usage_bytes, } - if self._config.max_used_cpu_ratio: - result['isCpuOverloaded'] = cpu_usage_percent > 100 * self._config.max_used_cpu_ratio + if self._configuration.max_used_cpu_ratio: + result['isCpuOverloaded'] = cpu_usage_percent > 100 * self._configuration.max_used_cpu_ratio return result @@ -496,8 +494,8 @@ def _new_client_internal( min_delay_between_retries_millis: int | None = None, timeout_secs: int | None = None, ) -> ApifyClientAsync: - token = token or self._config.token - api_url = api_url or self._config.api_base_url + token = token or self._configuration.token + api_url = api_url or self._configuration.api_base_url return ApifyClientAsync( token=token, api_url=api_url, @@ -546,7 +544,7 @@ async def _open_dataset_internal( ) -> Dataset: self._raise_if_not_initialized() - return await Dataset.open(id=id, name=name, force_cloud=force_cloud, config=self._config) + return await Dataset.open(id=id, name=name, configuration=self._configuration) @classmethod async def open_key_value_store( @@ -584,7 +582,7 @@ async def _open_key_value_store_internal( ) -> KeyValueStore: self._raise_if_not_initialized() - return await KeyValueStore.open(id=id, name=name, force_cloud=force_cloud, config=self._config) + return await KeyValueStore.open(id=id, name=name, configuration=self._configuration) @classmethod async def open_request_queue( @@ -623,7 +621,7 @@ async def _open_request_queue_internal( ) -> RequestQueue: self._raise_if_not_initialized() - return await RequestQueue.open(id=id, name=name, force_cloud=force_cloud, config=self._config) + return await RequestQueue.open(id=id, name=name, configuration=self._configuration) @classmethod async def push_data(cls: type[Actor], data: Any) -> None: @@ -651,9 +649,9 @@ async def get_input(cls: type[Actor]) -> Any: async def _get_input_internal(self: Actor) -> Any: self._raise_if_not_initialized() - input_value = await self.get_value(self._config.input_key) - 
input_secrets_private_key = self._config.input_secrets_private_key_file - input_secrets_key_passphrase = self._config.input_secrets_private_key_passphrase + input_value = await self.get_value(self._configuration.input_key) + input_secrets_private_key = self._configuration.input_secrets_private_key_file + input_secrets_key_passphrase = self._configuration.input_secrets_private_key_passphrase if input_secrets_private_key and input_secrets_key_passphrase: private_key = load_private_key( input_secrets_private_key, @@ -767,7 +765,7 @@ def is_at_home(cls: type[Actor]) -> bool: return cls._get_default_instance().is_at_home() def _is_at_home_internal(self: Actor) -> bool: - return self._config.is_at_home + return self._configuration.is_at_home @classmethod def get_env(cls: type[Actor]) -> dict: @@ -1111,12 +1109,12 @@ async def _metamorph_internal( return if not custom_after_sleep_millis: - custom_after_sleep_millis = self._config.metamorph_after_sleep_millis + custom_after_sleep_millis = self._configuration.metamorph_after_sleep_millis # If is_at_home() is True, config.actor_run_id is always set - assert self._config.actor_run_id is not None # noqa: S101 + assert self._configuration.actor_run_id is not None # noqa: S101 - await self._apify_client.run(self._config.actor_run_id).metamorph( + await self._apify_client.run(self._configuration.actor_run_id).metamorph( target_actor_id=target_actor_id, run_input=run_input, target_actor_build=target_actor_build, @@ -1159,7 +1157,7 @@ async def _reboot_internal( return if not custom_after_sleep_millis: - custom_after_sleep_millis = self._config.metamorph_after_sleep_millis + custom_after_sleep_millis = self._configuration.metamorph_after_sleep_millis await self._cancel_event_emitting_intervals() @@ -1168,8 +1166,8 @@ async def _reboot_internal( await self._event_manager.close(event_listeners_timeout_secs=event_listeners_timeout_secs) - assert self._config.actor_run_id is not None # noqa: S101 - await self._apify_client.run(self._config.actor_run_id).reboot() + assert self._configuration.actor_run_id is not None # noqa: S101 + await self._apify_client.run(self._configuration.actor_run_id).reboot() if custom_after_sleep_millis: await asyncio.sleep(custom_after_sleep_millis / 1000) @@ -1233,10 +1231,10 @@ async def _add_webhook_internal( return None # If is_at_home() is True, config.actor_run_id is always set - assert self._config.actor_run_id is not None # noqa: S101 + assert self._configuration.actor_run_id is not None # noqa: S101 return await self._apify_client.webhooks().create( - actor_run_id=self._config.actor_run_id, + actor_run_id=self._configuration.actor_run_id, event_types=event_types, request_url=request_url, payload_template=payload_template, @@ -1277,9 +1275,11 @@ async def _set_status_message_internal( return None # If is_at_home() is True, config.actor_run_id is always set - assert self._config.actor_run_id is not None # noqa: S101 + assert self._configuration.actor_run_id is not None # noqa: S101 - return await self._apify_client.run(self._config.actor_run_id).update(status_message=status_message, is_status_message_terminal=is_terminal) + return await self._apify_client.run(self._configuration.actor_run_id).update( + status_message=status_message, is_status_message_terminal=is_terminal + ) @classmethod async def create_proxy_configuration( @@ -1348,7 +1348,7 @@ async def _create_proxy_configuration_internal( country_code=country_code, proxy_urls=proxy_urls, new_url_function=new_url_function, - _actor_config=self._config, + 
_actor_config=self._configuration, _apify_client=self._apify_client, ) diff --git a/src/apify/apify_storage_client/__init__.py b/src/apify/apify_storage_client/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/apify_storage_client.py new file mode 100644 index 00000000..c00aed5c --- /dev/null +++ b/src/apify/apify_storage_client/apify_storage_client.py @@ -0,0 +1,52 @@ +from apify_client import ApifyClientAsync +from crawlee.base_storage_client.base_storage_client import BaseStorageClient +from typing_extensions import override + +from .dataset_client import DatasetClient +from .dataset_collection_client import DatasetCollectionClient +from .key_value_store_client import KeyValueStoreClient +from .key_value_store_collection_client import KeyValueStoreCollectionClient +from .request_queue_client import RequestQueueClient +from .request_queue_collection_client import RequestQueueCollectionClient +from apify.config import Configuration + + +class ApifyStorageClient(BaseStorageClient): + """A storage client implementation based on the Apify platform storage.""" + + def __init__(self, *, configuration: Configuration) -> None: + self._apify_client = ApifyClientAsync( + token=configuration.token, + api_url=configuration.api_base_url, + max_retries=8, # TODO + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + + @override + def dataset(self, id: str) -> DatasetClient: # noqa: A002 + return DatasetClient(self._apify_client.dataset(id)) + + @override + def datasets(self) -> DatasetCollectionClient: + return DatasetCollectionClient(self._apify_client.datasets()) + + @override + def key_value_store(self, id: str) -> KeyValueStoreClient: + return KeyValueStoreClient(self._apify_client.key_value_store(id)) + + @override + def key_value_stores(self) -> KeyValueStoreCollectionClient: + return KeyValueStoreCollectionClient(self._apify_client.key_value_stores()) + + @override + def request_queue(self, id: str) -> RequestQueueClient: + return RequestQueueClient(self._apify_client.request_queue(id)) + + @override + def request_queues(self) -> RequestQueueCollectionClient: + return RequestQueueCollectionClient(self._apify_client.request_queues()) + + @override + async def purge_on_start(self) -> None: + pass diff --git a/src/apify/apify_storage_client/dataset_client.py b/src/apify/apify_storage_client/dataset_client.py new file mode 100644 index 00000000..e2f5113a --- /dev/null +++ b/src/apify/apify_storage_client/dataset_client.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee.base_storage_client.base_dataset_client import BaseDatasetClient +from crawlee.models import DatasetItemsListPage, DatasetMetadata +from typing_extensions import override + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from apify_client.clients import DatasetClientAsync + from crawlee.types import JSONSerializable + + +class DatasetClient(BaseDatasetClient): + """Dataset resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_dataset_client: DatasetClientAsync) -> None: + self._client = apify_dataset_client + + @override + async def get(self) -> DatasetMetadata | None: + result = await self._client.get() + return DatasetMetadata.model_validate(result) if result else None + + @override + async def update( + self, + *, + name: str | None = None, + ) -> DatasetMetadata: + return 
DatasetMetadata.model_validate( + await self._client.update( + name=name, + ) + ) + + @override + async def delete(self) -> None: + await self._client.delete() + + @override + async def list_items( + self, + *, + offset: int | None = 0, + limit: int | None = BaseDatasetClient._LIST_ITEMS_LIMIT, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + flatten: list[str] | None = None, + view: str | None = None, + ) -> DatasetItemsListPage: + return DatasetItemsListPage.model_validate( + await self._client.list_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + flatten=flatten, + view=view, + ) + ) + + @override + async def iterate_items( + self, + *, + offset: int = 0, + limit: int | None = None, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + ) -> AsyncIterator[dict]: + return self._client.iterate_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + ) + + @override + async def get_items_as_bytes( + self, + *, + item_format: str = 'json', + offset: int | None = None, + limit: int | None = None, + desc: bool = False, + clean: bool = False, + bom: bool = False, + delimiter: str | None = None, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_header_row: bool = False, + skip_hidden: bool = False, + xml_root: str | None = None, + xml_row: str | None = None, + flatten: list[str] | None = None, + ) -> bytes: + return await self._client.get_items_as_bytes( + item_format=item_format, + offset=offset, + limit=limit, + desc=desc, + clean=clean, + bom=bom, + delimiter=delimiter, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_header_row=skip_header_row, + skip_hidden=skip_hidden, + xml_root=xml_root, + xml_row=xml_row, + flatten=flatten, + ) + + @override + async def stream_items( + self, + *, + item_format: str = 'json', + offset: int | None = None, + limit: int | None = None, + desc: bool = False, + clean: bool = False, + bom: bool = False, + delimiter: str | None = None, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_header_row: bool = False, + skip_hidden: bool = False, + xml_root: str | None = None, + xml_row: str | None = None, + ) -> AsyncIterator[dict]: # TODO incorrect type + return self._client.stream_items( + item_format=item_format, + offset=offset, + limit=limit, + desc=desc, + clean=clean, + bom=bom, + delimiter=delimiter, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_header_row=skip_header_row, + skip_hidden=skip_hidden, + xml_root=xml_root, + xml_row=xml_row, + ) + + @override + async def push_items(self, items: JSONSerializable) -> None: + await self._client.push_items( + items=items, + ) diff --git a/src/apify/apify_storage_client/dataset_collection_client.py b/src/apify/apify_storage_client/dataset_collection_client.py new file mode 100644 index 00000000..148d2716 --- /dev/null +++ 
b/src/apify/apify_storage_client/dataset_collection_client.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee.base_storage_client.base_dataset_collection_client import BaseDatasetCollectionClient +from crawlee.models import DatasetListPage, DatasetMetadata +from typing_extensions import override + +if TYPE_CHECKING: + from apify_client.clients import DatasetCollectionClientAsync + + +class DatasetCollectionClient(BaseDatasetCollectionClient): + """Dataset collection resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_dataset_collection_client: DatasetCollectionClientAsync) -> None: + self._client = apify_dataset_collection_client + + @override + async def get_or_create( + self, + *, + id: str | None = None, # TODO unused + name: str | None = None, + schema: dict | None = None, + ) -> DatasetMetadata: + return DatasetMetadata.model_validate( + await self._client.get_or_create( + name=name, + schema=schema, + ) + ) + + @override + async def list( + self, + *, + unnamed: bool = False, + limit: int | None = None, + offset: int | None = None, + desc: bool = False, + ) -> DatasetListPage: + return DatasetListPage.model_validate( + await self._client.list( + unnamed=unnamed, + limit=limit, + offset=offset, + desc=desc, + ) + ) diff --git a/src/apify/apify_storage_client/key_value_store_client.py b/src/apify/apify_storage_client/key_value_store_client.py new file mode 100644 index 00000000..073412d9 --- /dev/null +++ b/src/apify/apify_storage_client/key_value_store_client.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from crawlee.base_storage_client.base_key_value_store_client import BaseKeyValueStoreClient +from crawlee.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord +from typing_extensions import override + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + + from apify_client.clients import KeyValueStoreClientAsync + + +class KeyValueStoreClient(BaseKeyValueStoreClient): + """Key-value store resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_key_value_store_client: KeyValueStoreClientAsync) -> None: + self._client = apify_key_value_store_client + + @override + async def get(self) -> KeyValueStoreMetadata | None: + result = await self._client.get() + return KeyValueStoreMetadata.model_validate(result) if result else None + + @override + async def update( + self, + *, + name: str | None = None, + ) -> KeyValueStoreMetadata: + return KeyValueStoreMetadata.model_validate(await self._client.update()) + + @override + async def delete(self) -> None: + await self._client.delete() + + @override + async def list_keys( + self, + *, + limit: int = 1000, + exclusive_start_key: str | None = None, + ) -> KeyValueStoreListKeysPage: + return KeyValueStoreListKeysPage.model_validate(await self._client.list_keys()) + + @override + async def get_record(self, key: str) -> KeyValueStoreRecord | None: + result = await self._client.get_record(key) + return KeyValueStoreRecord.model_validate(result) if result else None + + @override + async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord | None: + result = await self._client.get_record_as_bytes(key) + return KeyValueStoreRecord.model_validate(result) if result else None + + @override + async def stream_record(self, key: str) -> AsyncIterator[KeyValueStoreRecord | None]: # TODO incorrect type + async 
with self._client.stream_record(key) as response: + return KeyValueStoreRecord.model_validate(response) + + @override + async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None: + await self._client.set_record( + key=key, + value=value, + content_type=content_type, + ) + + @override + async def delete_record(self, key: str) -> None: + await self._client.delete_record( + key=key, + ) diff --git a/src/apify/apify_storage_client/key_value_store_collection_client.py b/src/apify/apify_storage_client/key_value_store_collection_client.py new file mode 100644 index 00000000..9b825992 --- /dev/null +++ b/src/apify/apify_storage_client/key_value_store_collection_client.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee.base_storage_client.base_key_value_store_collection_client import BaseKeyValueStoreCollectionClient +from crawlee.models import KeyValueStoreListPage, KeyValueStoreMetadata +from typing_extensions import override + +if TYPE_CHECKING: + from apify_client.clients import KeyValueStoreCollectionClientAsync + + +class KeyValueStoreCollectionClient(BaseKeyValueStoreCollectionClient): + """Key-value store collection resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_dataset_collection_client: KeyValueStoreCollectionClientAsync) -> None: + self._client = apify_dataset_collection_client + + @override + async def get_or_create( + self, + *, + id: str | None = None, # TODO unused + name: str | None = None, + schema: dict | None = None, + ) -> KeyValueStoreMetadata: + return KeyValueStoreMetadata.model_validate( + await self._client.get_or_create( + name=name, + schema=schema, + ) + ) + + @override + async def list( + self, + *, + unnamed: bool = False, + limit: int | None = None, + offset: int | None = None, + desc: bool = False, + ) -> KeyValueStoreListPage: + return KeyValueStoreListPage.model_validate(await self._client.list(unnamed=unnamed, limit=limit, offset=offset, desc=desc)) diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py new file mode 100644 index 00000000..5db8077b --- /dev/null +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee.base_storage_client.base_request_queue_client import BaseRequestQueueClient +from crawlee.models import Request, RequestQueueHead, RequestQueueMetadata, RequestQueueOperationInfo +from typing_extensions import override + +if TYPE_CHECKING: + from apify_client.clients import RequestQueueClientAsync + + +class RequestQueueClient(BaseRequestQueueClient): + """Request queue resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_request_queue_client: RequestQueueClientAsync) -> None: + self._client = apify_request_queue_client + + @override + async def get(self) -> RequestQueueMetadata | None: + result = await self._client.get() + return RequestQueueMetadata.model_validate(result) if result else None + + @override + async def update( + self, + *, + name: str | None = None, + ) -> RequestQueueMetadata: + return RequestQueueMetadata.model_validate( + await self._client.update( + name=name, + ) + ) + + @override + async def delete(self) -> None: + await self._client.delete() + + @override + async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: + return 
RequestQueueHead.model_validate( + self._client.list_head( + limit=limit, + ), + ) + + @override + async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> dict: + return await self._client.list_and_lock_head( + lock_secs=lock_secs, + limit=limit, + ) + + @override + async def add_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> RequestQueueOperationInfo: + return RequestQueueOperationInfo.model_validate( + await self._client.add_request( + request=request.model_dump(by_alias=True), + forefront=forefront, + ) + ) + + @override + async def get_request(self, request_id: str) -> Request | None: + result = await self._client.get_request(request_id) + return Request.model_validate(result) if result else None + + @override + async def update_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> RequestQueueOperationInfo: + return RequestQueueOperationInfo.model_validate( + await self._client.update_request( + request=request.model_dump(by_alias=True), + forefront=forefront, + ) + ) + + @override + async def delete_request(self, request_id: str) -> None: + await self._client.delete_request(request_id) + + @override + async def prolong_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + lock_secs: int, + ) -> dict: + return await self._client.prolong_request_lock( + request_id=request_id, + forefront=forefront, + lock_secs=lock_secs, + ) + + @override + async def delete_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + ) -> None: + await self._client.delete_request_lock( + request_id=request_id, + forefront=forefront, + ) + + @override + async def batch_add_requests( + self, + requests: list[Request], + *, + forefront: bool = False, + ) -> dict: + return await self._client.batch_add_requests( + requests=[r.model_dump(by_alias=True) for r in requests], + forefront=forefront, + ) + + @override + async def batch_delete_requests(self, requests: list[Request]) -> dict: + return await self._client.batch_delete_requests( + requests=[r.model_dump(by_alias=True) for r in requests], + ) + + @override + async def list_requests( + self, + *, + limit: int | None = None, + exclusive_start_id: str | None = None, + ) -> dict: # TODO type + return await self._client.list_requests( + limit=limit, + exclusive_start_id=exclusive_start_id, + ) diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/request_queue_collection_client.py new file mode 100644 index 00000000..a33f6aac --- /dev/null +++ b/src/apify/apify_storage_client/request_queue_collection_client.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee.base_storage_client.base_request_queue_collection_client import BaseRequestQueueCollectionClient +from crawlee.models import RequestQueueListPage, RequestQueueMetadata +from typing_extensions import override + +if TYPE_CHECKING: + from apify_client.clients import RequestQueueCollectionClientAsync + + +class RequestQueueCollectionClient(BaseRequestQueueCollectionClient): + """Request queue collection resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_request_queue_collection_client: RequestQueueCollectionClientAsync) -> None: + self._client = apify_request_queue_collection_client + + @override + async def get_or_create( + self, + *, + id: str | None = None, # TODO unused + name: str | None = None, + schema: dict | None = None, # TODO 
unused + ) -> RequestQueueMetadata: + return RequestQueueMetadata.model_validate( + await self._client.get_or_create( + name=name, + ) + ) + + @override + async def list( + self, + *, + unnamed: bool = False, + limit: int | None = None, + offset: int | None = None, + desc: bool = False, + ) -> RequestQueueListPage: + return RequestQueueListPage.model_validate( + await self._client.list( + unnamed=unnamed, + limit=limit, + offset=offset, + desc=desc, + ) + ) diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index e954ef20..80205cf4 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,11 +1,4 @@ -from .dataset import Dataset -from .key_value_store import KeyValueStore -from .request_queue import RequestQueue -from .storage_client_manager import StorageClientManager - -__all__ = [ - 'Dataset', - 'KeyValueStore', - 'RequestQueue', - 'StorageClientManager', -] +# ruff: noqa: PLC0414 +from crawlee.storages.dataset import Dataset as Dataset +from crawlee.storages.key_value_store import KeyValueStore as KeyValueStore +from crawlee.storages.request_queue import RequestQueue as RequestQueue diff --git a/src/apify/storages/base_storage.py b/src/apify/storages/base_storage.py deleted file mode 100644 index 54697511..00000000 --- a/src/apify/storages/base_storage.py +++ /dev/null @@ -1,181 +0,0 @@ -from __future__ import annotations - -import asyncio -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar, cast - -from apify_shared.utils import ignore_docs - -from apify._memory_storage import MemoryStorageClient -from apify._memory_storage.resource_clients import BaseResourceClient, BaseResourceCollectionClient -from apify.config import Configuration -from apify.storages.storage_client_manager import StorageClientManager - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - -BaseResourceClientType = TypeVar('BaseResourceClientType', bound=BaseResourceClient) -BaseResourceCollectionClientType = TypeVar('BaseResourceCollectionClientType', bound=BaseResourceCollectionClient) - - -@ignore_docs -class BaseStorage(ABC, Generic[BaseResourceClientType, BaseResourceCollectionClientType]): - """A class for managing storages.""" - - _id: str - _name: str | None - _storage_client: ApifyClientAsync | MemoryStorageClient - _config: Configuration - - _cache_by_id: dict | None = None - _cache_by_name: dict | None = None - _storage_creating_lock: asyncio.Lock | None = None - - def __init__( - self: BaseStorage, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Initialize the storage. - - Do not use this method directly, but use `Actor.open_()` instead. 
- - Args: - id (str): The storage id - name (str, optional): The storage name - client (ApifyClientAsync or MemoryStorageClient): The storage client - config (Configuration): The configuration - """ - self._id = id - self._name = name - self._storage_client = client - self._config = config - - @classmethod - @abstractmethod - def _get_human_friendly_label(cls: type[BaseStorage]) -> str: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_default_id(cls: type[BaseStorage], config: Configuration) -> str: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_single_storage_client( - cls: type[BaseStorage], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> BaseResourceClientType: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_storage_collection_client( - cls: type[BaseStorage], - client: ApifyClientAsync | MemoryStorageClient, - ) -> BaseResourceCollectionClientType: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - def _ensure_class_initialized(cls: type[BaseStorage]) -> None: - if cls._cache_by_id is None: - cls._cache_by_id = {} - if cls._cache_by_name is None: - cls._cache_by_name = {} - if cls._storage_creating_lock is None: - cls._storage_creating_lock = asyncio.Lock() - - @classmethod - @abstractmethod - async def open( - cls: type[BaseStorage], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> BaseStorage: - """Open a storage, or return a cached storage object if it was opened before. - - Opens a storage with the given ID or name. - Returns the cached storage object if the storage was opened before. - - Args: - id (str, optional): ID of the storage to be opened. - If neither `id` nor `name` are provided, the method returns the default storage associated with the actor run. - If the storage with the given ID does not exist, it raises an error. - name (str, optional): Name of the storage to be opened. - If neither `id` nor `name` are provided, the method returns the default storage associated with the actor run. - If the storage with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a storage on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - An instance of the storage. 
- """ - cls._ensure_class_initialized() - assert cls._cache_by_id is not None # noqa: S101 - assert cls._cache_by_name is not None # noqa: S101 - assert not (id and name) # noqa: S101 - - used_config = config or Configuration.get_global_configuration() - used_client = StorageClientManager.get_storage_client(force_cloud=force_cloud) - - is_default_storage_on_local = False - # Fetch default ID if no ID or name was passed - if not id and not name: - if isinstance(used_client, MemoryStorageClient): - is_default_storage_on_local = True - id = cls._get_default_id(used_config) # noqa: A001 - - # Try to get the storage instance from cache - cached_storage = None - if id: - cached_storage = cls._cache_by_id.get(id) - elif name: - cached_storage = cls._cache_by_name.get(name) - - if cached_storage is not None: - # This cast is needed since MyPy doesn't understand very well that Self and Storage are the same - return cast(BaseStorage, cached_storage) - - # Purge default storages if configured - if used_config.purge_on_start and isinstance(used_client, MemoryStorageClient): - await used_client._purge_on_start() - - assert cls._storage_creating_lock is not None # noqa: S101 - async with cls._storage_creating_lock: - # Create the storage - if id and not is_default_storage_on_local: - single_storage_client = cls._get_single_storage_client(id, used_client) - storage_info = await single_storage_client.get() - if not storage_info: - storage_label = cls._get_human_friendly_label() - raise RuntimeError(f'{storage_label} with id "{id}" does not exist!') - elif is_default_storage_on_local: - storage_collection_client = cls._get_storage_collection_client(used_client) - storage_info = await storage_collection_client.get_or_create(name=name, _id=id) - else: - storage_collection_client = cls._get_storage_collection_client(used_client) - storage_info = await storage_collection_client.get_or_create(name=name) - - storage = cls(storage_info['id'], storage_info.get('name'), used_client, used_config) - - # Cache by id and name - cls._cache_by_id[storage._id] = storage - if storage._name is not None: - cls._cache_by_name[storage._name] = storage - - return storage - - def _remove_from_cache(self: BaseStorage) -> None: - if self.__class__._cache_by_id is not None: - del self.__class__._cache_by_id[self._id] - - if self._name and self.__class__._cache_by_name is not None: - del self.__class__._cache_by_name[self._name] diff --git a/src/apify/storages/dataset.py b/src/apify/storages/dataset.py deleted file mode 100644 index ce4429c7..00000000 --- a/src/apify/storages/dataset.py +++ /dev/null @@ -1,494 +0,0 @@ -from __future__ import annotations - -import csv -import io -import math -from typing import TYPE_CHECKING, AsyncIterator, Iterable, Iterator - -from apify_shared.utils import ignore_docs, json_dumps - -from apify._utils import wrap_internal -from apify.consts import MAX_PAYLOAD_SIZE_BYTES -from apify.storages.base_storage import BaseStorage -from apify.storages.key_value_store import KeyValueStore - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - from apify_client.clients import DatasetClientAsync, DatasetCollectionClientAsync - from apify_shared.models import ListPage - from apify_shared.types import JSONSerializable - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import DatasetClient, DatasetCollectionClient - from apify.config import Configuration - -# 0.01% -SAFETY_BUFFER_PERCENT = 0.01 / 100 -EFFECTIVE_LIMIT_BYTES = MAX_PAYLOAD_SIZE_BYTES - 
math.ceil(MAX_PAYLOAD_SIZE_BYTES * SAFETY_BUFFER_PERCENT) - - -def _check_and_serialize(item: JSONSerializable, index: int | None = None) -> str: - """Accept a JSON serializable object as an input, validate its serializability and its serialized size against `EFFECTIVE_LIMIT_BYTES`.""" - s = ' ' if index is None else f' at index {index} ' - - try: - payload = json_dumps(item) - except Exception as exc: - raise ValueError(f'Data item{s}is not serializable to JSON.') from exc - - length_bytes = len(payload.encode('utf-8')) - if length_bytes > EFFECTIVE_LIMIT_BYTES: - raise ValueError(f'Data item{s}is too large (size: {length_bytes} bytes, limit: {EFFECTIVE_LIMIT_BYTES} bytes)') - - return payload - - -def _chunk_by_size(items: Iterable[str]) -> Iterator[str]: - """Take an array of JSONs, produce iterator of chunked JSON arrays respecting `EFFECTIVE_LIMIT_BYTES`. - - Takes an array of JSONs (payloads) as input and produces an iterator of JSON strings - where each string is a JSON array of payloads with a maximum size of `EFFECTIVE_LIMIT_BYTES` per one - JSON array. Fits as many payloads as possible into a single JSON array and then moves - on to the next, preserving item order. - - The function assumes that none of the items is larger than `EFFECTIVE_LIMIT_BYTES` and does not validate. - """ - last_chunk_bytes = 2 # Add 2 bytes for [] wrapper. - current_chunk = [] - - for payload in items: - length_bytes = len(payload.encode('utf-8')) - - if last_chunk_bytes + length_bytes <= EFFECTIVE_LIMIT_BYTES: - current_chunk.append(payload) - last_chunk_bytes += length_bytes + 1 # Add 1 byte for ',' separator. - else: - yield f'[{",".join(current_chunk)}]' - current_chunk = [payload] - last_chunk_bytes = length_bytes + 2 # Add 2 bytes for [] wrapper. - - yield f'[{",".join(current_chunk)}]' - - -class Dataset(BaseStorage): - """The `Dataset` class represents a store for structured data where each object stored has the same attributes. - - You can imagine it as a table, where each object is a row and its attributes are columns. - Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records. - Typically it is used to store crawling results. - - Do not instantiate this class directly, use the `Actor.open_dataset()` function instead. - - `Dataset` stores its data either on local disk or in the Apify cloud, - depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. - - If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in - the local directory in the following files: - ``` - {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json - ``` - Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`, - unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable. - Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset. - - If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the - [Apify Dataset](https://docs.apify.com/storage/dataset) cloud storage. - """ - - _id: str - _name: str | None - _dataset_client: DatasetClientAsync | DatasetClient - - @ignore_docs - def __init__( - self: Dataset, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Create a `Dataset` instance. 
- - Do not use the constructor directly, use the `Actor.open_dataset()` function instead. - - Args: - id (str): ID of the dataset. - name (str, optional): Name of the dataset. - client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used. - config (Configuration): The configuration which should be used. - """ - super().__init__(id=id, name=name, client=client, config=config) - - self.get_data = wrap_internal(self._get_data_internal, self.get_data) # type: ignore - self.push_data = wrap_internal(self._push_data_internal, self.push_data) # type: ignore - self.export_to_json = wrap_internal(self._export_to_json_internal, self.export_to_json) # type: ignore - self.export_to_csv = wrap_internal(self._export_to_csv_internal, self.export_to_csv) # type: ignore - - self._dataset_client = client.dataset(self._id) - - @classmethod - def _get_human_friendly_label(cls: type[Dataset]) -> str: - return 'Dataset' - - @classmethod - def _get_default_id(cls: type[Dataset], config: Configuration) -> str: - return config.default_dataset_id - - @classmethod - def _get_single_storage_client( - cls: type[Dataset], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> DatasetClientAsync | DatasetClient: - return client.dataset(id) - - @classmethod - def _get_storage_collection_client( - cls: type[Dataset], - client: ApifyClientAsync | MemoryStorageClient, - ) -> DatasetCollectionClientAsync | DatasetCollectionClient: - return client.datasets() - - @classmethod - async def push_data(cls: type[Dataset], data: JSONSerializable) -> None: - """Store an object or an array of objects to the dataset. - - The size of the data is limited by the receiving API and therefore `push_data()` will only - allow objects whose JSON representation is smaller than 9MB. When an array is passed, - none of the included objects may be larger than 9MB, but the array itself may be of any size. - - Args: - data (JSONSerializable): dict or array of dicts containing data to be stored in the default dataset. - The JSON representation of each item must be smaller than 9MB. - """ - dataset = await cls.open() - return await dataset.push_data(data) - - async def _push_data_internal(self: Dataset, data: JSONSerializable) -> None: - # Handle singular items - if not isinstance(data, list): - payload = _check_and_serialize(data) - return await self._dataset_client.push_items(payload) - - # Handle lists - payloads_generator = (_check_and_serialize(item, index) for index, item in enumerate(data)) - - # Invoke client in series to preserve the order of data - for chunk in _chunk_by_size(payloads_generator): - await self._dataset_client.push_items(chunk) - return None - - @classmethod - async def get_data( - cls: type[Dataset], - *, - offset: int | None = None, - limit: int | None = None, - clean: bool | None = None, - desc: bool | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool | None = None, - skip_hidden: bool | None = None, - flatten: list[str] | None = None, - view: str | None = None, - ) -> ListPage: - """Get items from the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. 
- clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. - Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. - Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - flatten (list of str, optional): A list of fields that should be flattened - view (str, optional): Name of the dataset view to be used - - Returns: - ListPage: A page of the list of dataset items according to the specified filters. - """ - dataset = await cls.open() - return await dataset.get_data( - offset=offset, - limit=limit, - desc=desc, - clean=clean, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - flatten=flatten, - view=view, - ) - - async def _get_data_internal( - self: Dataset, - *, - offset: int | None = None, - limit: int | None = None, - clean: bool | None = None, - desc: bool | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool | None = None, - skip_hidden: bool | None = None, - flatten: list[str] | None = None, - view: str | None = None, - ) -> ListPage: - # TODO: Improve error handling here - # https://github.com/apify/apify-sdk-python/issues/140 - return await self._dataset_client.list_items( - offset=offset, - limit=limit, - desc=desc, - clean=clean, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - flatten=flatten, - view=view, - ) - - async def export_to( - self: Dataset, - key: str, - *, - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - content_type: str | None = None, - ) -> None: - """Save the entirety of the dataset's contents into one file within a key-value store. - - Args: - key (str): The key to save the data under. - to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved. - to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved. - You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments. - If you omit both, it uses the default key-value store. 
- content_type (str, optional): Either 'text/csv' or 'application/json'. Defaults to JSON. - """ - key_value_store = await KeyValueStore.open(id=to_key_value_store_id, name=to_key_value_store_name) - items: list[dict] = [] - limit = 1000 - offset = 0 - while True: - list_items = await self._dataset_client.list_items(limit=limit, offset=offset) - items.extend(list_items.items) - if list_items.total <= offset + list_items.count: - break - offset += list_items.count - - if len(items) == 0: - raise ValueError('Cannot export an empty dataset') - - if content_type == 'text/csv': - output = io.StringIO() - writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL) - writer.writerows([items[0].keys(), *[item.values() for item in items]]) - value = output.getvalue() - return await key_value_store.set_value(key, value, content_type) - - if content_type == 'application/json': - return await key_value_store.set_value(key, items) - - raise ValueError(f'Unsupported content type: {content_type}') - - @classmethod - async def export_to_json( - cls: type[Dataset], - key: str, - *, - from_dataset_id: str | None = None, - from_dataset_name: str | None = None, - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - """Save the entirety of the dataset's contents into one JSON file within a key-value store. - - Args: - key (str): The key to save the data under. - from_dataset_id (str, optional): The ID of the dataset in case of calling the class method. Uses default dataset if omitted. - from_dataset_name (str, optional): The name of the dataset in case of calling the class method. Uses default dataset if omitted. - You must specify only one of `from_dataset_id` and `from_dataset_name` arguments. - If you omit both, it uses the default dataset. - to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved. - to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved. - You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments. - If you omit both, it uses the default key-value store. - """ - dataset = await cls.open(id=from_dataset_id, name=from_dataset_name) - await dataset.export_to_json(key, to_key_value_store_id=to_key_value_store_id, to_key_value_store_name=to_key_value_store_name) - - async def _export_to_json_internal( - self: Dataset, - key: str, - *, - from_dataset_id: str | None = None, # noqa: ARG002 - from_dataset_name: str | None = None, # noqa: ARG002 - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - await self.export_to( - key, - to_key_value_store_id=to_key_value_store_id, - to_key_value_store_name=to_key_value_store_name, - content_type='application/json', - ) - - @classmethod - async def export_to_csv( - cls: type[Dataset], - key: str, - *, - from_dataset_id: str | None = None, - from_dataset_name: str | None = None, - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - """Save the entirety of the dataset's contents into one CSV file within a key-value store. - - Args: - key (str): The key to save the data under. - from_dataset_id (str, optional): The ID of the dataset in case of calling the class method. Uses default dataset if omitted. - from_dataset_name (str, optional): The name of the dataset in case of calling the class method. Uses default dataset if omitted. 
- You must specify only one of `from_dataset_id` and `from_dataset_name` arguments. - If you omit both, it uses the default dataset. - to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved. - to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved. - You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments. - If you omit both, it uses the default key-value store. - """ - dataset = await cls.open(id=from_dataset_id, name=from_dataset_name) - await dataset.export_to_csv(key, to_key_value_store_id=to_key_value_store_id, to_key_value_store_name=to_key_value_store_name) - - async def _export_to_csv_internal( - self: Dataset, - key: str, - *, - from_dataset_id: str | None = None, # noqa: ARG002 - from_dataset_name: str | None = None, # noqa: ARG002 - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - await self.export_to( - key, - to_key_value_store_id=to_key_value_store_id, - to_key_value_store_name=to_key_value_store_name, - content_type='text/csv', - ) - - async def get_info(self: Dataset) -> dict | None: - """Get an object containing general information about the dataset. - - Returns: - dict: Object returned by calling the GET dataset API endpoint. - """ - return await self._dataset_client.get() - - def iterate_items( - self: Dataset, - *, - offset: int = 0, - limit: int | None = None, - clean: bool | None = None, - desc: bool | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool | None = None, - skip_hidden: bool | None = None, - ) -> AsyncIterator[dict]: - """Iterate over the items in the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. - clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. - Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. 
- Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - - Yields: - dict: An item from the dataset - """ - return self._dataset_client.iterate_items( - offset=offset, - limit=limit, - clean=clean, - desc=desc, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - ) - - async def drop(self: Dataset) -> None: - """Remove the dataset either from the Apify cloud storage or from the local directory.""" - await self._dataset_client.delete() - self._remove_from_cache() - - @classmethod - async def open( - cls: type[Dataset], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> Dataset: - """Open a dataset. - - Datasets are used to store structured data where each object stored has the same attributes, - such as online store products or real estate offers. - The actual data is stored either on the local filesystem or in the Apify cloud. - - Args: - id (str, optional): ID of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. - If the dataset with the given ID does not exist, it raises an error. - name (str, optional): Name of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. - If the dataset with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a dataset on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - Dataset: An instance of the `Dataset` class for the given ID or name. - """ - return await super().open(id=id, name=name, force_cloud=force_cloud, config=config) # type: ignore diff --git a/src/apify/storages/key_value_store.py b/src/apify/storages/key_value_store.py deleted file mode 100644 index 71d843ae..00000000 --- a/src/apify/storages/key_value_store.py +++ /dev/null @@ -1,257 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, AsyncIterator, NamedTuple, TypedDict, TypeVar, overload - -from apify_client.clients import KeyValueStoreClientAsync, KeyValueStoreCollectionClientAsync -from apify_shared.utils import ignore_docs - -from apify._utils import wrap_internal -from apify.storages.base_storage import BaseStorage - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import KeyValueStoreClient, KeyValueStoreCollectionClient - from apify.config import Configuration - - -T = TypeVar('T') - - -class IterateKeysInfo(TypedDict): - """Contains information about a key-value store record.""" - - size: int - - -class IterateKeysTuple(NamedTuple): - """A tuple representing a key-value store record.""" - - key: str - info: IterateKeysInfo - - -class KeyValueStore(BaseStorage): - """The `KeyValueStore` class represents a key-value store. - - You can imagine it as a simple data storage that is used - for saving and reading data records or files. Each data record is - represented by a unique key and associated with a MIME content type. 
- - Do not instantiate this class directly, use the `Actor.open_key_value_store()` function instead. - - Each crawler run is associated with a default key-value store, which is created exclusively - for the run. By convention, the crawler input and output are stored into the - default key-value store under the `INPUT` and `OUTPUT` key, respectively. - Typically, input and output are JSON files, although it can be any other format. - To access the default key-value store directly, you can use the - `KeyValueStore.get_value` and `KeyValueStore.set_value` convenience functions. - - `KeyValueStore` stores its data either on local disk or in the Apify cloud, - depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. - - If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in - the local directory in the following files: - ``` - {APIFY_LOCAL_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT} - ``` - Note that `{STORE_ID}` is the name or ID of the key-value store. The default key-value store has ID: `default`, - unless you override it by setting the `APIFY_DEFAULT_KEY_VALUE_STORE_ID` environment variable. - The `{KEY}` is the key of the record and `{EXT}` corresponds to the MIME content type of the data value. - - If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the - [Apify Key-value store](https://docs.apify.com/storage/key-value-store) cloud storage. - """ - - _id: str - _name: str | None - _key_value_store_client: KeyValueStoreClientAsync | KeyValueStoreClient - - @ignore_docs - def __init__( - self: KeyValueStore, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Create a `KeyValueStore` instance. - - Do not use the constructor directly, use the `Actor.open_key_value_store()` function instead. - - Args: - id (str): ID of the key-value store. - name (str, optional): Name of the key-value store. - client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used. - config (Configuration): The configuration which should be used. - """ - super().__init__(id=id, name=name, client=client, config=config) - - self.get_value = wrap_internal(self._get_value_internal, self.get_value) # type: ignore - self.set_value = wrap_internal(self._set_value_internal, self.set_value) # type: ignore - self.get_public_url = wrap_internal(self._get_public_url_internal, self.get_public_url) # type: ignore - self._id = id - self._name = name - self._key_value_store_client = client.key_value_store(self._id) - - @classmethod - def _get_human_friendly_label(cls: type[KeyValueStore]) -> str: - return 'Key-value store' - - @classmethod - def _get_default_id(cls: type[KeyValueStore], config: Configuration) -> str: - return config.default_key_value_store_id - - @classmethod - def _get_single_storage_client( - cls: type[KeyValueStore], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> KeyValueStoreClientAsync | KeyValueStoreClient: - return client.key_value_store(id) - - @classmethod - def _get_storage_collection_client( - cls: type[KeyValueStore], - client: ApifyClientAsync | MemoryStorageClient, - ) -> KeyValueStoreCollectionClientAsync | KeyValueStoreCollectionClient: - return client.key_value_stores() - - @overload - @classmethod - async def get_value(cls: type[KeyValueStore], key: str) -> Any: - ... 
- - @overload - @classmethod - async def get_value(cls: type[KeyValueStore], key: str, default_value: T) -> T: - ... - - @overload - @classmethod - async def get_value(cls: type[KeyValueStore], key: str, default_value: T | None = None) -> T | None: - ... - - @classmethod - async def get_value(cls: type[KeyValueStore], key: str, default_value: T | None = None) -> T | None: - """Get a value from the key-value store. - - Args: - key (str): Key of the record to retrieve. - default_value (Any, optional): Default value returned in case the record does not exist. - - Returns: - Any: The value associated with the given key. `default_value` is used in case the record does not exist. - """ - store = await cls.open() - return await store.get_value(key, default_value) - - async def _get_value_internal(self: KeyValueStore, key: str, default_value: T | None = None) -> T | None: - record = await self._key_value_store_client.get_record(key) - return record['value'] if record else default_value - - async def iterate_keys( - self: KeyValueStore, - exclusive_start_key: str | None = None, - ) -> AsyncIterator[IterateKeysTuple]: - """Iterate over the keys in the key-value store. - - Args: - exclusive_start_key (str, optional): All keys up to this one (including) are skipped from the result. - - Yields: - IterateKeysTuple: A tuple `(key, info)`, - where `key` is the record key, and `info` is an object that contains a single property `size` - indicating size of the record in bytes. - """ - while True: - list_keys = await self._key_value_store_client.list_keys(exclusive_start_key=exclusive_start_key) - for item in list_keys['items']: - yield IterateKeysTuple(item['key'], {'size': item['size']}) - - if not list_keys['isTruncated']: - break - exclusive_start_key = list_keys['nextExclusiveStartKey'] - - @classmethod - async def set_value( - cls: type[KeyValueStore], - key: str, - value: Any, - content_type: str | None = None, - ) -> None: - """Set or delete a value in the key-value store. - - Args: - key (str): The key under which the value should be saved. - value (Any): The value to save. If the value is `None`, the corresponding key-value pair will be deleted. - content_type (str, optional): The content type of the saved value. - """ - store = await cls.open() - return await store.set_value(key, value, content_type) - - async def _set_value_internal( - self: KeyValueStore, - key: str, - value: Any, - content_type: str | None = None, - ) -> None: - if value is None: - return await self._key_value_store_client.delete_record(key) - - return await self._key_value_store_client.set_record(key, value, content_type) - - @classmethod - async def get_public_url(cls: type[KeyValueStore], key: str) -> str: - """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. - - Args: - key (str): The key for which the URL should be generated. 
- """ - store = await cls.open() - return await store.get_public_url(key) - - async def _get_public_url_internal(self: KeyValueStore, key: str) -> str: - if not isinstance(self._key_value_store_client, KeyValueStoreClientAsync): - raise RuntimeError('Cannot generate a public URL for this key-value store as it is not on the Apify Platform!') # noqa: TRY004 - - public_api_url = self._config.api_public_base_url - - return f'{public_api_url}/v2/key-value-stores/{self._id}/records/{key}' - - async def drop(self: KeyValueStore) -> None: - """Remove the key-value store either from the Apify cloud storage or from the local directory.""" - await self._key_value_store_client.delete() - self._remove_from_cache() - - @classmethod - async def open( - cls: type[KeyValueStore], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> KeyValueStore: - """Open a key-value store. - - Key-value stores are used to store records or files, along with their MIME content type. - The records are stored and retrieved using a unique key. - The actual data is stored either on a local filesystem or in the Apify cloud. - - Args: - id (str, optional): ID of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. - If the key-value store with the given ID does not exist, it raises an error. - name (str, optional): Name of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. - If the key-value store with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a key-value store on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - KeyValueStore: An instance of the `KeyValueStore` class for the given ID or name. - """ - return await super().open(id=id, name=name, force_cloud=force_cloud, config=config) # type: ignore diff --git a/src/apify/storages/request_queue.py b/src/apify/storages/request_queue.py deleted file mode 100644 index 79d64b5e..00000000 --- a/src/apify/storages/request_queue.py +++ /dev/null @@ -1,602 +0,0 @@ -from __future__ import annotations - -import asyncio -from collections import OrderedDict -from datetime import datetime, timezone -from typing import TYPE_CHECKING -from typing import OrderedDict as OrderedDictType - -from apify_shared.utils import ignore_docs - -from apify._crypto import crypto_random_object_id -from apify._utils import LRUCache, budget_ow, compute_unique_key, unique_key_to_request_id -from apify.consts import REQUEST_QUEUE_HEAD_MAX_LIMIT -from apify.log import logger -from apify.storages.base_storage import BaseStorage - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - from apify_client.clients import RequestQueueClientAsync, RequestQueueCollectionClientAsync - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import RequestQueueClient, RequestQueueCollectionClient - from apify.config import Configuration - - -MAX_CACHED_REQUESTS = 1_000_000 - -# When requesting queue head we always fetch requestsInProgressCount * QUERY_HEAD_BUFFER number of requests. 
-QUERY_HEAD_MIN_LENGTH = 100 - -QUERY_HEAD_BUFFER = 3 - -# If queue was modified (request added/updated/deleted) before more than API_PROCESSED_REQUESTS_DELAY_MILLIS -# then we assume the get head operation to be consistent. -API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000 - -# How many times we try to get queue head with queueModifiedAt older than API_PROCESSED_REQUESTS_DELAY_MILLIS. -MAX_QUERIES_FOR_CONSISTENCY = 6 - -# This number must be large enough so that processing of all these requests cannot be done in -# a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory. -RECENTLY_HANDLED_CACHE_SIZE = 1000 - -# Indicates how long it usually takes for the underlying storage to propagate all writes -# to be available to subsequent reads. -STORAGE_CONSISTENCY_DELAY_MILLIS = 3000 - - -class RequestQueue(BaseStorage): - """Represents a queue of URLs to crawl. - - Can be used for deep crawling of websites where you start with several URLs and then recursively - follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders. - - Each URL is represented using an instance of the {@apilink Request} class. - The queue can only contain unique URLs. More precisely, it can only contain request dictionaries - with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden. - To add a single URL multiple times to the queue, - corresponding request dictionary will need to have different `uniqueKey` properties. - - Do not instantiate this class directly, use the `Actor.open_request_queue()` function instead. - - `RequestQueue` stores its data either on local disk or in the Apify cloud, - depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. - - If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in - the local directory in the following files: - ``` - {APIFY_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json - ``` - Note that `{QUEUE_ID}` is the name or ID of the request queue. The default request queue has ID: `default`, - unless you override it by setting the `APIFY_DEFAULT_REQUEST_QUEUE_ID` environment variable. - The `{REQUEST_ID}` is the id of the request. - - If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the - [Apify Request Queue](https://docs.apify.com/storage/request-queue) - cloud storage. - """ - - _request_queue_client: RequestQueueClientAsync | RequestQueueClient - _client_key = crypto_random_object_id() - _queue_head_dict: OrderedDictType[str, str] - _query_queue_head_task: asyncio.Task | None - _in_progress: set[str] - _last_activity: datetime - _internal_timeout_seconds = 5 * 60 - _recently_handled: LRUCache[bool] - _assumed_total_count = 0 - _assumed_handled_count = 0 - _requests_cache: LRUCache[dict] - - @ignore_docs - def __init__( - self: RequestQueue, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Create a `RequestQueue` instance. - - Do not use the constructor directly, use the `Actor.open_request_queue()` function instead. - - Args: - id (str): ID of the request queue. - name (str, optional): Name of the request queue. - client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used. - config (Configuration): The configuration which should be used. 
- """ - super().__init__(id=id, name=name, client=client, config=config) - - self._request_queue_client = client.request_queue(self._id, client_key=self._client_key) - self._queue_head_dict = OrderedDict() - self._query_queue_head_task = None - self._in_progress = set() - self._last_activity = datetime.now(timezone.utc) - self._recently_handled = LRUCache[bool](max_length=RECENTLY_HANDLED_CACHE_SIZE) - self._requests_cache = LRUCache(max_length=MAX_CACHED_REQUESTS) - - @classmethod - def _get_human_friendly_label(cls: type[RequestQueue]) -> str: - return 'Request queue' - - @classmethod - def _get_default_id(cls: type[RequestQueue], config: Configuration) -> str: - return config.default_request_queue_id - - @classmethod - def _get_single_storage_client( - cls: type[RequestQueue], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> RequestQueueClientAsync | RequestQueueClient: - return client.request_queue(id) - - @classmethod - def _get_storage_collection_client( - cls: type[RequestQueue], - client: ApifyClientAsync | MemoryStorageClient, - ) -> RequestQueueCollectionClientAsync | RequestQueueCollectionClient: - return client.request_queues() - - async def add_request( - self: RequestQueue, - request: dict, - *, - forefront: bool = False, - keep_url_fragment: bool = False, - use_extended_unique_key: bool = False, - ) -> dict: - """Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue. - - The deduplication of requests relies on the `uniqueKey` field within the request dictionary. If `uniqueKey` - exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`, - and `payload` fields. The generation of `uniqueKey` can be influenced by the `keep_url_fragment` and - `use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method - and payload, respectively, in its computation. - - The request can be added to the forefront (beginning) or the back of the queue based on the `forefront` - parameter. Information about the request's addition to the queue, including whether it was already present or - handled, is returned in an output dictionary. - - Args: - request: The request object to be added to the queue. Must include at least the `url` key. - Optionaly it can include the `method`, `payload` and `uniqueKey` keys. - - forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end. - - keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained - in the unique key computation. - - use_extended_unique_key: Determines whether to use an extended unique key, incorporating the request's - method and payload into the unique key computation. - - Returns: A dictionary containing information about the operation, including: - - `requestId` (str): The ID of the request. - - `uniqueKey` (str): The unique key associated with the request. - - `wasAlreadyPresent` (bool): Indicates whether the request was already in the queue. - - `wasAlreadyHandled` (bool): Indicates whether the request was already processed. 
- """ - budget_ow( - request, - { - 'url': (str, True), - }, - ) - self._last_activity = datetime.now(timezone.utc) - - if request.get('uniqueKey') is None: - request['uniqueKey'] = compute_unique_key( - url=request['url'], - method=request.get('method', 'GET'), - payload=request.get('payload'), - keep_url_fragment=keep_url_fragment, - use_extended_unique_key=use_extended_unique_key, - ) - - cache_key = unique_key_to_request_id(request['uniqueKey']) - cached_info = self._requests_cache.get(cache_key) - - if cached_info: - request['id'] = cached_info['id'] - return { - 'wasAlreadyPresent': True, - # We may assume that if request is in local cache then also the information if the - # request was already handled is there because just one client should be using one queue. - 'wasAlreadyHandled': cached_info['isHandled'], - 'requestId': cached_info['id'], - 'uniqueKey': cached_info['uniqueKey'], - } - - queue_operation_info = await self._request_queue_client.add_request(request, forefront=forefront) - queue_operation_info['uniqueKey'] = request['uniqueKey'] - - self._cache_request(cache_key, queue_operation_info) - - request_id, was_already_present = queue_operation_info['requestId'], queue_operation_info['wasAlreadyPresent'] - is_handled = request.get('handledAt') is not None - if not is_handled and not was_already_present and request_id not in self._in_progress and self._recently_handled.get(request_id) is None: - self._assumed_total_count += 1 - - self._maybe_add_request_to_queue_head(request_id, forefront) - - return queue_operation_info - - async def get_request(self: RequestQueue, request_id: str) -> dict | None: - """Retrieve a request from the queue. - - Args: - request_id (str): ID of the request to retrieve. - - Returns: - dict, optional: The retrieved request, or `None`, if it does not exist. - """ - budget_ow(request_id, (str, True), 'request_id') - return await self._request_queue_client.get_request(request_id) - - async def fetch_next_request(self: RequestQueue) -> dict | None: - """Return the next request in the queue to be processed. - - Once you successfully finish processing of the request, you need to call - `RequestQueue.mark_request_as_handled` to mark the request as handled in the queue. - If there was some error in processing the request, call `RequestQueue.reclaim_request` instead, - so that the queue will give the request to some other consumer in another call to the `fetch_next_request` method. - - Note that the `None` return value does not mean the queue processing finished, it means there are currently no pending requests. - To check whether all requests in queue were finished, use `RequestQueue.is_finished` instead. - - Returns: - dict, optional: The request or `None` if there are no more pending requests. - """ - await self._ensure_head_is_non_empty() - - # We are likely done at this point. - if len(self._queue_head_dict) == 0: - return None - - next_request_id, _ = self._queue_head_dict.popitem(last=False) # ~removeFirst() - - # This should never happen, but... 
- if next_request_id in self._in_progress or self._recently_handled.get(next_request_id): - logger.warning( - 'Queue head returned a request that is already in progress?!', - extra={ - 'nextRequestId': next_request_id, - 'inProgress': next_request_id in self._in_progress, - 'recentlyHandled': next_request_id in self._recently_handled, - }, - ) - return None - self._in_progress.add(next_request_id) - self._last_activity = datetime.now(timezone.utc) - - try: - request = await self.get_request(next_request_id) - except Exception: - # On error, remove the request from in progress, otherwise it would be there forever - self._in_progress.remove(next_request_id) - raise - - # NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations: - - """ 1) Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null). - In this case, keep the request marked as in progress for a short while, - so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request - into the queueHeadDict straight again. After the interval expires, fetchNextRequest() - will try to fetch this request again, until it eventually appears in the main table. - """ - if request is None: - logger.debug('Cannot find a request from the beginning of queue, will be retried later', extra={'nextRequestId': next_request_id}) - asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, lambda: self._in_progress.remove(next_request_id)) - return None - - """ 2) Queue head index is behind the main table and the underlying request was already handled - (by some other client, since we keep the track of handled requests in recentlyHandled dictionary). - We just add the request to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty() - will not put the request again to queueHeadDict. - """ - if request.get('handledAt') is not None: - logger.debug('Request fetched from the beginning of queue was already handled', extra={'nextRequestId': next_request_id}) - self._recently_handled[next_request_id] = True - return None - - return request - - async def mark_request_as_handled(self: RequestQueue, request: dict) -> dict | None: - """Mark a request as handled after successful processing. - - Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method. - - Args: - request (dict): The request to mark as handled. - - Returns: - dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`. - `None` if the given request was not in progress. 
- """ - budget_ow( - request, - { - 'id': (str, True), - 'uniqueKey': (str, True), - 'handledAt': (datetime, False), - }, - ) - self._last_activity = datetime.now(timezone.utc) - if request['id'] not in self._in_progress: - logger.debug('Cannot mark request as handled, because it is not in progress!', extra={'requestId': request['id']}) - return None - - request['handledAt'] = request.get('handledAt', datetime.now(timezone.utc)) - queue_operation_info = await self._request_queue_client.update_request({**request}) - queue_operation_info['uniqueKey'] = request['uniqueKey'] - - self._in_progress.remove(request['id']) - self._recently_handled[request['id']] = True - - if not queue_operation_info['wasAlreadyHandled']: - self._assumed_handled_count += 1 - - self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info) - - return queue_operation_info - - async def reclaim_request( - self: RequestQueue, - request: dict, - forefront: bool = False, # noqa: FBT001, FBT002 - ) -> dict | None: - """Reclaim a failed request back to the queue. - - The request will be returned for processing later again - by another call to `RequestQueue.fetchNextRequest`. - - Args: - request (dict): The request to return to the queue. - forefront (bool, optional): Whether to add the request to the head or the end of the queue - Returns: - dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`. - `None` if the given request was not in progress. - """ - budget_ow( - request, - { - 'id': (str, True), - 'uniqueKey': (str, True), - }, - ) - self._last_activity = datetime.now(timezone.utc) - - if request['id'] not in self._in_progress: - logger.debug('Cannot reclaim request, because it is not in progress!', extra={'requestId': request['id']}) - return None - - # TODO: If request hasn't been changed since the last getRequest(), we don't need to call updateRequest() - # and thus improve performance. - # https://github.com/apify/apify-sdk-python/issues/143 - queue_operation_info = await self._request_queue_client.update_request(request, forefront=forefront) - queue_operation_info['uniqueKey'] = request['uniqueKey'] - self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info) - - # Wait a little to increase a chance that the next call to fetchNextRequest() will return the request with updated data. - # This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads. - def callback() -> None: - if request['id'] not in self._in_progress: - logger.debug('The request is no longer marked as in progress in the queue?!', {'requestId': request['id']}) - return - - self._in_progress.remove(request['id']) - - # Performance optimization: add request straight to head if possible - self._maybe_add_request_to_queue_head(request['id'], forefront) - - asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, callback) - - return queue_operation_info - - def _in_progress_count(self: RequestQueue) -> int: - return len(self._in_progress) - - async def is_empty(self: RequestQueue) -> bool: - """Check whether the queue is empty. - - Returns: - bool: `True` if the next call to `RequestQueue.fetchNextRequest` would return `None`, otherwise `False`. - """ - await self._ensure_head_is_non_empty() - return len(self._queue_head_dict) == 0 - - async def is_finished(self: RequestQueue) -> bool: - """Check whether the queue is finished. 
- - Due to the nature of distributed storage used by the queue, - the function might occasionally return a false negative, - but it will never return a false positive. - - Returns: - bool: `True` if all requests were already handled and there are no more left. `False` otherwise. - """ - seconds_since_last_activity = (datetime.now(timezone.utc) - self._last_activity).seconds - if self._in_progress_count() > 0 and seconds_since_last_activity > self._internal_timeout_seconds: - message = f'The request queue seems to be stuck for {self._internal_timeout_seconds}s, resetting internal state.' - logger.warning(message) - self._reset() - - if len(self._queue_head_dict) > 0 or self._in_progress_count() > 0: - return False - - is_head_consistent = await self._ensure_head_is_non_empty(ensure_consistency=True) - return is_head_consistent and len(self._queue_head_dict) == 0 and self._in_progress_count() == 0 - - def _reset(self: RequestQueue) -> None: - self._queue_head_dict.clear() - self._query_queue_head_task = None - self._in_progress.clear() - self._recently_handled.clear() - self._assumed_total_count = 0 - self._assumed_handled_count = 0 - self._requests_cache.clear() - self._last_activity = datetime.now(timezone.utc) - - def _cache_request(self: RequestQueue, cache_key: str, queue_operation_info: dict) -> None: - self._requests_cache[cache_key] = { - 'id': queue_operation_info['requestId'], - 'isHandled': queue_operation_info['wasAlreadyHandled'], - 'uniqueKey': queue_operation_info['uniqueKey'], - 'wasAlreadyHandled': queue_operation_info['wasAlreadyHandled'], - } - - async def _queue_query_head(self: RequestQueue, limit: int) -> dict: - query_started_at = datetime.now(timezone.utc) - - list_head = await self._request_queue_client.list_head(limit=limit) - for request in list_head['items']: - # Queue head index might be behind the main table, so ensure we don't recycle requests - if not request['id'] or not request['uniqueKey'] or request['id'] in self._in_progress or self._recently_handled.get(request['id']): - continue - self._queue_head_dict[request['id']] = request['id'] - self._cache_request( - unique_key_to_request_id(request['uniqueKey']), - { - 'requestId': request['id'], - 'wasAlreadyHandled': False, - 'wasAlreadyPresent': True, - 'uniqueKey': request['uniqueKey'], - }, - ) - - # This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again. - self._query_queue_head_task = None - - return { - 'wasLimitReached': len(list_head['items']) >= limit, - 'prevLimit': limit, - 'queueModifiedAt': list_head['queueModifiedAt'], - 'queryStartedAt': query_started_at, - 'hadMultipleClients': list_head['hadMultipleClients'], - } - - async def _ensure_head_is_non_empty( - self: RequestQueue, - ensure_consistency: bool = False, # noqa: FBT001, FBT002 - limit: int | None = None, - iteration: int = 0, - ) -> bool: - # If is nonempty resolve immediately. - if len(self._queue_head_dict) > 0: - return True - - if limit is None: - limit = max(self._in_progress_count() * QUERY_HEAD_BUFFER, QUERY_HEAD_MIN_LENGTH) - - if self._query_queue_head_task is None: - self._query_queue_head_task = asyncio.Task(self._queue_query_head(limit)) - - queue_head = await self._query_queue_head_task - - # TODO: I feel this code below can be greatly simplified... 
(comes from TS implementation *wink*) - # https://github.com/apify/apify-sdk-python/issues/142 - - # If queue is still empty then one of the following holds: - # - the other calls waiting for this task already consumed all the returned requests - # - the limit was too low and contained only requests in progress - # - the writes from other clients were not propagated yet - # - the whole queue was processed and we are done - - # If limit was not reached in the call then there are no more requests to be returned. - if queue_head['prevLimit'] >= REQUEST_QUEUE_HEAD_MAX_LIMIT: - logger.warning('Reached the maximum number of requests in progress', extra={'limit': REQUEST_QUEUE_HEAD_MAX_LIMIT}) - - should_repeat_with_higher_limit = ( - len(self._queue_head_dict) == 0 and queue_head['wasLimitReached'] and queue_head['prevLimit'] < REQUEST_QUEUE_HEAD_MAX_LIMIT - ) - - # If ensureConsistency=true then we must ensure that either: - # - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS - # - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount - is_database_consistent = (queue_head['queryStartedAt'] - queue_head['queueModifiedAt'].replace(tzinfo=timezone.utc)).seconds >= ( - API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000 - ) - is_locally_consistent = not queue_head['hadMultipleClients'] and self._assumed_total_count <= self._assumed_handled_count - # Consistent information from one source is enough to consider request queue finished. - should_repeat_for_consistency = ensure_consistency and not is_database_consistent and not is_locally_consistent - - # If both are false then head is consistent and we may exit. - if not should_repeat_with_higher_limit and not should_repeat_for_consistency: - return True - - # If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY. - # If this is reached then we return false so that empty() and finished() returns possibly false negative. - if not should_repeat_with_higher_limit and iteration > MAX_QUERIES_FOR_CONSISTENCY: - return False - - next_limit = round(queue_head['prevLimit'] * 1.5) if should_repeat_with_higher_limit else queue_head['prevLimit'] - - # If we are repeating for consistency then wait required time. - if should_repeat_for_consistency: - delay_seconds = (API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000) - (datetime.now(timezone.utc) - queue_head['queueModifiedAt']).seconds - logger.info(f'Waiting for {delay_seconds}s before considering the queue as finished to ensure that the data is consistent.') - await asyncio.sleep(delay_seconds) - - return await self._ensure_head_is_non_empty(ensure_consistency, next_limit, iteration + 1) - - def _maybe_add_request_to_queue_head( - self: RequestQueue, - request_id: str, - forefront: bool, # noqa: FBT001 - ) -> None: - if forefront: - self._queue_head_dict[request_id] = request_id - # Move to start, i.e. forefront of the queue - self._queue_head_dict.move_to_end(request_id, last=False) - elif self._assumed_total_count < QUERY_HEAD_MIN_LENGTH: - # OrderedDict puts the item to the end of the queue by default - self._queue_head_dict[request_id] = request_id - - async def drop(self: RequestQueue) -> None: - """Remove the request queue either from the Apify cloud storage or from the local directory.""" - await self._request_queue_client.delete() - self._remove_from_cache() - - async def get_info(self: RequestQueue) -> dict | None: - """Get an object containing general information about the request queue. 
- - Returns: - dict: Object returned by calling the GET request queue API endpoint. - """ - return await self._request_queue_client.get() - - @classmethod - async def open( - cls: type[RequestQueue], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> RequestQueue: - """Open a request queue. - - Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in the Apify cloud. - The queue is used for deep crawling of websites, where you start with several URLs and then - recursively follow links to other pages. The data structure supports both breadth-first - and depth-first crawling orders. - - Args: - id (str, optional): ID of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. - If the request queue with the given ID does not exist, it raises an error. - name (str, optional): Name of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. - If the request queue with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a request queue on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - RequestQueue: An instance of the `RequestQueue` class for the given ID or name. - """ - queue = await super().open(id=id, name=name, force_cloud=force_cloud, config=config) - await queue._ensure_head_is_non_empty() # type: ignore - return queue # type: ignore diff --git a/src/apify/storages/storage_client_manager.py b/src/apify/storages/storage_client_manager.py deleted file mode 100644 index 52207248..00000000 --- a/src/apify/storages/storage_client_manager.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage import MemoryStorageClient -from apify.config import Configuration - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - - -@ignore_docs -class StorageClientManager: - """A class for managing storage clients.""" - - _config: Configuration - - _local_client: MemoryStorageClient | None = None - _cloud_client: ApifyClientAsync | None = None - - _default_instance: StorageClientManager | None = None - - def __init__(self: StorageClientManager) -> None: - """Create a `StorageClientManager` instance.""" - self._config = Configuration.get_global_configuration() - - @classmethod - def set_config(cls: type[StorageClientManager], config: Configuration) -> None: - """Set the config for the StorageClientManager. - - Args: - config (Configuration): The configuration this StorageClientManager should use. - """ - cls._get_default_instance()._config = config - - @classmethod - def get_storage_client( - cls: type[StorageClientManager], - force_cloud: bool = False, # noqa: FBT001, FBT002 - ) -> ApifyClientAsync | MemoryStorageClient: - """Get the current storage client instance. - - Returns: - ApifyClientAsync or MemoryStorageClient: The current storage client instance. 
- """ - default_instance = cls._get_default_instance() - if not default_instance._local_client: - default_instance._local_client = MemoryStorageClient(persist_storage=default_instance._config.persist_storage, write_metadata=True) - - if default_instance._config.is_at_home or force_cloud: - assert default_instance._cloud_client is not None # noqa: S101 - return default_instance._cloud_client - - return default_instance._local_client - - @classmethod - def set_cloud_client(cls: type[StorageClientManager], client: ApifyClientAsync) -> None: - """Set the storage client. - - Args: - client (ApifyClientAsync or MemoryStorageClient): The instance of a storage client. - """ - cls._get_default_instance()._cloud_client = client - - @classmethod - def _get_default_instance(cls: type[StorageClientManager]) -> StorageClientManager: - if cls._default_instance is None: - cls._default_instance = cls() - - return cls._default_instance diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c68d441a..82102bc6 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -133,8 +133,7 @@ def __call__( main_func: Callable | None = None, main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, - ) -> Awaitable[ActorClientAsync]: - ... + ) -> Awaitable[ActorClientAsync]: ... @pytest.fixture() diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store.py b/tests/unit/memory_storage/resource_clients/test_key_value_store.py index 86c8e224..848073a0 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store.py @@ -20,7 +20,7 @@ from apify._memory_storage.resource_clients import KeyValueStoreClient TINY_PNG = base64.b64decode('iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVQYV2NgYAAAAAMAAWgmWQ0AAAAASUVORK5CYII=') -TINY_BYTES = b'\x12\x34\x56\x78\x90\xAB\xCD\xEF' +TINY_BYTES = b'\x12\x34\x56\x78\x90\xab\xcd\xef' TINY_DATA = {'a': 'b'} TINY_TEXT = 'abcd' diff --git a/website/generate_module_shortcuts.py b/website/generate_module_shortcuts.py index 1e245cc1..f671ea9e 100755 --- a/website/generate_module_shortcuts.py +++ b/website/generate_module_shortcuts.py @@ -19,7 +19,7 @@ def get_module_shortcuts(module, parent_classes=None): shortcuts[f'{module.__name__}.{classname}'] = f'{parent_module_name}.{classname}' for _, submodule in inspect.getmembers(module, inspect.ismodule): - if (submodule.__name__.startswith('apify')): + if submodule.__name__.startswith('apify'): shortcuts.update(get_module_shortcuts(submodule, module_classes)) return shortcuts From fd848e9c81abd8a98a9001a0536d1c204f00952e Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 5 Jun 2024 10:54:07 +0200 Subject: [PATCH 03/68] Consolidate config files --- mypy.ini | 22 ---------------------- pyproject.toml | 27 ++++++++++++++++++++++++++- pytest.ini | 3 --- 3 files changed, 26 insertions(+), 26 deletions(-) delete mode 100644 mypy.ini delete mode 100644 pytest.ini diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index df472b2f..00000000 --- a/mypy.ini +++ /dev/null @@ -1,22 +0,0 @@ -[mypy] -python_version = 3.8 -files = - scripts, - src, - tests -check_untyped_defs = True -disallow_incomplete_defs = True -disallow_untyped_calls = True -disallow_untyped_decorators = True -disallow_untyped_defs = True -no_implicit_optional = True -warn_redundant_casts = True -warn_return_any = True -warn_unreachable = True -warn_unused_ignores = True - -[mypy-scrapy.*] 
-ignore_missing_imports = True - -[mypy-sortedcollections.*] -ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index 307174de..5a4f9a19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", - "crawlee >= 0.0.5", + "crawlee >= 0.0.5b3", "cryptography >= 39.0.0", "httpx >= 0.24.0", "psutil >= 5.9.0", @@ -159,3 +159,28 @@ convention = "google" [tool.basedpyright] typeCheckingMode = "standard" + +[tool.pytest.ini_options] +asyncio_mode = "auto" +timeout = 1200 + +[tool.mypy] +python_version = "3.9" +plugins = ["pydantic.mypy"] +files = ["scripts", "src", "tests"] +check_untyped_defs = true +disallow_incomplete_defs = true +disallow_untyped_calls = true +disallow_untyped_decorators = true +disallow_untyped_defs = true +no_implicit_optional = true +warn_redundant_casts = true +warn_return_any = true +warn_unreachable = true +warn_unused_ignores = true + +[tool.mypy-scrapy] +ignore_missing_imports = true + +[tool.mypy-sortedcollections] +ignore_missing_imports = true diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 5231254c..00000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -asyncio_mode = auto -timeout = 1200 From 57873234f24a2745bdbdf1b5517a21f7e7c6f524 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 5 Jun 2024 13:10:41 +0200 Subject: [PATCH 04/68] Remove obsolete tests --- src/apify/scrapy/utils.py | 10 +- tests/integration/conftest.py | 11 +- tests/unit/actor/test_actor_dataset.py | 2 +- .../unit/actor/test_actor_key_value_store.py | 6 +- .../actor/test_actor_memory_storage_e2e.py | 2 +- tests/unit/conftest.py | 20 +- tests/unit/memory_storage/__init__.py | 0 .../resource_clients/__init__.py | 0 .../resource_clients/test_dataset.py | 138 ------ .../test_dataset_collection.py | 42 -- .../resource_clients/test_key_value_store.py | 403 ------------------ .../test_key_value_store_collection.py | 42 -- .../resource_clients/test_request_queue.py | 260 ----------- .../test_request_queue_collection.py | 42 -- .../memory_storage/test_memory_storage.py | 154 ------- tests/unit/storages/test_dataset.py | 107 ----- tests/unit/storages/test_key_value_store.py | 86 ---- tests/unit/storages/test_request_queue.py | 112 ----- 18 files changed, 26 insertions(+), 1411 deletions(-) delete mode 100644 tests/unit/memory_storage/__init__.py delete mode 100644 tests/unit/memory_storage/resource_clients/__init__.py delete mode 100644 tests/unit/memory_storage/resource_clients/test_dataset.py delete mode 100644 tests/unit/memory_storage/resource_clients/test_dataset_collection.py delete mode 100644 tests/unit/memory_storage/resource_clients/test_key_value_store.py delete mode 100644 tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py delete mode 100644 tests/unit/memory_storage/resource_clients/test_request_queue.py delete mode 100644 tests/unit/memory_storage/resource_clients/test_request_queue_collection.py delete mode 100644 tests/unit/memory_storage/test_memory_storage.py delete mode 100644 tests/unit/storages/test_dataset.py delete mode 100644 tests/unit/storages/test_key_value_store.py delete mode 100644 tests/unit/storages/test_request_queue.py diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index 405e59a3..75545dde 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -10,11 +10,17 @@ from scrapy.utils.python import to_bytes except ImportError as exc: raise ImportError( - 'To use this 
module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', + 'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run "pip install apify[scrapy]".', ) from exc +from typing import TYPE_CHECKING + +from crawlee.storage_client_manager import StorageClientManager + from apify.actor import Actor -from apify.storages import RequestQueue, StorageClientManager + +if TYPE_CHECKING: + from apify.storages import RequestQueue nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 82102bc6..37c5a654 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -12,12 +12,12 @@ import pytest from apify_client import ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType +from crawlee.storage_client_manager import StorageClientManager from filelock import FileLock from ._utils import generate_unique_resource_name from apify import Actor from apify.config import Configuration -from apify.storages import Dataset, KeyValueStore, RequestQueue, StorageClientManager if TYPE_CHECKING: from apify_client.clients.resource_clients import ActorClientAsync @@ -33,13 +33,8 @@ def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(Actor, '_default_instance', None) monkeypatch.setattr(Configuration, '_default_instance', None) - monkeypatch.setattr(Dataset, '_cache_by_id', None) - monkeypatch.setattr(Dataset, '_cache_by_name', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_id', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_name', None) - monkeypatch.setattr(RequestQueue, '_cache_by_id', None) - monkeypatch.setattr(RequestQueue, '_cache_by_name', None) - monkeypatch.setattr(StorageClientManager, '_default_instance', None) + monkeypatch.setattr(StorageClientManager, '_cloud_client', None) + # TODO StorageClientManager local client purge # This fixture can't be session-scoped, diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index d9ba9c66..c03911df 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -8,7 +8,7 @@ from apify import Actor if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient + from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient # NOTE: We only test the dataset methods available on Actor class/instance. # Actual tests for the implementations are in storages/. diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 3de07378..e341318a 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -12,7 +12,7 @@ from apify.consts import ENCRYPTED_INPUT_VALUE_PREFIX if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient + from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient # NOTE: We only test the key-value store methods available on Actor class/instance. 
@@ -53,7 +53,7 @@ async def test_get_input(self: TestKeyValueStoreOnActor, memory_storage_client: input_key = 'INPUT' test_input = {'foo': 'bar'} - await memory_storage_client.key_value_stores().get_or_create(_id='default') + await memory_storage_client.key_value_stores().get_or_create(id='default') await memory_storage_client.key_value_store('default').set_record( key=input_key, value=json_dumps(test_input), @@ -80,7 +80,7 @@ async def test_get_input_with_secrets( 'secret': f'{ENCRYPTED_INPUT_VALUE_PREFIX}:{encrypted_secret["encrypted_password"]}:{encrypted_secret["encrypted_value"]}', } - await memory_storage_client.key_value_stores().get_or_create(_id='default') + await memory_storage_client.key_value_stores().get_or_create(id='default') await memory_storage_client.key_value_store('default').set_record( key=input_key, value=json_dumps(input_with_secret), diff --git a/tests/unit/actor/test_actor_memory_storage_e2e.py b/tests/unit/actor/test_actor_memory_storage_e2e.py index dd1d541e..31268bbc 100644 --- a/tests/unit/actor/test_actor_memory_storage_e2e.py +++ b/tests/unit/actor/test_actor_memory_storage_e2e.py @@ -5,9 +5,9 @@ import pytest from apify_shared.consts import ApifyEnvVars +from crawlee.storage_client_manager import StorageClientManager from apify import Actor -from apify.storages import StorageClientManager @pytest.mark.parametrize('purge_on_start', [True, False]) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 39e2cb17..976c331e 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -9,11 +9,12 @@ import pytest from apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars +from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient +from crawlee.storage_client_manager import StorageClientManager +from crawlee.configuration import Configuration as CrawleeConfiguration from apify import Actor -from apify._memory_storage import MemoryStorageClient from apify.config import Configuration -from apify.storages import Dataset, KeyValueStore, RequestQueue, StorageClientManager if TYPE_CHECKING: from pathlib import Path @@ -24,13 +25,8 @@ def reset_default_instances(monkeypatch: pytest.MonkeyPatch) -> Callable[[], Non def reset() -> None: monkeypatch.setattr(Actor, '_default_instance', None) monkeypatch.setattr(Configuration, '_default_instance', None) - monkeypatch.setattr(Dataset, '_cache_by_id', None) - monkeypatch.setattr(Dataset, '_cache_by_name', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_id', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_name', None) - monkeypatch.setattr(RequestQueue, '_cache_by_id', None) - monkeypatch.setattr(RequestQueue, '_cache_by_name', None) - monkeypatch.setattr(StorageClientManager, '_default_instance', None) + monkeypatch.setattr(StorageClientManager, '_cloud_client', None) + # TODO StorageClientManager local client purge return reset @@ -157,4 +153,8 @@ def apify_client_async_patcher(monkeypatch: pytest.MonkeyPatch) -> ApifyClientAs @pytest.fixture() def memory_storage_client() -> MemoryStorageClient: - return MemoryStorageClient(write_metadata=True, persist_storage=True) + configuration = CrawleeConfiguration() + configuration.persist_storage = True + configuration.write_metadata = True + + return MemoryStorageClient(configuration) diff --git a/tests/unit/memory_storage/__init__.py b/tests/unit/memory_storage/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/memory_storage/resource_clients/__init__.py 
b/tests/unit/memory_storage/resource_clients/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/memory_storage/resource_clients/test_dataset.py b/tests/unit/memory_storage/resource_clients/test_dataset.py deleted file mode 100644 index 6c5aaecf..00000000 --- a/tests/unit/memory_storage/resource_clients/test_dataset.py +++ /dev/null @@ -1,138 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import DatasetClient - - -@pytest.fixture() -async def dataset_client(memory_storage_client: MemoryStorageClient) -> DatasetClient: - datasets_client = memory_storage_client.datasets() - dataset_info = await datasets_client.get_or_create(name='test') - return memory_storage_client.dataset(dataset_info['id']) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - dataset_client = memory_storage_client.dataset(dataset_id='nonexistent-id') - assert await dataset_client.get() is None - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.update(name='test-update') - - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.list_items() - - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.push_items([{'abc': 123}]) - await dataset_client.delete() - - -async def test_not_implemented(dataset_client: DatasetClient) -> None: - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await dataset_client.stream_items() - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await dataset_client.get_items_as_bytes() - - -async def test_get(dataset_client: DatasetClient) -> None: - await asyncio.sleep(0.1) - info = await dataset_client.get() - assert info is not None - assert info['id'] == dataset_client._id - assert info['accessedAt'] != info['createdAt'] - - -async def test_update(dataset_client: DatasetClient) -> None: - new_dataset_name = 'test-update' - await dataset_client.push_items({'abc': 123}) - - old_dataset_info = await dataset_client.get() - assert old_dataset_info is not None - old_dataset_directory = os.path.join(dataset_client._memory_storage_client._datasets_directory, old_dataset_info['name']) - new_dataset_directory = os.path.join(dataset_client._memory_storage_client._datasets_directory, new_dataset_name) - assert os.path.exists(os.path.join(old_dataset_directory, '000000001.json')) is True - assert os.path.exists(os.path.join(new_dataset_directory, '000000001.json')) is False - - await asyncio.sleep(0.1) - updated_dataset_info = await dataset_client.update(name=new_dataset_name) - assert os.path.exists(os.path.join(old_dataset_directory, '000000001.json')) is False - assert os.path.exists(os.path.join(new_dataset_directory, '000000001.json')) is True - # Only modifiedAt and accessedAt should be different - assert old_dataset_info['createdAt'] == updated_dataset_info['createdAt'] - assert old_dataset_info['modifiedAt'] != updated_dataset_info['modifiedAt'] - assert old_dataset_info['accessedAt'] != updated_dataset_info['accessedAt'] - - # Should fail with the same name - with pytest.raises(ValueError, match='Dataset with name "test-update" already exists.'): - 
await dataset_client.update(name=new_dataset_name) - - -async def test_delete(dataset_client: DatasetClient) -> None: - await dataset_client.push_items({'abc': 123}) - dataset_info = await dataset_client.get() - assert dataset_info is not None - dataset_directory = os.path.join(dataset_client._memory_storage_client._datasets_directory, dataset_info['name']) - assert os.path.exists(os.path.join(dataset_directory, '000000001.json')) is True - await dataset_client.delete() - assert os.path.exists(os.path.join(dataset_directory, '000000001.json')) is False - # Does not crash when called again - await dataset_client.delete() - - -async def test_push_items(dataset_client: DatasetClient) -> None: - await dataset_client.push_items('{"test": "JSON from a string"}') - await dataset_client.push_items({'abc': {'def': {'ghi': '123'}}}) - await dataset_client.push_items(['{"test-json-parse": "JSON from a string"}' for _ in range(10)]) - await dataset_client.push_items([{'test-dict': i} for i in range(10)]) - - list_page = await dataset_client.list_items() - assert list_page.items[0]['test'] == 'JSON from a string' - assert list_page.items[1]['abc']['def']['ghi'] == '123' - assert list_page.items[11]['test-json-parse'] == 'JSON from a string' - assert list_page.items[21]['test-dict'] == 9 - assert list_page.count == 22 - - -async def test_list_items(dataset_client: DatasetClient) -> None: - item_count = 100 - used_offset = 10 - used_limit = 50 - await dataset_client.push_items([{'id': i} for i in range(item_count)]) - # Test without any parameters - list_default = await dataset_client.list_items() - assert list_default.count == item_count - assert list_default.offset == 0 - assert list_default.items[0]['id'] == 0 - assert list_default.desc is False - # Test offset - list_offset_10 = await dataset_client.list_items(offset=used_offset) - assert list_offset_10.count == item_count - used_offset - assert list_offset_10.offset == used_offset - assert list_offset_10.total == item_count - assert list_offset_10.items[0]['id'] == used_offset - # Test limit - list_limit_50 = await dataset_client.list_items(limit=used_limit) - assert list_limit_50.count == used_limit - assert list_limit_50.limit == used_limit - assert list_limit_50.total == item_count - # Test desc - list_desc_true = await dataset_client.list_items(desc=True) - assert list_desc_true.items[0]['id'] == 99 - assert list_desc_true.desc is True - - -async def test_iterate_items(dataset_client: DatasetClient) -> None: - item_count = 100 - await dataset_client.push_items([{'id': i} for i in range(item_count)]) - actual_items = [] - async for item in dataset_client.iterate_items(): - assert 'id' in item - actual_items.append(item) - assert len(actual_items) == item_count - assert actual_items[0]['id'] == 0 - assert actual_items[99]['id'] == 99 diff --git a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py deleted file mode 100644 index 89b79228..00000000 --- a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import DatasetCollectionClient - - -@pytest.fixture() -def datasets_client(memory_storage_client: MemoryStorageClient) -> DatasetCollectionClient: - return memory_storage_client.datasets() - - -async def 
test_get_or_create(datasets_client: DatasetCollectionClient) -> None: - dataset_name = 'test' - # A new dataset gets created - dataset_info = await datasets_client.get_or_create(name=dataset_name) - assert dataset_info['name'] == dataset_name - - # Another get_or_create call returns the same dataset - dataset_info_existing = await datasets_client.get_or_create(name=dataset_name) - assert dataset_info['id'] == dataset_info_existing['id'] - assert dataset_info['name'] == dataset_info_existing['name'] - assert dataset_info['createdAt'] == dataset_info_existing['createdAt'] - - -async def test_list(datasets_client: DatasetCollectionClient) -> None: - assert (await datasets_client.list()).count == 0 - dataset_info = await datasets_client.get_or_create(name='dataset') - dataset_list = await datasets_client.list() - assert dataset_list.count == 1 - assert dataset_list.items[0]['name'] == dataset_info['name'] - - # Test sorting behavior - newer_dataset_info = await datasets_client.get_or_create(name='newer-dataset') - dataset_list_sorting = await datasets_client.list() - assert dataset_list_sorting.count == 2 - assert dataset_list_sorting.items[0]['name'] == dataset_info['name'] - assert dataset_list_sorting.items[1]['name'] == newer_dataset_info['name'] diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store.py b/tests/unit/memory_storage/resource_clients/test_key_value_store.py deleted file mode 100644 index 848073a0..00000000 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store.py +++ /dev/null @@ -1,403 +0,0 @@ -from __future__ import annotations - -import asyncio -import base64 -import json -import os -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -import pytest -from apify_shared.utils import json_dumps - -from apify._crypto import crypto_random_object_id -from apify._utils import maybe_parse_body - -if TYPE_CHECKING: - from pathlib import Path - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import KeyValueStoreClient - -TINY_PNG = base64.b64decode('iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVQYV2NgYAAAAAMAAWgmWQ0AAAAASUVORK5CYII=') -TINY_BYTES = b'\x12\x34\x56\x78\x90\xab\xcd\xef' -TINY_DATA = {'a': 'b'} -TINY_TEXT = 'abcd' - - -@pytest.fixture() -async def key_value_store_client(memory_storage_client: MemoryStorageClient) -> KeyValueStoreClient: - key_value_stores_client = memory_storage_client.key_value_stores() - kvs_info = await key_value_stores_client.get_or_create(name='test') - return memory_storage_client.key_value_store(kvs_info['id']) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - kvs_client = memory_storage_client.key_value_store(key_value_store_id='nonexistent-id') - assert await kvs_client.get() is None - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.update(name='test-update') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.list_keys() - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.set_record('test', {'abc': 123}) - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.get_record('test') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await 
kvs_client.get_record_as_bytes('test') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.delete_record('test') - - await kvs_client.delete() - - -async def test_not_implemented(key_value_store_client: KeyValueStoreClient) -> None: - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await key_value_store_client.stream_record('test') - - -async def test_get(key_value_store_client: KeyValueStoreClient) -> None: - await asyncio.sleep(0.1) - info = await key_value_store_client.get() - assert info is not None - assert info['id'] == key_value_store_client._id - assert info['accessedAt'] != info['createdAt'] - - -async def test_update(key_value_store_client: KeyValueStoreClient) -> None: - new_kvs_name = 'test-update' - await key_value_store_client.set_record('test', {'abc': 123}) - old_kvs_info = await key_value_store_client.get() - assert old_kvs_info is not None - old_kvs_directory = os.path.join(key_value_store_client._memory_storage_client._key_value_stores_directory, old_kvs_info['name']) - new_kvs_directory = os.path.join(key_value_store_client._memory_storage_client._key_value_stores_directory, new_kvs_name) - assert os.path.exists(os.path.join(old_kvs_directory, 'test.json')) is True - assert os.path.exists(os.path.join(new_kvs_directory, 'test.json')) is False - - await asyncio.sleep(0.1) - updated_kvs_info = await key_value_store_client.update(name=new_kvs_name) - assert os.path.exists(os.path.join(old_kvs_directory, 'test.json')) is False - assert os.path.exists(os.path.join(new_kvs_directory, 'test.json')) is True - # Only modifiedAt and accessedAt should be different - assert old_kvs_info['createdAt'] == updated_kvs_info['createdAt'] - assert old_kvs_info['modifiedAt'] != updated_kvs_info['modifiedAt'] - assert old_kvs_info['accessedAt'] != updated_kvs_info['accessedAt'] - - # Should fail with the same name - with pytest.raises(ValueError, match='Key-value store with name "test-update" already exists.'): - await key_value_store_client.update(name=new_kvs_name) - - -async def test_delete(key_value_store_client: KeyValueStoreClient) -> None: - await key_value_store_client.set_record('test', {'abc': 123}) - kvs_info = await key_value_store_client.get() - assert kvs_info is not None - kvs_directory = os.path.join(key_value_store_client._memory_storage_client._key_value_stores_directory, kvs_info['name']) - assert os.path.exists(os.path.join(kvs_directory, 'test.json')) is True - await key_value_store_client.delete() - assert os.path.exists(os.path.join(kvs_directory, 'test.json')) is False - # Does not crash when called again - await key_value_store_client.delete() - - -async def test_list_keys_empty(key_value_store_client: KeyValueStoreClient) -> None: - keys = await key_value_store_client.list_keys() - assert len(keys['items']) == 0 - assert keys['count'] == 0 - assert keys['isTruncated'] is False - - -async def test_list_keys(key_value_store_client: KeyValueStoreClient) -> None: - record_count = 4 - used_limit = 2 - used_exclusive_start_key = 'a' - await key_value_store_client.set_record('b', 'test') - await key_value_store_client.set_record('a', 'test') - await key_value_store_client.set_record('d', 'test') - await key_value_store_client.set_record('c', 'test') - - # Default settings - keys = await key_value_store_client.list_keys() - assert keys['items'][0]['key'] == 'a' - assert keys['items'][3]['key'] == 'd' - assert keys['count'] == record_count - assert 
keys['isTruncated'] is False - # Test limit - keys_limit_2 = await key_value_store_client.list_keys(limit=used_limit) - assert keys_limit_2['count'] == record_count - assert keys_limit_2['limit'] == used_limit - assert keys_limit_2['items'][1]['key'] == 'b' - # Test exclusive start key - keys_exclusive_start = await key_value_store_client.list_keys(exclusive_start_key=used_exclusive_start_key, limit=2) - assert keys_exclusive_start['exclusiveStartKey'] == used_exclusive_start_key - assert keys_exclusive_start['isTruncated'] is True - assert keys_exclusive_start['nextExclusiveStartKey'] == 'c' - assert keys_exclusive_start['items'][0]['key'] == 'b' - assert keys_exclusive_start['items'][-1]['key'] == keys_exclusive_start['nextExclusiveStartKey'] - - -async def test_get_and_set_record(tmp_path: Path, key_value_store_client: KeyValueStoreClient) -> None: - # Test setting dict record - dict_record_key = 'test-dict' - await key_value_store_client.set_record(dict_record_key, {'test': 123}) - dict_record_info = await key_value_store_client.get_record(dict_record_key) - assert dict_record_info is not None - assert 'application/json' in dict_record_info['contentType'] - assert dict_record_info['value']['test'] == 123 - - # Test setting str record - str_record_key = 'test-str' - await key_value_store_client.set_record(str_record_key, 'test') - str_record_info = await key_value_store_client.get_record(str_record_key) - assert str_record_info is not None - assert 'text/plain' in str_record_info['contentType'] - assert str_record_info['value'] == 'test' - - # Test setting explicit json record but use str as value, i.e. json dumps is skipped - explicit_json_key = 'test-json' - await key_value_store_client.set_record(explicit_json_key, '{"test": "explicit string"}', 'application/json') - bytes_record_info = await key_value_store_client.get_record(explicit_json_key) - assert bytes_record_info is not None - assert 'application/json' in bytes_record_info['contentType'] - assert bytes_record_info['value']['test'] == 'explicit string' - - # Test using bytes - bytes_key = 'test-json' - bytes_value = b'testing bytes set_record' - await key_value_store_client.set_record(bytes_key, bytes_value, 'unknown') - bytes_record_info = await key_value_store_client.get_record(bytes_key) - assert bytes_record_info is not None - assert 'unknown' in bytes_record_info['contentType'] - assert bytes_record_info['value'] == bytes_value - assert bytes_record_info['value'].decode('utf-8') == bytes_value.decode('utf-8') - - # Test using file descriptor - with open(os.path.join(tmp_path, 'test.json'), 'w+', encoding='utf-8') as f: # noqa: ASYNC101 - f.write('Test') - with pytest.raises(NotImplementedError, match='File-like values are not supported in local memory storage'): - await key_value_store_client.set_record('file', f) - - -async def test_get_record_as_bytes(key_value_store_client: KeyValueStoreClient) -> None: - record_key = 'test' - record_value = 'testing' - await key_value_store_client.set_record(record_key, record_value) - record_info = await key_value_store_client.get_record_as_bytes(record_key) - assert record_info is not None - assert record_info['value'] == record_value.encode('utf-8') - - -async def test_delete_record(key_value_store_client: KeyValueStoreClient) -> None: - record_key = 'test' - await key_value_store_client.set_record(record_key, 'test') - await key_value_store_client.delete_record(record_key) - # Does not crash when called again - await key_value_store_client.delete_record(record_key) - - 
-@pytest.mark.parametrize( - 'test_case', - [ - { - 'input': {'key': 'image', 'value': TINY_PNG, 'contentType': None}, - 'expectedOutput': {'filename': 'image', 'key': 'image', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'key': 'image', 'value': TINY_PNG, 'contentType': 'image/png'}, - 'expectedOutput': {'filename': 'image.png', 'key': 'image', 'contentType': 'image/png'}, - }, - { - 'input': {'key': 'image.png', 'value': TINY_PNG, 'contentType': None}, - 'expectedOutput': {'filename': 'image.png', 'key': 'image.png', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'key': 'image.png', 'value': TINY_PNG, 'contentType': 'image/png'}, - 'expectedOutput': {'filename': 'image.png', 'key': 'image.png', 'contentType': 'image/png'}, - }, - { - 'input': {'key': 'data', 'value': TINY_DATA, 'contentType': None}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'data', 'value': TINY_DATA, 'contentType': 'application/json'}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'data.json', 'value': TINY_DATA, 'contentType': None}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data.json', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'data.json', 'value': TINY_DATA, 'contentType': 'application/json'}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data.json', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'text', 'value': TINY_TEXT, 'contentType': None}, - 'expectedOutput': {'filename': 'text.txt', 'key': 'text', 'contentType': 'text/plain'}, - }, - { - 'input': {'key': 'text', 'value': TINY_TEXT, 'contentType': 'text/plain'}, - 'expectedOutput': {'filename': 'text.txt', 'key': 'text', 'contentType': 'text/plain'}, - }, - { - 'input': {'key': 'text.txt', 'value': TINY_TEXT, 'contentType': None}, - 'expectedOutput': {'filename': 'text.txt', 'key': 'text.txt', 'contentType': 'text/plain'}, - }, - { - 'input': {'key': 'text.txt', 'value': TINY_TEXT, 'contentType': 'text/plain'}, - 'expectedOutput': {'filename': 'text.txt', 'key': 'text.txt', 'contentType': 'text/plain'}, - }, - ], -) -async def test_writes_correct_metadata(memory_storage_client: MemoryStorageClient, test_case: dict) -> None: - test_input = test_case['input'] - expected_output = test_case['expectedOutput'] - key_value_store_name = crypto_random_object_id() - - # Write the input data to the key-value store - store_details = await memory_storage_client.key_value_stores().get_or_create(name=key_value_store_name) - key_value_store_client = memory_storage_client.key_value_store(store_details['id']) - await key_value_store_client.set_record(test_input['key'], test_input['value'], content_type=test_input['contentType']) - - # Check that everything was written correctly, both the data and metadata - storage_path = os.path.join(memory_storage_client._key_value_stores_directory, key_value_store_name) - item_path = os.path.join(storage_path, expected_output['filename']) - metadata_path = os.path.join(storage_path, expected_output['filename'] + '.__metadata__.json') - - assert os.path.exists(item_path) - assert os.path.exists(metadata_path) - - with open(item_path, 'rb') as item_file: # noqa: ASYNC101 - actual_value = maybe_parse_body(item_file.read(), expected_output['contentType']) - assert actual_value == test_input['value'] - - with open(metadata_path, encoding='utf-8') as metadata_file: # noqa: ASYNC101 - metadata = 
json.load(metadata_file) - assert metadata['key'] == expected_output['key'] - assert expected_output['contentType'] in metadata['contentType'] - - -@pytest.mark.parametrize( - 'test_case', - [ - { - 'input': {'filename': 'image', 'value': TINY_PNG, 'metadata': None}, - 'expectedOutput': {'key': 'image', 'filename': 'image', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': None}, - 'expectedOutput': {'key': 'image', 'filename': 'image.png', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'image', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'image', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'image', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'image/png'}}, - 'expectedOutput': {'key': 'image', 'filename': 'image', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image.png', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'image.png', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image.png', 'contentType': 'image/png'}}, - 'expectedOutput': {'key': 'image.png', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'image/png'}}, - 'expectedOutput': {'key': 'image', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'input', 'value': TINY_BYTES, 'metadata': None}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'input.json', 'value': TINY_DATA, 'metadata': None}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/json'}, - }, - {'input': {'filename': 'input.txt', 'value': TINY_TEXT, 'metadata': None}, 'expectedOutput': {'key': 'input', 'contentType': 'text/plain'}}, - { - 'input': {'filename': 'input.bin', 'value': TINY_BYTES, 'metadata': None}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'input', 'value': TINY_BYTES, 'metadata': {'key': 'input', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'input.json', 'value': TINY_DATA, 'metadata': {'key': 'input', 'contentType': 'application/json'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/json'}, - }, - { - 'input': {'filename': 'input.txt', 'value': TINY_TEXT, 'metadata': {'key': 'input', 'contentType': 'text/plain'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'text/plain'}, - }, - { - 'input': {'filename': 'input.bin', 'value': TINY_BYTES, 'metadata': {'key': 'input', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - ], -) -async def test_reads_correct_metadata(memory_storage_client: MemoryStorageClient, test_case: dict) -> None: - test_input = test_case['input'] - expected_output = test_case['expectedOutput'] - key_value_store_name = crypto_random_object_id() - - # Ensure the directory for the store exists - storage_path = os.path.join(memory_storage_client._key_value_stores_directory, key_value_store_name) - os.makedirs(storage_path, exist_ok=True) - - store_metadata = { - 'id': 
crypto_random_object_id(), - 'name': None, - 'accessedAt': datetime.now(timezone.utc), - 'createdAt': datetime.now(timezone.utc), - 'modifiedAt': datetime.now(timezone.utc), - 'userId': '1', - } - - # Write the store metadata to disk - store_metadata_path = os.path.join(storage_path, '__metadata__.json') - with open(store_metadata_path, mode='wb') as store_metadata_file: # noqa: ASYNC101 - store_metadata_file.write(json_dumps(store_metadata).encode('utf-8')) - - # Write the test input item to the disk - item_path = os.path.join(storage_path, test_input['filename']) - with open(item_path, 'wb') as item_file: # noqa: ASYNC101 - if isinstance(test_input['value'], bytes): - item_file.write(test_input['value']) - elif isinstance(test_input['value'], str): - item_file.write(test_input['value'].encode('utf-8')) - else: - item_file.write(json_dumps(test_input['value']).encode('utf-8')) - - # Optionally write the metadata to disk if there is some - if test_input['metadata'] is not None: - metadata_path = os.path.join(storage_path, test_input['filename'] + '.__metadata__.json') - with open(metadata_path, 'w', encoding='utf-8') as metadata_file: # noqa: ASYNC101 - metadata_file.write( - json_dumps( - { - 'key': test_input['metadata']['key'], - 'contentType': test_input['metadata']['contentType'], - } - ) - ) - - # Create the key-value store client to load the items from disk - store_details = await memory_storage_client.key_value_stores().get_or_create(name=key_value_store_name) - key_value_store_client = memory_storage_client.key_value_store(store_details['id']) - - # Read the item from the store and check if it is as expected - actual_record = await key_value_store_client.get_record(expected_output['key']) - assert actual_record is not None - - assert actual_record['key'] == expected_output['key'] - assert actual_record['contentType'] == expected_output['contentType'] - assert actual_record['value'] == test_input['value'] diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py deleted file mode 100644 index f645df01..00000000 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import KeyValueStoreCollectionClient - - -@pytest.fixture() -def key_value_stores_client(memory_storage_client: MemoryStorageClient) -> KeyValueStoreCollectionClient: - return memory_storage_client.key_value_stores() - - -async def test_get_or_create(key_value_stores_client: KeyValueStoreCollectionClient) -> None: - kvs_name = 'test' - # A new kvs gets created - kvs_info = await key_value_stores_client.get_or_create(name=kvs_name) - assert kvs_info['name'] == kvs_name - - # Another get_or_create call returns the same kvs - kvs_info_existing = await key_value_stores_client.get_or_create(name=kvs_name) - assert kvs_info['id'] == kvs_info_existing['id'] - assert kvs_info['name'] == kvs_info_existing['name'] - assert kvs_info['createdAt'] == kvs_info_existing['createdAt'] - - -async def test_list(key_value_stores_client: KeyValueStoreCollectionClient) -> None: - assert (await key_value_stores_client.list()).count == 0 - kvs_info = await key_value_stores_client.get_or_create(name='kvs') - kvs_list = await key_value_stores_client.list() - assert 
kvs_list.count == 1 - assert kvs_list.items[0]['name'] == kvs_info['name'] - - # Test sorting behavior - newer_kvs_info = await key_value_stores_client.get_or_create(name='newer-kvs') - kvs_list_sorting = await key_value_stores_client.list() - assert kvs_list_sorting.count == 2 - assert kvs_list_sorting.items[0]['name'] == kvs_info['name'] - assert kvs_list_sorting.items[1]['name'] == newer_kvs_info['name'] diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue.py b/tests/unit/memory_storage/resource_clients/test_request_queue.py deleted file mode 100644 index c66bc68f..00000000 --- a/tests/unit/memory_storage/resource_clients/test_request_queue.py +++ /dev/null @@ -1,260 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import RequestQueueClient - - -@pytest.fixture() -async def request_queue_client(memory_storage_client: MemoryStorageClient) -> RequestQueueClient: - request_queues_client = memory_storage_client.request_queues() - rq_info = await request_queues_client.get_or_create(name='test') - return memory_storage_client.request_queue(rq_info['id']) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - request_queue_client = memory_storage_client.request_queue(request_queue_id='nonexistent-id') - assert await request_queue_client.get() is None - with pytest.raises(ValueError, match='Request queue with id "nonexistent-id" does not exist.'): - await request_queue_client.update(name='test-update') - await request_queue_client.delete() - - -async def test_get(request_queue_client: RequestQueueClient) -> None: - await asyncio.sleep(0.1) - info = await request_queue_client.get() - assert info is not None - assert info['id'] == request_queue_client._id - assert info['accessedAt'] != info['createdAt'] - - -async def test_update(request_queue_client: RequestQueueClient) -> None: - new_rq_name = 'test-update' - await request_queue_client.add_request( - { - 'uniqueKey': 'https://apify.com', - 'url': 'https://apify.com', - } - ) - old_rq_info = await request_queue_client.get() - assert old_rq_info is not None - old_rq_directory = os.path.join(request_queue_client._memory_storage_client._request_queues_directory, old_rq_info['name']) - new_rq_directory = os.path.join(request_queue_client._memory_storage_client._request_queues_directory, new_rq_name) - assert os.path.exists(os.path.join(old_rq_directory, 'fvwscO2UJLdr10B.json')) is True - assert os.path.exists(os.path.join(new_rq_directory, 'fvwscO2UJLdr10B.json')) is False - - await asyncio.sleep(0.1) - updated_rq_info = await request_queue_client.update(name=new_rq_name) - assert os.path.exists(os.path.join(old_rq_directory, 'fvwscO2UJLdr10B.json')) is False - assert os.path.exists(os.path.join(new_rq_directory, 'fvwscO2UJLdr10B.json')) is True - # Only modifiedAt and accessedAt should be different - assert old_rq_info['createdAt'] == updated_rq_info['createdAt'] - assert old_rq_info['modifiedAt'] != updated_rq_info['modifiedAt'] - assert old_rq_info['accessedAt'] != updated_rq_info['accessedAt'] - - # Should fail with the same name - with pytest.raises(ValueError, match='Request queue with name "test-update" already exists'): - await request_queue_client.update(name=new_rq_name) - - -async def test_delete(request_queue_client: RequestQueueClient) -> None: - 
await request_queue_client.add_request( - { - 'uniqueKey': 'https://apify.com', - 'url': 'https://apify.com', - } - ) - rq_info = await request_queue_client.get() - assert rq_info is not None - - rq_directory = os.path.join(request_queue_client._memory_storage_client._request_queues_directory, rq_info['name']) - assert os.path.exists(os.path.join(rq_directory, 'fvwscO2UJLdr10B.json')) is True - - await request_queue_client.delete() - assert os.path.exists(os.path.join(rq_directory, 'fvwscO2UJLdr10B.json')) is False - - # Does not crash when called again - await request_queue_client.delete() - - -async def test_list_head(request_queue_client: RequestQueueClient) -> None: - request_1_url = 'https://apify.com' - request_2_url = 'https://example.com' - await request_queue_client.add_request( - { - 'uniqueKey': request_1_url, - 'url': request_1_url, - } - ) - await request_queue_client.add_request( - { - 'uniqueKey': request_2_url, - 'url': request_2_url, - } - ) - list_head = await request_queue_client.list_head() - assert len(list_head['items']) == 2 - for item in list_head['items']: - assert 'id' in item - - -async def test_add_record(request_queue_client: RequestQueueClient) -> None: - request_forefront_url = 'https://apify.com' - request_not_forefront_url = 'https://example.com' - request_forefront_info = await request_queue_client.add_request( - { - 'uniqueKey': request_forefront_url, - 'url': request_forefront_url, - }, - forefront=True, - ) - request_not_forefront_info = await request_queue_client.add_request( - { - 'uniqueKey': request_not_forefront_url, - 'url': request_not_forefront_url, - }, - forefront=False, - ) - - assert request_forefront_info.get('requestId') is not None - assert request_not_forefront_info.get('requestId') is not None - assert request_forefront_info['wasAlreadyHandled'] is False - assert request_not_forefront_info['wasAlreadyHandled'] is False - - rq_info = await request_queue_client.get() - assert rq_info is not None - assert rq_info['pendingRequestCount'] == rq_info['totalRequestCount'] == 2 - assert rq_info['handledRequestCount'] == 0 - - -async def test_get_record(request_queue_client: RequestQueueClient) -> None: - request_url = 'https://apify.com' - request_info = await request_queue_client.add_request( - { - 'uniqueKey': request_url, - 'url': request_url, - } - ) - request = await request_queue_client.get_request(request_info['requestId']) - assert request is not None - assert 'id' in request - assert request['url'] == request['uniqueKey'] == request_url - - # Non-existent id - assert (await request_queue_client.get_request('non-existent id')) is None - - -async def test_update_record(request_queue_client: RequestQueueClient) -> None: - request_url = 'https://apify.com' - request_info = await request_queue_client.add_request( - { - 'uniqueKey': request_url, - 'url': request_url, - } - ) - request = await request_queue_client.get_request(request_info['requestId']) - assert request is not None - - rq_info_before_update = await request_queue_client.get() - assert rq_info_before_update is not None - assert rq_info_before_update['pendingRequestCount'] == 1 - assert rq_info_before_update['handledRequestCount'] == 0 - - request_update_info = await request_queue_client.update_request({**request, 'handledAt': datetime.now(timezone.utc)}) - assert request_update_info['wasAlreadyHandled'] is False - - rq_info_after_update = await request_queue_client.get() - assert rq_info_after_update is not None - assert rq_info_after_update['pendingRequestCount'] == 0 - assert 
rq_info_after_update['handledRequestCount'] == 1 - - -async def test_delete_record(request_queue_client: RequestQueueClient) -> None: - request_url = 'https://apify.com' - pending_request_info = await request_queue_client.add_request( - { - 'uniqueKey': 'pending', - 'url': request_url, - } - ) - handled_request_info = await request_queue_client.add_request( - { - 'uniqueKey': 'handled', - 'url': request_url, - 'handledAt': datetime.now(tz=timezone.utc), - } - ) - - rq_info_before_delete = await request_queue_client.get() - assert rq_info_before_delete is not None - assert rq_info_before_delete['pendingRequestCount'] == 1 - assert rq_info_before_delete['pendingRequestCount'] == 1 - - await request_queue_client.delete_request(pending_request_info['requestId']) - rq_info_after_first_delete = await request_queue_client.get() - assert rq_info_after_first_delete is not None - assert rq_info_after_first_delete['pendingRequestCount'] == 0 - assert rq_info_after_first_delete['handledRequestCount'] == 1 - - await request_queue_client.delete_request(handled_request_info['requestId']) - rq_info_after_second_delete = await request_queue_client.get() - assert rq_info_after_second_delete is not None - assert rq_info_after_second_delete['pendingRequestCount'] == 0 - assert rq_info_after_second_delete['handledRequestCount'] == 0 - - # Does not crash when called again - await request_queue_client.delete_request(pending_request_info['requestId']) - - -async def test_forefront(request_queue_client: RequestQueueClient) -> None: - # this should create a queue with requests in this order: - # Handled: - # 2, 5, 8 - # Not handled: - # 7, 4, 1, 0, 3, 6 - for i in range(9): - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await request_queue_client.add_request( - { - 'uniqueKey': str(i), - 'url': request_url, - 'handledAt': datetime.now(timezone.utc) if was_handled else None, - }, - forefront=forefront, - ) - - # Check that the queue head (unhandled items) is in the right order - queue_head = await request_queue_client.list_head() - req_unique_keys = [req['uniqueKey'] for req in queue_head['items']] - assert req_unique_keys == ['7', '4', '1', '0', '3', '6'] - - # Mark request #1 as handled - await request_queue_client.update_request( - { - 'uniqueKey': '1', - 'url': 'http://example.com/1', - 'handledAt': datetime.now(timezone.utc), - } - ) - # Move request #3 to forefront - await request_queue_client.update_request( - { - 'uniqueKey': '3', - 'url': 'http://example.com/3', - }, - forefront=True, - ) - - # Check that the queue head (unhandled items) is in the right order after the updates - queue_head = await request_queue_client.list_head() - req_unique_keys = [req['uniqueKey'] for req in queue_head['items']] - assert req_unique_keys == ['3', '7', '4', '0', '6'] diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py deleted file mode 100644 index 3c33a2ac..00000000 --- a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import RequestQueueCollectionClient - - -@pytest.fixture() -def request_queues_client(memory_storage_client: MemoryStorageClient) -> RequestQueueCollectionClient: - return 
memory_storage_client.request_queues() - - -async def test_get_or_create(request_queues_client: RequestQueueCollectionClient) -> None: - rq_name = 'test' - # A new request queue gets created - rq_info = await request_queues_client.get_or_create(name=rq_name) - assert rq_info['name'] == rq_name - - # Another get_or_create call returns the same request queue - rq_existing = await request_queues_client.get_or_create(name=rq_name) - assert rq_info['id'] == rq_existing['id'] - assert rq_info['name'] == rq_existing['name'] - assert rq_info['createdAt'] == rq_existing['createdAt'] - - -async def test_list(request_queues_client: RequestQueueCollectionClient) -> None: - assert (await request_queues_client.list()).count == 0 - rq_info = await request_queues_client.get_or_create(name='dataset') - rq_list = await request_queues_client.list() - assert rq_list.count == 1 - assert rq_list.items[0]['name'] == rq_info['name'] - - # Test sorting behavior - newer_rq_info = await request_queues_client.get_or_create(name='newer-dataset') - rq_list_sorting = await request_queues_client.list() - assert rq_list_sorting.count == 2 - assert rq_list_sorting.items[0]['name'] == rq_info['name'] - assert rq_list_sorting.items[1]['name'] == newer_rq_info['name'] diff --git a/tests/unit/memory_storage/test_memory_storage.py b/tests/unit/memory_storage/test_memory_storage.py deleted file mode 100644 index 3d32398e..00000000 --- a/tests/unit/memory_storage/test_memory_storage.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -import os -from typing import TYPE_CHECKING - -import pytest -from apify_shared.consts import ApifyEnvVars - -from apify._memory_storage import MemoryStorageClient - -if TYPE_CHECKING: - from pathlib import Path - - -async def test_write_metadata(tmp_path: Path) -> None: - dataset_name = 'test' - dataset_no_metadata_name = 'test-no-metadata' - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - ms_no_metadata = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=False) - datasets_client = ms.datasets() - datasets_no_metadata_client = ms_no_metadata.datasets() - await datasets_client.get_or_create(name=dataset_name) - await datasets_no_metadata_client.get_or_create(name=dataset_no_metadata_name) - assert os.path.exists(os.path.join(ms._datasets_directory, dataset_name, '__metadata__.json')) is True - assert os.path.exists(os.path.join(ms_no_metadata._datasets_directory, dataset_no_metadata_name, '__metadata__.json')) is False - - -async def test_persist_storage(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), persist_storage=True) - ms_no_persist = MemoryStorageClient(local_data_directory=str(tmp_path), persist_storage=False) - kvs_client = ms.key_value_stores() - kvs_no_metadata_client = ms_no_persist.key_value_stores() - kvs_info = await kvs_client.get_or_create(name='kvs') - kvs_no_metadata_info = await kvs_no_metadata_client.get_or_create(name='kvs-no-persist') - await ms.key_value_store(kvs_info['id']).set_record('test', {'x': 1}, 'application/json') - await ms_no_persist.key_value_store(kvs_no_metadata_info['id']).set_record('test', {'x': 1}, 'application/json') - assert os.path.exists(os.path.join(ms._key_value_stores_directory, kvs_info['name'], 'test.json')) is True - assert os.path.exists(os.path.join(ms_no_persist._key_value_stores_directory, kvs_no_metadata_info['name'], 'test.json')) is False - - -def test_config_via_env_vars_persist_storage(monkeypatch: pytest.MonkeyPatch, 
tmp_path: Path) -> None: - # Env var changes persist_storage to False - monkeypatch.setenv('APIFY_PERSIST_STORAGE', 'false') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._persist_storage is False - monkeypatch.setenv('APIFY_PERSIST_STORAGE', '0') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._persist_storage is False - monkeypatch.setenv('APIFY_PERSIST_STORAGE', '') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._persist_storage is False - # Test if constructor arg takes precedence over env var value - ms = MemoryStorageClient(local_data_directory=str(tmp_path), persist_storage=True) - assert ms._persist_storage is True - - -def test_config_via_env_vars_write_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - # Env var changes write_metadata to True - monkeypatch.setenv('DEBUG', '*') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._write_metadata is True - # Test if constructor arg takes precedence over env var value - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=False) - assert ms._write_metadata is False - - -async def test_purge_datasets(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - # Create default and non-default datasets - datasets_client = ms.datasets() - default_dataset_info = await datasets_client.get_or_create(name='default') - non_default_dataset_info = await datasets_client.get_or_create(name='non-default') - - # Check all folders inside datasets directory before and after purge - folders_before_purge = os.listdir(ms._datasets_directory) - assert default_dataset_info['name'] in folders_before_purge - assert non_default_dataset_info['name'] in folders_before_purge - - await ms._purge() - folders_after_purge = os.listdir(ms._datasets_directory) - assert default_dataset_info['name'] not in folders_after_purge - assert non_default_dataset_info['name'] in folders_after_purge - - -async def test_purge_key_value_stores(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - - # Create default and non-default key-value stores - kvs_client = ms.key_value_stores() - default_kvs_info = await kvs_client.get_or_create(name='default') - non_default_kvs_info = await kvs_client.get_or_create(name='non-default') - default_kvs_client = ms.key_value_store(default_kvs_info['id']) - # INPUT.json should be kept - await default_kvs_client.set_record('INPUT', {'abc': 123}, 'application/json') - # test.json should not be kept - await default_kvs_client.set_record('test', {'abc': 123}, 'application/json') - - # Check all folders and files inside kvs directory before and after purge - folders_before_purge = os.listdir(ms._key_value_stores_directory) - assert default_kvs_info['name'] in folders_before_purge - assert non_default_kvs_info['name'] in folders_before_purge - default_folder_files_before_purge = os.listdir(os.path.join(ms._key_value_stores_directory, 'default')) - assert 'INPUT.json' in default_folder_files_before_purge - assert 'test.json' in default_folder_files_before_purge - - await ms._purge() - folders_after_purge = os.listdir(ms._key_value_stores_directory) - assert default_kvs_info['name'] in folders_after_purge - assert non_default_kvs_info['name'] in folders_after_purge - default_folder_files_after_purge = os.listdir(os.path.join(ms._key_value_stores_directory, 'default')) - assert 'INPUT.json' in 
default_folder_files_after_purge - assert 'test.json' not in default_folder_files_after_purge - - -async def test_purge_request_queues(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - # Create default and non-default request queues - rq_client = ms.request_queues() - default_rq_info = await rq_client.get_or_create(name='default') - non_default_rq_info = await rq_client.get_or_create(name='non-default') - - # Check all folders inside rq directory before and after purge - folders_before_purge = os.listdir(ms._request_queues_directory) - assert default_rq_info['name'] in folders_before_purge - assert non_default_rq_info['name'] in folders_before_purge - await ms._purge() - folders_after_purge = os.listdir(ms._request_queues_directory) - assert default_rq_info['name'] not in folders_after_purge - assert non_default_rq_info['name'] in folders_after_purge - - -async def test_not_implemented_method(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - ddt = ms.dataset('test') - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await ddt.stream_items(item_format='json') - - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await ddt.stream_items(item_format='json') - - -async def test_storage_path_configuration(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv(ApifyEnvVars.LOCAL_STORAGE_DIR) - default_ms = MemoryStorageClient() - assert default_ms._local_data_directory == './storage' - - # We expect the env var to override the default value - monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, './env_var_storage_dir') - env_var_ms = MemoryStorageClient() - assert env_var_ms._local_data_directory == './env_var_storage_dir' - - # We expect the parametrized value to override the env var - parametrized_ms = MemoryStorageClient(local_data_directory='./parametrized_storage_dir') - assert parametrized_ms._local_data_directory == './parametrized_storage_dir' diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py deleted file mode 100644 index ca3b1ca3..00000000 --- a/tests/unit/storages/test_dataset.py +++ /dev/null @@ -1,107 +0,0 @@ -from __future__ import annotations - -import pytest - -from apify.storages import Dataset, KeyValueStore - - -@pytest.fixture() -async def dataset() -> Dataset: - return await Dataset.open() - - -async def test_open() -> None: - default_dataset = await Dataset.open() - default_dataset_by_id = await Dataset.open(id=default_dataset._id) - - assert default_dataset is default_dataset_by_id - - dataset_name = 'dummy-name' - named_dataset = await Dataset.open(name=dataset_name) - assert default_dataset is not named_dataset - - with pytest.raises(RuntimeError, match='Dataset with id "nonexistent-id" does not exist!'): - await Dataset.open(id='nonexistent-id') - - # Test that when you try to open a dataset by ID and you use a name of an existing dataset, - # it doesn't work - with pytest.raises(RuntimeError, match='Dataset with id "dummy-name" does not exist!'): - await Dataset.open(id='dummy-name') - - -async def test_same_references() -> None: - dataset1 = await Dataset.open() - dataset2 = await Dataset.open() - assert dataset1 is dataset2 - - dataset_name = 'non-default' - dataset_named1 = await Dataset.open(name=dataset_name) - dataset_named2 = await Dataset.open(name=dataset_name) - assert dataset_named1 is 
dataset_named2 - - -async def test_drop() -> None: - dataset1 = await Dataset.open() - await dataset1.drop() - dataset2 = await Dataset.open() - assert dataset1 is not dataset2 - - -async def test_export(dataset: Dataset) -> None: - expected_csv = 'id,test\r\n0,test\r\n1,test\r\n2,test\r\n' - expected_json = [{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}, {'id': 2, 'test': 'test'}] - desired_item_count = 3 - await dataset.push_data([{'id': i, 'test': 'test'} for i in range(desired_item_count)]) - await dataset.export_to_csv('dataset-csv') - await dataset.export_to_json('dataset-json') - dataset_csv = await KeyValueStore.get_value('dataset-csv') - dataset_json = await KeyValueStore.get_value('dataset-json') - assert dataset_csv == expected_csv - assert dataset_json == expected_json - - -async def test_push_data(dataset: Dataset) -> None: - desired_item_count = 2000 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info['itemCount'] == desired_item_count - list_page = await dataset.get_data(limit=desired_item_count) - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 - - -async def test_push_data_empty(dataset: Dataset) -> None: - await dataset.push_data([]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info['itemCount'] == 0 - - -async def test_push_data_singular(dataset: Dataset) -> None: - await dataset.push_data({'id': 1}) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info['itemCount'] == 1 - list_page = await dataset.get_data() - assert list_page.items[0]['id'] == 1 - - -async def test_get_data(dataset: Dataset) -> None: # We don't test everything, that's done in memory storage tests - desired_item_count = 3 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - list_page = await dataset.get_data() - assert list_page.count == desired_item_count - assert list_page.desc is False - assert list_page.offset == 0 - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 - - -async def test_iterate_items(dataset: Dataset) -> None: - desired_item_count = 3 - idx = 0 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - async for item in dataset.iterate_items(): - assert item['id'] == idx - idx += 1 diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py deleted file mode 100644 index 042fd873..00000000 --- a/tests/unit/storages/test_key_value_store.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -import pytest - -from apify.storages import KeyValueStore - - -@pytest.fixture() -async def key_value_store() -> KeyValueStore: - return await KeyValueStore.open() - - -async def test_open() -> None: - default_key_value_store = await KeyValueStore.open() - default_key_value_store_by_id = await KeyValueStore.open(id=default_key_value_store._id) - - assert default_key_value_store is default_key_value_store_by_id - - key_value_store_name = 'dummy-name' - named_key_value_store = await KeyValueStore.open(name=key_value_store_name) - assert default_key_value_store is not named_key_value_store - - with pytest.raises(RuntimeError, match='Key-value store with id "nonexistent-id" does not exist!'): - await KeyValueStore.open(id='nonexistent-id') - - # Test that when you try to open a key-value 
store by ID and you use a name of an existing key-value store, - # it doesn't work - with pytest.raises(RuntimeError, match='Key-value store with id "dummy-name" does not exist!'): - await KeyValueStore.open(id='dummy-name') - - -async def test_same_references() -> None: - kvs1 = await KeyValueStore.open() - kvs2 = await KeyValueStore.open() - assert kvs1 is kvs2 - - kvs_name = 'non-default' - kvs_named1 = await KeyValueStore.open(name=kvs_name) - kvs_named2 = await KeyValueStore.open(name=kvs_name) - assert kvs_named1 is kvs_named2 - - -async def test_drop() -> None: - kvs1 = await KeyValueStore.open() - await kvs1.drop() - kvs2 = await KeyValueStore.open() - assert kvs1 is not kvs2 - - -async def test_get_set_value(key_value_store: KeyValueStore) -> None: - await key_value_store.set_value('test-str', 'string') - await key_value_store.set_value('test-int', 123) - await key_value_store.set_value('test-dict', {'abc': '123'}) - str_value = await key_value_store.get_value('test-str') - int_value = await key_value_store.get_value('test-int') - dict_value = await key_value_store.get_value('test-dict') - non_existent_value = await key_value_store.get_value('test-non-existent') - assert str_value == 'string' - assert int_value == 123 - assert dict_value['abc'] == '123' - assert non_existent_value is None - - -async def test_for_each_key(key_value_store: KeyValueStore) -> None: - keys = [i async for i in key_value_store.iterate_keys()] - assert len(keys) == 0 - - for i in range(2001): - await key_value_store.set_value(str(i).zfill(4), i) - index = 0 - async for key, _ in key_value_store.iterate_keys(): - assert key == str(index).zfill(4) - index += 1 - assert index == 2001 - - -async def test_get_public_url() -> None: - store = await KeyValueStore.open() - with pytest.raises(RuntimeError, match='Cannot generate a public URL for this key-value store as it is not on the Apify Platform!'): - await store.get_public_url('dummy') - - -async def test_static_get_set_value() -> None: - await KeyValueStore.set_value('test-static', 'static') - value = await KeyValueStore.get_value('test-static') - assert value == 'static' diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py deleted file mode 100644 index 2922e5b8..00000000 --- a/tests/unit/storages/test_request_queue.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -import asyncio -from datetime import datetime, timezone - -import pytest - -from apify.storages import RequestQueue - - -@pytest.fixture() -async def request_queue() -> RequestQueue: - return await RequestQueue.open() - - -async def test_open() -> None: - default_request_queue = await RequestQueue.open() - default_request_queue_by_id = await RequestQueue.open(id=default_request_queue._id) - - assert default_request_queue is default_request_queue_by_id - - request_queue_name = 'dummy-name' - named_request_queue = await RequestQueue.open(name=request_queue_name) - assert default_request_queue is not named_request_queue - - with pytest.raises(RuntimeError, match='Request queue with id "nonexistent-id" does not exist!'): - await RequestQueue.open(id='nonexistent-id') - - # Test that when you try to open a request queue by ID and you use a name of an existing request queue, - # it doesn't work - with pytest.raises(RuntimeError, match='Request queue with id "dummy-name" does not exist!'): - await RequestQueue.open(id='dummy-name') - - -async def test_same_references() -> None: - rq1 = await RequestQueue.open() - rq2 = await 
RequestQueue.open() - assert rq1 is rq2 - - rq_name = 'non-default' - rq_named1 = await RequestQueue.open(name=rq_name) - rq_named2 = await RequestQueue.open(name=rq_name) - assert rq_named1 is rq_named2 - - -async def test_drop() -> None: - rq1 = await RequestQueue.open() - await rq1.drop() - rq2 = await RequestQueue.open() - assert rq1 is not rq2 - - -async def test_get_request(request_queue: RequestQueue) -> None: - url = 'https://example.com' - add_request_info = await request_queue.add_request( - { - 'uniqueKey': url, - 'url': url, - } - ) - request = await request_queue.get_request(add_request_info['requestId']) - assert request is not None - assert request['url'] == url - - -async def test_add_fetch_handle_request(request_queue: RequestQueue) -> None: - url = 'https://example.com' - assert await request_queue.is_empty() is True - with pytest.raises(ValueError, match='"url" is required'): - await request_queue.add_request({}) - add_request_info = await request_queue.add_request( - { - 'uniqueKey': url, - 'url': url, - } - ) - assert add_request_info['wasAlreadyPresent'] is False - assert add_request_info['wasAlreadyHandled'] is False - assert await request_queue.is_empty() is False - - # Fetch the request - next_request = await request_queue.fetch_next_request() - assert next_request is not None - - # Mark it as handled - next_request['handledAt'] = datetime.now(timezone.utc) - queue_operation_info = await request_queue.mark_request_as_handled(next_request) - assert queue_operation_info is not None - assert queue_operation_info['uniqueKey'] == url - assert await request_queue.is_finished() is True - - -async def test_reclaim_request(request_queue: RequestQueue) -> None: - url = 'https://example.com' - await request_queue.add_request( - { - 'uniqueKey': url, - 'url': url, - } - ) - # Fetch the request - next_request = await request_queue.fetch_next_request() - assert next_request is not None - assert next_request['uniqueKey'] == url - - # Reclaim - await request_queue.reclaim_request(next_request) - # Try to fetch again after a few secs - await asyncio.sleep(4) # 3 seconds is the consistency delay in request queue - next_again = await request_queue.fetch_next_request() - assert next_again is not None - assert next_again['uniqueKey'] == url From 38debc4559cae156910114384a8f5f6ce07f524f Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 6 Jun 2024 21:59:54 +0200 Subject: [PATCH 05/68] Use id argument --- src/apify/apify_storage_client/dataset_collection_client.py | 4 ++-- .../apify_storage_client/key_value_store_collection_client.py | 4 ++-- .../apify_storage_client/request_queue_collection_client.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/apify/apify_storage_client/dataset_collection_client.py b/src/apify/apify_storage_client/dataset_collection_client.py index 148d2716..7bf3d200 100644 --- a/src/apify/apify_storage_client/dataset_collection_client.py +++ b/src/apify/apify_storage_client/dataset_collection_client.py @@ -20,13 +20,13 @@ def __init__(self, apify_dataset_collection_client: DatasetCollectionClientAsync async def get_or_create( self, *, - id: str | None = None, # TODO unused + id: str | None = None, # noqa: A002 name: str | None = None, schema: dict | None = None, ) -> DatasetMetadata: return DatasetMetadata.model_validate( await self._client.get_or_create( - name=name, + name=id if id is not None else name, schema=schema, ) ) diff --git a/src/apify/apify_storage_client/key_value_store_collection_client.py 
b/src/apify/apify_storage_client/key_value_store_collection_client.py index 9b825992..7b07019e 100644 --- a/src/apify/apify_storage_client/key_value_store_collection_client.py +++ b/src/apify/apify_storage_client/key_value_store_collection_client.py @@ -20,13 +20,13 @@ def __init__(self, apify_dataset_collection_client: KeyValueStoreCollectionClien async def get_or_create( self, *, - id: str | None = None, # TODO unused + id: str | None = None, # noqa: A002 name: str | None = None, schema: dict | None = None, ) -> KeyValueStoreMetadata: return KeyValueStoreMetadata.model_validate( await self._client.get_or_create( - name=name, + name=id if id is not None else name, schema=schema, ) ) diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/request_queue_collection_client.py index a33f6aac..dad2f285 100644 --- a/src/apify/apify_storage_client/request_queue_collection_client.py +++ b/src/apify/apify_storage_client/request_queue_collection_client.py @@ -20,13 +20,13 @@ def __init__(self, apify_request_queue_collection_client: RequestQueueCollection async def get_or_create( self, *, - id: str | None = None, # TODO unused + id: str | None = None, # noqa: A002 name: str | None = None, schema: dict | None = None, # TODO unused ) -> RequestQueueMetadata: return RequestQueueMetadata.model_validate( await self._client.get_or_create( - name=name, + name=id if id is not None else name, ) ) From 051373cae1b56e43aebaa05a6534bcea3062181e Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 7 Jun 2024 08:51:55 +0200 Subject: [PATCH 06/68] Remove obsolete utils --- src/apify/_utils.py | 237 +------------------------------------------- 1 file changed, 2 insertions(+), 235 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 14c05e67..305fc282 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -5,60 +5,20 @@ import contextlib import functools import inspect -import json -import mimetypes import os -import re import sys import time -from base64 import b64encode from collections import OrderedDict from collections.abc import MutableMapping -from datetime import datetime, timezone from hashlib import sha256 from importlib import metadata from logging import getLogger -from typing import ( - Any, - Callable, - Generic, - ItemsView, - Iterator, - NoReturn, - TypeVar, - ValuesView, - cast, - overload, -) +from typing import Any, Callable, Generic, ItemsView, Iterator, TypeVar, ValuesView, cast from typing import OrderedDict as OrderedDictType from urllib.parse import parse_qsl, urlencode, urlparse -import aioshutil import psutil -from aiofiles import ospath -from aiofiles.os import remove, rename -from apify_shared.consts import ( - BOOL_ENV_VARS, - BOOL_ENV_VARS_TYPE, - DATETIME_ENV_VARS, - DATETIME_ENV_VARS_TYPE, - FLOAT_ENV_VARS, - FLOAT_ENV_VARS_TYPE, - INTEGER_ENV_VARS, - INTEGER_ENV_VARS_TYPE, - STRING_ENV_VARS_TYPE, - ActorEnvVars, - ApifyEnvVars, -) -from apify_shared.utils import ( - ignore_docs, - is_content_type_json, - is_content_type_text, - is_content_type_xml, - maybe_extract_enum_member_value, -) - -from apify.consts import REQUEST_ID_LENGTH, StorageTypes +from apify_shared.utils import ignore_docs T = TypeVar('T') logger = getLogger(__name__) @@ -115,75 +75,6 @@ def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualP return cast(DualPropertyType, val) -@overload -def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE) -> bool | None: ... 
- - -@overload -def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE, default: bool) -> bool: # noqa: FBT001 - ... - - -@overload -def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE) -> datetime | str | None: ... - - -@overload -def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE, default: datetime) -> datetime | str: ... - - -@overload -def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE) -> float | None: ... - - -@overload -def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE, default: float) -> float: ... - - -@overload -def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE) -> int | None: ... - - -@overload -def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE, default: int) -> int: ... - - -@overload -def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE, default: str) -> str: ... - - -@overload -def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE) -> str | None: ... - - -@overload -def fetch_and_parse_env_var(env_var: ActorEnvVars | ApifyEnvVars) -> Any: ... - - -def fetch_and_parse_env_var(env_var: Any, default: Any = None) -> Any: - env_var_name = str(maybe_extract_enum_member_value(env_var)) - - val = os.getenv(env_var_name) - if not val: - return default - - if env_var in BOOL_ENV_VARS: - return maybe_parse_bool(val) - if env_var in FLOAT_ENV_VARS: - parsed_float = maybe_parse_float(val) - if parsed_float is None: - return default - return parsed_float - if env_var in INTEGER_ENV_VARS: - parsed_int = maybe_parse_int(val) - if parsed_int is None: - return default - return parsed_int - if env_var in DATETIME_ENV_VARS: - return maybe_parse_datetime(val) - return val - - def get_cpu_usage_percent() -> float: return psutil.cpu_percent() @@ -197,33 +88,6 @@ def get_memory_usage_bytes() -> int: return mem -def maybe_parse_bool(val: str | None) -> bool: - if val in {'true', 'True', '1'}: - return True - return False - - -def maybe_parse_datetime(val: str) -> datetime | str: - try: - return datetime.strptime(val, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc) - except ValueError: - return val - - -def maybe_parse_float(val: str) -> float | None: - try: - return float(val) - except ValueError: - return None - - -def maybe_parse_int(val: str) -> int | None: - try: - return int(val) - except ValueError: - return None - - async def run_func_at_interval_async(func: Callable, interval_secs: float) -> None: started_at = time.perf_counter() sleep_until = started_at @@ -241,64 +105,6 @@ async def run_func_at_interval_async(func: Callable, interval_secs: float) -> No await res -async def force_remove(filename: str) -> None: - """JS-like rm(filename, { force: true }).""" - with contextlib.suppress(FileNotFoundError): - await remove(filename) - - -def raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn: # noqa: A002 - client_type = maybe_extract_enum_member_value(client_type) - raise ValueError(f'{client_type} with id "{id}" does not exist.') - - -def raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: - client_type = maybe_extract_enum_member_value(client_type) - raise ValueError(f'{client_type} with {key_name} "{value}" already exists.') - - -def guess_file_extension(content_type: str) -> str | None: - """Guess the file extension based on content type.""" - # e.g. mimetypes.guess_extension('application/json ') does not work... 
- actual_content_type = content_type.split(';')[0].strip() - - # mimetypes.guess_extension returns 'xsl' in this case, because 'application/xxx' is "structured" - # ('text/xml' would be "unstructured" and return 'xml') - # we have to explicitly override it here - if actual_content_type == 'application/xml': - return 'xml' - - # Guess the extension from the mime type - ext = mimetypes.guess_extension(actual_content_type) - - # Remove the leading dot if extension successfully parsed - return ext[1:] if ext is not None else ext - - -def maybe_parse_body(body: bytes, content_type: str) -> Any: - if is_content_type_json(content_type): - return json.loads(body.decode('utf-8')) # Returns any - if is_content_type_xml(content_type) or is_content_type_text(content_type): - return body.decode('utf-8') - return body - - -def unique_key_to_request_id(unique_key: str) -> str: - """Generate request ID based on unique key in a deterministic way.""" - request_id = re.sub(r'(\+|\/|=)', '', b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) - return request_id[:REQUEST_ID_LENGTH] if len(request_id) > REQUEST_ID_LENGTH else request_id - - -async def force_rename(src_dir: str, dst_dir: str) -> None: - """Rename a directory. Checks for existence of soruce directory and removes destination directory if it exists.""" - # Make sure source directory exists - if await ospath.exists(src_dir): - # Remove destination directory if it exists - if await ospath.exists(dst_dir): - await aioshutil.rmtree(dst_dir, ignore_errors=True) - await rename(src_dir, dst_dir) - - ImplementationType = TypeVar('ImplementationType', bound=Callable) MetadataType = TypeVar('MetadataType', bound=Callable) @@ -363,45 +169,6 @@ def is_running_in_ipython() -> bool: return getattr(builtins, '__IPYTHON__', False) -@overload -def budget_ow(value: str | float | bool, predicate: tuple[type, bool], value_name: str) -> None: ... - - -@overload -def budget_ow(value: dict, predicate: dict[str, tuple[type, bool]]) -> None: ... - - -def budget_ow( - value: dict | str | float | bool, - predicate: dict[str, tuple[type, bool]] | tuple[type, bool], - value_name: str | None = None, -) -> None: - """Budget version of ow.""" - - def validate_single(field_value: Any, expected_type: type, required: bool, name: str) -> None: # noqa: FBT001 - if field_value is None and required: - raise ValueError(f'"{name}" is required!') - if (field_value is not None or required) and not isinstance(field_value, expected_type): - raise ValueError(f'"{name}" must be of type "{expected_type.__name__}" but it is "{type(field_value).__name__}"!') - - # Validate object - if isinstance(value, dict) and isinstance(predicate, dict): - for key, (field_type, required) in predicate.items(): - field_value = value.get(key) - validate_single(field_value, field_type, required, key) - # Validate "primitive" - elif isinstance(value, (int, str, float, bool)) and isinstance(predicate, tuple) and value_name is not None: - field_type, required = predicate - validate_single(value, field_type, required, value_name) - else: - raise ValueError('Wrong input!') - - -PARSE_DATE_FIELDS_MAX_DEPTH = 3 -PARSE_DATE_FIELDS_KEY_SUFFIX = 'At' -ListOrDictOrAny = TypeVar('ListOrDictOrAny', list, dict, Any) - - def compute_short_hash(data: bytes, *, length: int = 8) -> str: """Computes a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it. 
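
Note: the env-var helpers dropped above (fetch_and_parse_env_var and the maybe_parse_* family) are superseded by the pydantic-based Configuration introduced in the next commit, where each option declares its environment variable through a Field alias and pydantic performs the type coercion. A rough usage sketch, assuming the crawlee base Configuration behaves as a pydantic-settings model that reads the environment on instantiation (this snippet is illustrative only, not part of the patch):

    import os

    from apify.config import Configuration

    # The environment only ever holds strings...
    os.environ['APIFY_PROXY_PORT'] = '9000'

    # ...but instantiating the settings model coerces the value to the declared
    # type (int for proxy_port), so no maybe_parse_int()-style helper is needed.
    config = Configuration()
    assert config.proxy_port == 9000
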
From b026bc03c3417ba6f73af146976d4869ba1ff89b Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 7 Jun 2024 16:45:21 +0200 Subject: [PATCH 07/68] Rework configuration parsing --- src/apify/config.py | 159 ++++++++++--------------------- src/apify/proxy_configuration.py | 2 +- 2 files changed, 52 insertions(+), 109 deletions(-) diff --git a/src/apify/config.py b/src/apify/config.py index 86e82e74..8234e892 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -1,127 +1,70 @@ +# ruff: noqa: TCH002 TCH003 from __future__ import annotations -from apify_shared.consts import ActorEnvVars, ApifyEnvVars +from datetime import datetime, timedelta +from typing import Annotated -from apify._utils import fetch_and_parse_env_var +from crawlee._utils.models import timedelta_ms +from crawlee.configuration import Configuration as CrawleeConfiguration +from pydantic import Field +from typing_extensions import Self -class Configuration: +class Configuration(CrawleeConfiguration): """A class for specifying the configuration of an actor. Can be used either globally via `Configuration.get_global_configuration()`, or it can be specific to each `Actor` instance on the `actor.config` property. """ - _default_instance: Configuration | None = None + actor_id: Annotated[str | None, Field(alias='actor_id')] = None + actor_run_id: Annotated[str | None, Field(alias='actor_run_id')] = None + actor_build_id: Annotated[str | None, Field()] = None + actor_build_number: Annotated[str | None, Field()] = None + actor_task_id: Annotated[str | None, Field(alias='actor_task_id')] = None + actor_events_ws_url: Annotated[str | None, Field(alias='actor_events_websocket_url')] = None + api_base_url: Annotated[str, Field(alias='apify_api_base_url')] = 'https://api.apify.com' + api_public_base_url: Annotated[str, Field(alias='apify_api_public_base_url')] = 'https://api.apify.com' + default_dataset_id: Annotated[str, Field(alias='actor_default_dataset_id')] = 'default' + default_key_value_store_id: Annotated[str, Field(alias='actor_default_key_value_store_id')] = 'default' + default_request_queue_id: Annotated[str, Field(alias='actor_default_request_queue_id')] = 'default' + disable_browser_sandbox: Annotated[bool, Field(alias='apify_disable_browser_sandbox')] = False + headless: Annotated[bool, Field(alias='apify_headless')] = True + input_key: Annotated[str, Field(alias='actor_input_key')] = 'INPUT' + input_secrets_private_key_file: Annotated[str | None, Field(alias='apify_input_secrets_private_key_file')] = None + input_secrets_private_key_passphrase: Annotated[str | None, Field(alias='apify_input_secrets_private_key_passphrase')] = None + is_at_home: Annotated[bool, Field(alias='apify_is_at_home')] = False + max_paid_dataset_items: Annotated[int | None, Field(alias='actor_max_paid_dataset_items')] = None + memory_mbytes: Annotated[int | None, Field(alias='actor_memory_mbytes')] = None + meta_origin: Annotated[str | None, Field(alias='apify_meta_origin')] = None + metamorph_after_sleep: Annotated[timedelta_ms, Field('apify_metamorph_after_sleep_millis')] = timedelta(minutes=5) + persist_state_interval: Annotated[timedelta_ms, Field('apify_persist_state_interval_millis')] = timedelta(minutes=1) + persist_storage: Annotated[bool, Field(alias='apify_persist_storage')] = True + proxy_hostname: Annotated[str, Field(alias='apify_proxy_hostname')] = 'proxy.apify.com' + proxy_password: Annotated[str | None, Field(alias='apify_proxy_password')] = None + proxy_port: Annotated[int, Field(alias='apify_proxy_port')] = 8000 + 
proxy_status_url: Annotated[str, Field(alias='apify_proxy_status_url')] = 'http://proxy.apify.com' + purge_on_start: Annotated[bool, Field(alias='apify_purge_on_start')] = False + started_at: Annotated[datetime | None, Field(alias='actor_started_at')] = None + timeout_at: Annotated[datetime | None, Field(alias='actor_timeout_at')] = None + token: Annotated[str | None, Field(alias='apify_token')] = None + user_id: Annotated[str | None, Field(alias='apify_user_id')] = None + web_server_port: Annotated[int, Field(alias='actor_web_server_port')] = 4321 + web_server_url: Annotated[str, Field(alias='actor_web_server_url')] = 'http://localhost:4321' + xvfb: Annotated[bool, Field(alias='apify_xvfb')] = False + system_info_interval: Annotated[timedelta_ms, Field(alias='apify_system_info_interval_millis')] = timedelta(minutes=1) - def __init__( - self: Configuration, - *, - api_base_url: str | None = None, - api_public_base_url: str | None = None, - container_port: int | None = None, - container_url: str | None = None, - default_dataset_id: str | None = None, - default_key_value_store_id: str | None = None, - default_request_queue_id: str | None = None, - input_key: str | None = None, - max_used_cpu_ratio: float | None = None, - metamorph_after_sleep_millis: int | None = None, - persist_state_interval_millis: int | None = None, - persist_storage: bool | None = None, - proxy_hostname: str | None = None, - proxy_password: str | None = None, - proxy_port: int | None = None, - proxy_status_url: str | None = None, - purge_on_start: bool | None = None, - token: str | None = None, - system_info_interval_millis: int | None = None, - ) -> None: - """Create a `Configuration` instance. - - All the parameters are loaded by default from environment variables when running on the Apify platform. - You can override them here in the Configuration constructor, which might be useful for local testing of your actors. - - Args: - api_base_url (str, optional): The URL of the Apify API. - This is the URL actually used for connecting to the API, so it can contain an IP address when running in a container on the platform. - api_public_base_url (str, optional): The public URL of the Apify API. - This will always contain the public URL of the API, even when running in a container on the platform. - Useful for generating shareable URLs to key-value store records or datasets. - container_port (int, optional): The port on which the container can listen for HTTP requests. - container_url (str, optional): The URL on which the container can listen for HTTP requests. - default_dataset_id (str, optional): The ID of the default dataset for the actor. - default_key_value_store_id (str, optional): The ID of the default key-value store for the actor. - default_request_queue_id (str, optional): The ID of the default request queue for the actor. - input_key (str, optional): The key of the input record in the actor's default key-value store - max_used_cpu_ratio (float, optional): The CPU usage above which the SYSTEM_INFO event will report the CPU is overloaded. - metamorph_after_sleep_millis (int, optional): How long should the actor sleep after calling metamorph. - persist_state_interval_millis (int, optional): How often should the actor emit the PERSIST_STATE event. - persist_storage (bool, optional): Whether the actor should persist its used storages to the filesystem when running locally. - proxy_hostname (str, optional): The hostname of Apify Proxy. - proxy_password (str, optional): The password for Apify Proxy. 
- proxy_port (str, optional): The port of Apify Proxy. - proxy_status_url (str, optional): The URL on which the Apify Proxy status page is available. - purge_on_start (str, optional): Whether the actor should purge its default storages on startup, when running locally. - token (str, optional): The API token for the Apify API this actor should use. - system_info_interval_millis (str, optional): How often should the actor emit the SYSTEM_INFO event when running locally. - """ - # TODO: Document all these members - # https://github.com/apify/apify-sdk-python/issues/147 - self.actor_build_id = fetch_and_parse_env_var(ActorEnvVars.BUILD_ID) - self.actor_build_number = fetch_and_parse_env_var(ActorEnvVars.BUILD_NUMBER) - self.actor_events_ws_url = fetch_and_parse_env_var(ActorEnvVars.EVENTS_WEBSOCKET_URL) - self.actor_id = fetch_and_parse_env_var(ActorEnvVars.ID) - self.actor_run_id = fetch_and_parse_env_var(ActorEnvVars.RUN_ID) - self.actor_task_id = fetch_and_parse_env_var(ActorEnvVars.TASK_ID) - self.api_base_url = api_base_url or fetch_and_parse_env_var(ApifyEnvVars.API_BASE_URL, 'https://api.apify.com') - self.api_public_base_url = api_public_base_url or fetch_and_parse_env_var(ApifyEnvVars.API_PUBLIC_BASE_URL, 'https://api.apify.com') - self.chrome_executable_path = fetch_and_parse_env_var(ApifyEnvVars.CHROME_EXECUTABLE_PATH) - self.container_port = container_port or fetch_and_parse_env_var(ActorEnvVars.WEB_SERVER_PORT, 4321) - self.container_url = container_url or fetch_and_parse_env_var(ActorEnvVars.WEB_SERVER_URL, 'http://localhost:4321') - self.dedicated_cpus = fetch_and_parse_env_var(ApifyEnvVars.DEDICATED_CPUS) - self.default_browser_path = fetch_and_parse_env_var(ApifyEnvVars.DEFAULT_BROWSER_PATH) - self.default_dataset_id = default_dataset_id or fetch_and_parse_env_var(ActorEnvVars.DEFAULT_DATASET_ID, 'default') - self.default_key_value_store_id = default_key_value_store_id or fetch_and_parse_env_var(ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID, 'default') - self.default_request_queue_id = default_request_queue_id or fetch_and_parse_env_var(ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID, 'default') - self.disable_browser_sandbox = fetch_and_parse_env_var(ApifyEnvVars.DISABLE_BROWSER_SANDBOX, default=False) - self.headless = fetch_and_parse_env_var(ApifyEnvVars.HEADLESS, default=True) - self.input_key = input_key or fetch_and_parse_env_var(ActorEnvVars.INPUT_KEY, 'INPUT') - self.input_secrets_private_key_file = fetch_and_parse_env_var(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_FILE) - self.input_secrets_private_key_passphrase = fetch_and_parse_env_var(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_PASSPHRASE) - self.is_at_home = fetch_and_parse_env_var(ApifyEnvVars.IS_AT_HOME, default=False) - self.max_used_cpu_ratio = max_used_cpu_ratio or fetch_and_parse_env_var(ApifyEnvVars.MAX_USED_CPU_RATIO, 0.95) - self.memory_mbytes = fetch_and_parse_env_var(ActorEnvVars.MEMORY_MBYTES) - self.meta_origin = fetch_and_parse_env_var(ApifyEnvVars.META_ORIGIN) - self.metamorph_after_sleep_millis = metamorph_after_sleep_millis or fetch_and_parse_env_var(ApifyEnvVars.METAMORPH_AFTER_SLEEP_MILLIS, 300000) - self.persist_state_interval_millis = persist_state_interval_millis or fetch_and_parse_env_var( - ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS, 60000 - ) - self.persist_storage = persist_storage or fetch_and_parse_env_var(ApifyEnvVars.PERSIST_STORAGE, default=True) - self.proxy_hostname = proxy_hostname or fetch_and_parse_env_var(ApifyEnvVars.PROXY_HOSTNAME, 'proxy.apify.com') - self.proxy_password = proxy_password or 
fetch_and_parse_env_var(ApifyEnvVars.PROXY_PASSWORD) - self.proxy_port = proxy_port or fetch_and_parse_env_var(ApifyEnvVars.PROXY_PORT, 8000) - self.proxy_status_url = proxy_status_url or fetch_and_parse_env_var(ApifyEnvVars.PROXY_STATUS_URL, 'http://proxy.apify.com') - self.purge_on_start = purge_on_start or fetch_and_parse_env_var(ApifyEnvVars.PURGE_ON_START, default=False) - self.started_at = fetch_and_parse_env_var(ActorEnvVars.STARTED_AT) - self.timeout_at = fetch_and_parse_env_var(ActorEnvVars.TIMEOUT_AT) - self.token = token or fetch_and_parse_env_var(ApifyEnvVars.TOKEN) - self.user_id = fetch_and_parse_env_var(ApifyEnvVars.USER_ID) - self.xvfb = fetch_and_parse_env_var(ApifyEnvVars.XVFB, default=False) - self.system_info_interval_millis = system_info_interval_millis or fetch_and_parse_env_var(ApifyEnvVars.SYSTEM_INFO_INTERVAL_MILLIS, 60000) + # TODO chrome_executable_path, container_port, container_url, dedicated_cpus, default_browser_path, + # disable_browser_sandbox, input_secrets_private_key_file, input_secrets_private_key_passphrase, max_used_cpu_ratio @classmethod - def _get_default_instance(cls: type[Configuration]) -> Configuration: - if cls._default_instance is None: - cls._default_instance = cls() - - return cls._default_instance - - @classmethod - def get_global_configuration(cls: type[Configuration]) -> Configuration: + def get_global_configuration(cls) -> Self: """Retrive the global configuration. The global configuration applies when you call actor methods via their static versions, e.g. `Actor.init()`. Also accessible via `Actor.config`. """ - return cls._get_default_instance() + if cls._default_instance is None: + cls._default_instance = cls() + + return cls._default_instance diff --git a/src/apify/proxy_configuration.py b/src/apify/proxy_configuration.py index 0370959b..c8c84510 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/proxy_configuration.py @@ -178,7 +178,7 @@ def __init__( 'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration' ) - self._actor_config = _actor_config or Configuration._get_default_instance() + self._actor_config = _actor_config or Configuration.get_global_configuration() self._apify_client = _apify_client self._hostname = self._actor_config.proxy_hostname From 79817e73206ce9fd508d69847aa6b9920dd2cd77 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 10 Jun 2024 13:30:04 +0200 Subject: [PATCH 08/68] Implement force_cloud option --- pyproject.toml | 2 +- src/apify/actor.py | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5a4f9a19..65e1c0f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", - "crawlee >= 0.0.5b3", + "crawlee >= 0.0.5b4", "cryptography >= 39.0.0", "httpx >= 0.24.0", "psutil >= 5.9.0", diff --git a/src/apify/actor.py b/src/apify/actor.py index 2b14325c..faed4c6c 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -6,7 +6,7 @@ import os import sys from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING, Any, Awaitable, Callable, TypeVar, cast +from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorEventTypes, ActorExitCodes, ApifyEnvVars, WebhookEventType @@ -34,6 +34,7 @@ if TYPE_CHECKING: import logging + from collections.abc import Awaitable from types import TracebackType T = 
TypeVar('T') @@ -544,7 +545,11 @@ async def _open_dataset_internal( ) -> Dataset: self._raise_if_not_initialized() - return await Dataset.open(id=id, name=name, configuration=self._configuration) + configuration_updates = {} + if force_cloud: + configuration_updates['is_at_home'] = True + + return await Dataset.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) @classmethod async def open_key_value_store( @@ -582,7 +587,11 @@ async def _open_key_value_store_internal( ) -> KeyValueStore: self._raise_if_not_initialized() - return await KeyValueStore.open(id=id, name=name, configuration=self._configuration) + configuration_updates = {} + if force_cloud: + configuration_updates['is_at_home'] = True + + return await KeyValueStore.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) @classmethod async def open_request_queue( @@ -621,7 +630,11 @@ async def _open_request_queue_internal( ) -> RequestQueue: self._raise_if_not_initialized() - return await RequestQueue.open(id=id, name=name, configuration=self._configuration) + configuration_updates = {} + if force_cloud: + configuration_updates['is_at_home'] = True + + return await RequestQueue.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) @classmethod async def push_data(cls: type[Actor], data: Any) -> None: From fe70d9ae915398f8f70d6b8e62152897baa2250e Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 10 Jun 2024 13:44:12 +0200 Subject: [PATCH 09/68] Fix Actor.get_env --- src/apify/actor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index faed4c6c..07cd6c72 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -16,7 +16,6 @@ from apify._crypto import decrypt_input_secrets, load_private_key from apify._utils import ( dualproperty, - fetch_and_parse_env_var, get_cpu_usage_percent, get_memory_usage_bytes, get_system_info, @@ -793,7 +792,9 @@ def get_env(cls: type[Actor]) -> dict: def _get_env_internal(self: Actor) -> dict: self._raise_if_not_initialized() - return {env_var.name.lower(): fetch_and_parse_env_var(env_var) for env_var in [*ActorEnvVars, *ApifyEnvVars]} + config = self._configuration.model_dump(by_alias=True) + env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]} + return {option_name: config[env_var] for env_var, option_name in env_vars} @classmethod async def start( From 01a3d239e6969c94cd32e9453ba5a2fb417ce7ec Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 10 Jun 2024 14:53:25 +0200 Subject: [PATCH 10/68] Use RecurringTask from crawlee --- src/apify/actor.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 07cd6c72..0669f192 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -11,18 +11,11 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorEventTypes, ActorExitCodes, ApifyEnvVars, WebhookEventType from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value +from crawlee._utils.recurring_task import RecurringTask from crawlee.storage_client_manager import StorageClientManager from apify._crypto import decrypt_input_secrets, load_private_key -from apify._utils import ( - dualproperty, - get_cpu_usage_percent, - get_memory_usage_bytes, - get_system_info, - is_running_in_ipython, - 
run_func_at_interval_async, - wrap_internal, -) +from apify._utils import dualproperty, get_cpu_usage_percent, get_memory_usage_bytes, get_system_info, is_running_in_ipython, wrap_internal from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration from apify.consts import EVENT_LISTENERS_TIMEOUT_SECS @@ -127,6 +120,16 @@ def __init__(self: Actor, config: Configuration | None = None) -> None: self._is_initialized = False + self._system_info_task = RecurringTask(self._send_system_info, self._configuration.system_info_interval) + self._persist_state_task = RecurringTask(self._send_persist_state, self._configuration.persist_state_interval) + + def _send_system_info(self) -> None: + if not self._configuration.is_at_home: + self._event_manager.emit(ActorEventTypes.SYSTEM_INFO, self.get_system_info()) + + def _send_persist_state(self) -> None: + self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': False}) + @ignore_docs async def __aenter__(self: Actor) -> Actor: """Initialize the Actor. @@ -233,20 +236,8 @@ async def _init_internal(self: Actor) -> None: await self._event_manager.init() - self._send_persist_state_interval_task = asyncio.create_task( - run_func_at_interval_async( - lambda: self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': False}), - self._configuration.persist_state_interval_millis / 1000, - ), - ) - - if not self.is_at_home(): - self._send_system_info_interval_task = asyncio.create_task( - run_func_at_interval_async( - lambda: self._event_manager.emit(ActorEventTypes.SYSTEM_INFO, self.get_system_info()), - self._configuration.system_info_interval_millis / 1000, - ), - ) + self._system_info_task.start() + self._persist_state_task.start() self._event_manager.on(ActorEventTypes.MIGRATING, self._respond_to_migrating_event) From c943210efba956e947ddc89f2e7c91add4456607 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 10 Jun 2024 16:30:28 +0200 Subject: [PATCH 11/68] Use timedelta instead of numbers of seconds/milliseconds in Actor class --- src/apify/actor.py | 130 ++++++++++++++++++++++---------------------- src/apify/consts.py | 12 +--- 2 files changed, 68 insertions(+), 74 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 0669f192..33496586 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -18,7 +18,7 @@ from apify._utils import dualproperty, get_cpu_usage_percent, get_memory_usage_bytes, get_system_info, is_running_in_ipython, wrap_internal from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration -from apify.consts import EVENT_LISTENERS_TIMEOUT_SECS +from apify.consts import EVENT_LISTENERS_TIMEOUT from apify.event_manager import EventManager from apify.log import logger from apify.proxy_configuration import ProxyConfiguration @@ -288,7 +288,7 @@ async def exit( cls: type[Actor], *, exit_code: int = 0, - event_listeners_timeout_secs: float | None = EVENT_LISTENERS_TIMEOUT_SECS, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, status_message: str | None = None, cleanup_timeout: timedelta = timedelta(seconds=30), ) -> None: @@ -302,13 +302,13 @@ async def exit( Args: exit_code (int, optional): The exit code with which the actor should fail (defaults to `0`). - event_listeners_timeout_secs (float, optional): How long should the actor wait for actor event listeners to finish before exiting. 
+ event_listeners_timeout (timedelta, optional): How long should the actor wait for actor event listeners to finish before exiting. status_message (str, optional): The final status message that the actor should display. cleanup_timeout (timedelta, optional): How long we should wait for event listeners. """ return await cls._get_default_instance().exit( exit_code=exit_code, - event_listeners_timeout_secs=event_listeners_timeout_secs, + event_listeners_timeout=event_listeners_timeout, status_message=status_message, cleanup_timeout=cleanup_timeout, ) @@ -317,7 +317,7 @@ async def _exit_internal( self: Actor, *, exit_code: int = 0, - event_listeners_timeout_secs: float | None = EVENT_LISTENERS_TIMEOUT_SECS, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, status_message: str | None = None, cleanup_timeout: timedelta = timedelta(seconds=30), ) -> None: @@ -343,7 +343,7 @@ async def finalize() -> None: # Sleep for a bit so that the listeners have a chance to trigger await asyncio.sleep(0.1) - await self._event_manager.close(event_listeners_timeout_secs=event_listeners_timeout_secs) + await self._event_manager.close(event_listeners_timeout_secs=event_listeners_timeout.total_seconds() if event_listeners_timeout else None) await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) self._is_initialized = False @@ -449,8 +449,8 @@ def new_client( token: str | None = None, api_url: str | None = None, max_retries: int | None = None, - min_delay_between_retries_millis: int | None = None, - timeout_secs: int | None = None, + min_delay_between_retries: timedelta | None = None, + timeout: timedelta | None = None, ) -> ApifyClientAsync: """Return a new instance of the Apify API client. @@ -464,16 +464,16 @@ def new_client( token (str, optional): The Apify API token api_url (str, optional): The URL of the Apify API server to which to connect to. 
Defaults to https://api.apify.com max_retries (int, optional): How many times to retry a failed request at most - min_delay_between_retries_millis (int, optional): How long will the client wait between retrying requests + min_delay_between_retries (timedelta, optional): How long will the client wait between retrying requests (increases exponentially from this value) - timeout_secs (int, optional): The socket timeout of the HTTP requests sent to the Apify API + timeout (timedelta, optional): The socket timeout of the HTTP requests sent to the Apify API """ return cls._get_default_instance().new_client( token=token, api_url=api_url, max_retries=max_retries, - min_delay_between_retries_millis=min_delay_between_retries_millis, - timeout_secs=timeout_secs, + min_delay_between_retries=min_delay_between_retries, + timeout=timeout, ) def _new_client_internal( @@ -482,8 +482,8 @@ def _new_client_internal( token: str | None = None, api_url: str | None = None, max_retries: int | None = None, - min_delay_between_retries_millis: int | None = None, - timeout_secs: int | None = None, + min_delay_between_retries: timedelta | None = None, + timeout: timedelta | None = None, ) -> ApifyClientAsync: token = token or self._configuration.token api_url = api_url or self._configuration.api_base_url @@ -491,8 +491,8 @@ def _new_client_internal( token=token, api_url=api_url, max_retries=max_retries, - min_delay_between_retries_millis=min_delay_between_retries_millis, - timeout_secs=timeout_secs, + min_delay_between_retries_millis=int(min_delay_between_retries.total_seconds() * 1000) if min_delay_between_retries is not None else None, + timeout_secs=int(timeout.total_seconds()) if timeout else None, ) def _get_storage_client(self: Actor, force_cloud: bool) -> ApifyClientAsync | None: # noqa: FBT001 @@ -797,7 +797,7 @@ async def start( content_type: str | None = None, build: str | None = None, memory_mbytes: int | None = None, - timeout_secs: int | None = None, + timeout: int | None = None, wait_for_finish: int | None = None, webhooks: list[dict] | None = None, ) -> dict: @@ -814,7 +814,7 @@ async def start( By default, the run uses the build specified in the default run configuration for the actor (typically latest). memory_mbytes (int, optional): Memory limit for the run, in megabytes. By default, the run uses a memory limit specified in the default run configuration for the actor. - timeout_secs (int, optional): Optional timeout for the run, in seconds. + timeout (timedelta, optional): Optional timeout for the run, in seconds. By default, the run uses timeout specified in the default run configuration for the actor. wait_for_finish (int, optional): The maximum number of seconds the server waits for the run to finish. By default, it is 0, the maximum value is 300. 
@@ -837,7 +837,7 @@ async def start( content_type=content_type, build=build, memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, + timeout=timeout, wait_for_finish=wait_for_finish, webhooks=webhooks, ) @@ -851,7 +851,7 @@ async def _start_internal( content_type: str | None = None, build: str | None = None, memory_mbytes: int | None = None, - timeout_secs: int | None = None, + timeout: timedelta | None = None, wait_for_finish: int | None = None, webhooks: list[dict] | None = None, ) -> dict: @@ -864,7 +864,7 @@ async def _start_internal( content_type=content_type, build=build, memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, + timeout_secs=int(timeout.total_seconds()) if timeout is not None else None, wait_for_finish=wait_for_finish, webhooks=webhooks, ) @@ -922,13 +922,13 @@ async def call( content_type: str | None = None, build: str | None = None, memory_mbytes: int | None = None, - timeout_secs: int | None = None, + timeout: timedelta | None = None, webhooks: list[dict] | None = None, - wait_secs: int | None = None, + wait: timedelta | None = None, ) -> dict | None: """Start an actor on the Apify Platform and wait for it to finish before returning. - It waits indefinitely, unless the wait_secs argument is provided. + It waits indefinitely, unless the wait argument is provided. Args: actor_id (str): The ID of the actor to be run. @@ -939,12 +939,12 @@ async def call( By default, the run uses the build specified in the default run configuration for the actor (typically latest). memory_mbytes (int, optional): Memory limit for the run, in megabytes. By default, the run uses a memory limit specified in the default run configuration for the actor. - timeout_secs (int, optional): Optional timeout for the run, in seconds. + timeout (timedelta, optional): Optional timeout for the run, in seconds. By default, the run uses timeout specified in the default run configuration for the actor. webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the actor run, which can be used to receive a notification, e.g. when the actor finished or failed. If you already have a webhook set up for the actor, you do not have to add it again here. - wait_secs (int, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. + wait(timedelta, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. 
Returns: dict: Info about the started actor run @@ -956,9 +956,9 @@ async def call( content_type=content_type, build=build, memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, + timeout=timeout, webhooks=webhooks, - wait_secs=wait_secs, + wait=wait, ) async def _call_internal( @@ -970,9 +970,9 @@ async def _call_internal( content_type: str | None = None, build: str | None = None, memory_mbytes: int | None = None, - timeout_secs: int | None = None, + timeout: timedelta | None = None, webhooks: list[dict] | None = None, - wait_secs: int | None = None, + wait: timedelta | None = None, ) -> dict | None: self._raise_if_not_initialized() @@ -983,9 +983,9 @@ async def _call_internal( content_type=content_type, build=build, memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, + timeout_secs=int(timeout.total_seconds()) if timeout is not None else None, webhooks=webhooks, - wait_secs=wait_secs, + wait_secs=int(wait.total_seconds()) if wait is not None else None, ) @classmethod @@ -996,14 +996,14 @@ async def call_task( *, build: str | None = None, memory_mbytes: int | None = None, - timeout_secs: int | None = None, + timeout: timedelta | None = None, webhooks: list[dict] | None = None, - wait_secs: int | None = None, + wait: timedelta | None = None, token: str | None = None, ) -> dict | None: """Start an actor task on the Apify Platform and wait for it to finish before returning. - It waits indefinitely, unless the wait_secs argument is provided. + It waits indefinitely, unless the wait argument is provided. Note that an actor task is a saved input configuration and options for an actor. If you want to run an actor directly rather than an actor task, please use the `Actor.call` @@ -1017,12 +1017,12 @@ async def call_task( By default, the run uses the build specified in the default run configuration for the actor (typically latest). memory_mbytes (int, optional): Memory limit for the run, in megabytes. By default, the run uses a memory limit specified in the default run configuration for the actor. - timeout_secs (int, optional): Optional timeout for the run, in seconds. + timeout (timedelta, optional): Optional timeout for the run, in seconds. By default, the run uses timeout specified in the default run configuration for the actor. webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the actor run, which can be used to receive a notification, e.g. when the actor finished or failed. If you already have a webhook set up for the actor, you do not have to add it again here. - wait_secs (int, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. + wait (timedelta, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. 
Returns: dict: Info about the started actor run @@ -1033,9 +1033,9 @@ async def call_task( token=token, build=build, memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, + timeout=timeout, webhooks=webhooks, - wait_secs=wait_secs, + wait=wait, ) async def _call_task_internal( @@ -1045,9 +1045,9 @@ async def _call_task_internal( *, build: str | None = None, memory_mbytes: int | None = None, - timeout_secs: int | None = None, + timeout: timedelta | None = None, webhooks: list[dict] | None = None, - wait_secs: int | None = None, + wait: timedelta | None = None, token: str | None = None, ) -> dict | None: self._raise_if_not_initialized() @@ -1058,9 +1058,9 @@ async def _call_task_internal( task_input=task_input, build=build, memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, + timeout_secs=int(timeout.total_seconds()) if timeout is not None else None, webhooks=webhooks, - wait_secs=wait_secs, + wait_secs=int(wait.total_seconds()) if wait is not None else None, ) @classmethod @@ -1071,7 +1071,7 @@ async def metamorph( *, target_actor_build: str | None = None, content_type: str | None = None, - custom_after_sleep_millis: int | None = None, + custom_after_sleep: timedelta | None = None, ) -> None: """Transform this actor run to an actor run of a different actor. @@ -1085,7 +1085,7 @@ async def metamorph( target_actor_build (str, optional): The build of the target actor. It can be either a build tag or build number. By default, the run uses the build specified in the default run configuration for the target actor (typically the latest build). content_type (str, optional): The content type of the input. - custom_after_sleep_millis (int, optional): How long to sleep for after the metamorph, to wait for the container to be stopped. + custom_after_sleep (timedelta, optional): How long to sleep for after the metamorph, to wait for the container to be stopped. Returns: dict: The actor run data. @@ -1095,7 +1095,7 @@ async def metamorph( target_actor_build=target_actor_build, run_input=run_input, content_type=content_type, - custom_after_sleep_millis=custom_after_sleep_millis, + custom_after_sleep=custom_after_sleep, ) async def _metamorph_internal( @@ -1105,7 +1105,7 @@ async def _metamorph_internal( *, target_actor_build: str | None = None, content_type: str | None = None, - custom_after_sleep_millis: int | None = None, + custom_after_sleep: timedelta | None = None, ) -> None: self._raise_if_not_initialized() @@ -1113,8 +1113,8 @@ async def _metamorph_internal( self.log.error('Actor.metamorph() is only supported when running on the Apify platform.') return - if not custom_after_sleep_millis: - custom_after_sleep_millis = self._configuration.metamorph_after_sleep_millis + if not custom_after_sleep: + custom_after_sleep = self._configuration.metamorph_after_sleep # If is_at_home() is True, config.actor_run_id is always set assert self._configuration.actor_run_id is not None # noqa: S101 @@ -1126,34 +1126,34 @@ async def _metamorph_internal( content_type=content_type, ) - if custom_after_sleep_millis: - await asyncio.sleep(custom_after_sleep_millis / 1000) + if custom_after_sleep: + await asyncio.sleep(custom_after_sleep.total_seconds()) @classmethod async def reboot( cls: type[Actor], *, - event_listeners_timeout_secs: int | None = EVENT_LISTENERS_TIMEOUT_SECS, - custom_after_sleep_millis: int | None = None, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, + custom_after_sleep: timedelta | None = None, ) -> None: """Internally reboot this actor. 
The system stops the current container and starts a new one, with the same run ID and default storages.

 Args: - event_listeners_timeout_secs (int, optional): How long should the actor wait for actor event listeners to finish before exiting - custom_after_sleep_millis (int, optional): How long to sleep for after the reboot, to wait for the container to be stopped. + event_listeners_timeout (timedelta, optional): How long the actor should wait for event listeners to finish before exiting. + custom_after_sleep (timedelta, optional): How long to sleep for after the reboot, to wait for the container to be stopped. """ return await cls._get_default_instance().reboot( - event_listeners_timeout_secs=event_listeners_timeout_secs, - custom_after_sleep_millis=custom_after_sleep_millis, + event_listeners_timeout=event_listeners_timeout, + custom_after_sleep=custom_after_sleep, ) async def _reboot_internal( self: Actor, *, - event_listeners_timeout_secs: int | None = EVENT_LISTENERS_TIMEOUT_SECS, - custom_after_sleep_millis: int | None = None, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, + custom_after_sleep: timedelta | None = None, ) -> None: self._raise_if_not_initialized() @@ -1161,21 +1161,23 @@ async def _reboot_internal( self.log.error('Actor.reboot() is only supported when running on the Apify platform.') return - if not custom_after_sleep_millis: - custom_after_sleep_millis = self._configuration.metamorph_after_sleep_millis + if not custom_after_sleep: + custom_after_sleep = self._configuration.metamorph_after_sleep await self._cancel_event_emitting_intervals() self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': True}) self._was_final_persist_state_emitted = True - await self._event_manager.close(event_listeners_timeout_secs=event_listeners_timeout_secs) + await self._event_manager.close( + event_listeners_timeout_secs=int(event_listeners_timeout.total_seconds()) if event_listeners_timeout is not None else None, + ) assert self._configuration.actor_run_id is not None # noqa: S101 await self._apify_client.run(self._configuration.actor_run_id).reboot() - if custom_after_sleep_millis: - await asyncio.sleep(custom_after_sleep_millis / 1000) + if custom_after_sleep: + await asyncio.sleep(custom_after_sleep.total_seconds()) @classmethod async def add_webhook( diff --git a/src/apify/consts.py b/src/apify/consts.py index 47d2ca7b..4ed8bba7 100644 --- a/src/apify/consts.py +++ b/src/apify/consts.py @@ -2,18 +2,10 @@ import re import warnings +from datetime import timedelta from enum import Enum from typing import Any -from apify_shared.consts import BOOL_ENV_VARS as _BOOL_ENV_VARS # noqa: F401 -from apify_shared.consts import DATETIME_ENV_VARS as _DATETIME_ENV_VARS # noqa: F401 -from apify_shared.consts import FLOAT_ENV_VARS as _FLOAT_ENV_VARS # noqa: F401 -from apify_shared.consts import INTEGER_ENV_VARS as _INTEGER_ENV_VARS # noqa: F401 -from apify_shared.consts import STRING_ENV_VARS as _STRING_ENV_VARS # noqa: F401 -from apify_shared.consts import ActorEventTypes as _ActorEventTypes # noqa: F401 -from apify_shared.consts import ActorExitCodes as _ActorExitCodes # noqa: F401 -from apify_shared.consts import ApifyEnvVars as _ApifyEnvVars # noqa: F401 - DEPRECATED_NAMES = [ 'BOOL_ENV_VARS', 'DATETIME_ENV_VARS', @@ -57,7 +49,7 @@ class StorageTypes(str, Enum): REQUEST_QUEUE_HEAD_MAX_LIMIT = 1000 -EVENT_LISTENERS_TIMEOUT_SECS = 5 +EVENT_LISTENERS_TIMEOUT = timedelta(seconds=5) BASE64_REGEXP = '[-A-Za-z0-9+/]*={0,3}' ENCRYPTED_INPUT_VALUE_PREFIX = 
'ENCRYPTED_VALUE' From 57a402cd1b500ffd15507e47b695274637e3dfb1 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 12 Jun 2024 13:51:06 +0200 Subject: [PATCH 12/68] Turns out there shouldn't be any recurring tasks in Actor --- src/apify/_utils.py | 36 ---------------------- src/apify/actor.py | 74 ++------------------------------------------- 2 files changed, 2 insertions(+), 108 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 305fc282..22823e13 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -1,13 +1,8 @@ from __future__ import annotations -import asyncio import builtins -import contextlib import functools -import inspect -import os import sys -import time from collections import OrderedDict from collections.abc import MutableMapping from hashlib import sha256 @@ -17,7 +12,6 @@ from typing import OrderedDict as OrderedDictType from urllib.parse import parse_qsl, urlencode, urlparse -import psutil from apify_shared.utils import ignore_docs T = TypeVar('T') @@ -75,36 +69,6 @@ def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualP return cast(DualPropertyType, val) -def get_cpu_usage_percent() -> float: - return psutil.cpu_percent() - - -def get_memory_usage_bytes() -> int: - current_process = psutil.Process(os.getpid()) - mem = int(current_process.memory_info().rss or 0) - for child in current_process.children(recursive=True): - with contextlib.suppress(psutil.NoSuchProcess): - mem += int(child.memory_info().rss or 0) - return mem - - -async def run_func_at_interval_async(func: Callable, interval_secs: float) -> None: - started_at = time.perf_counter() - sleep_until = started_at - while True: - now = time.perf_counter() - sleep_until += interval_secs - while sleep_until < now: - sleep_until += interval_secs - - sleep_for_secs = sleep_until - now - await asyncio.sleep(sleep_for_secs) - - res = func() - if inspect.isawaitable(res): - await res - - ImplementationType = TypeVar('ImplementationType', bound=Callable) MetadataType = TypeVar('MetadataType', bound=Callable) diff --git a/src/apify/actor.py b/src/apify/actor.py index 33496586..b23842d5 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -1,21 +1,19 @@ from __future__ import annotations import asyncio -import contextlib import inspect import os import sys -from datetime import datetime, timedelta, timezone +from datetime import timedelta from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorEventTypes, ActorExitCodes, ApifyEnvVars, WebhookEventType from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value -from crawlee._utils.recurring_task import RecurringTask from crawlee.storage_client_manager import StorageClientManager from apify._crypto import decrypt_input_secrets, load_private_key -from apify._utils import dualproperty, get_cpu_usage_percent, get_memory_usage_bytes, get_system_info, is_running_in_ipython, wrap_internal +from apify._utils import dualproperty, get_system_info, is_running_in_ipython, wrap_internal from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration from apify.consts import EVENT_LISTENERS_TIMEOUT @@ -65,10 +63,7 @@ class Actor(metaclass=_ActorContextManager): _apify_client: ApifyClientAsync _configuration: Configuration _event_manager: EventManager - _send_system_info_interval_task: asyncio.Task | None = None - _send_persist_state_interval_task: 
asyncio.Task | None = None _is_exiting = False - _was_final_persist_state_emitted = False def __init__(self: Actor, config: Configuration | None = None) -> None: """Create an Actor instance. @@ -120,16 +115,6 @@ def __init__(self: Actor, config: Configuration | None = None) -> None: self._is_initialized = False - self._system_info_task = RecurringTask(self._send_system_info, self._configuration.system_info_interval) - self._persist_state_task = RecurringTask(self._send_persist_state, self._configuration.persist_state_interval) - - def _send_system_info(self) -> None: - if not self._configuration.is_at_home: - self._event_manager.emit(ActorEventTypes.SYSTEM_INFO, self.get_system_info()) - - def _send_persist_state(self) -> None: - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': False}) - @ignore_docs async def __aenter__(self: Actor) -> Actor: """Initialize the Actor. @@ -236,53 +221,8 @@ async def _init_internal(self: Actor) -> None: await self._event_manager.init() - self._system_info_task.start() - self._persist_state_task.start() - - self._event_manager.on(ActorEventTypes.MIGRATING, self._respond_to_migrating_event) - - # The CPU usage is calculated as an average between two last calls to psutil - # We need to make a first, dummy call, so the next calls have something to compare itself agains - get_cpu_usage_percent() - self._is_initialized = True - def get_system_info(self: Actor) -> dict: - """Get the current system info.""" - cpu_usage_percent = get_cpu_usage_percent() - memory_usage_bytes = get_memory_usage_bytes() - # This is in camel case to be compatible with the events from the platform - result = { - 'createdAt': datetime.now(timezone.utc), - 'cpuCurrentUsage': cpu_usage_percent, - 'memCurrentBytes': memory_usage_bytes, - } - if self._configuration.max_used_cpu_ratio: - result['isCpuOverloaded'] = cpu_usage_percent > 100 * self._configuration.max_used_cpu_ratio - - return result - - async def _respond_to_migrating_event(self: Actor, _event_data: Any) -> None: - # Don't emit any more regular persist state events - if self._send_persist_state_interval_task and not self._send_persist_state_interval_task.cancelled(): - self._send_persist_state_interval_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await self._send_persist_state_interval_task - - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': True}) - self._was_final_persist_state_emitted = True - - async def _cancel_event_emitting_intervals(self: Actor) -> None: - if self._send_persist_state_interval_task and not self._send_persist_state_interval_task.cancelled(): - self._send_persist_state_interval_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await self._send_persist_state_interval_task - - if self._send_system_info_interval_task and not self._send_system_info_interval_task.cancelled(): - self._send_system_info_interval_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await self._send_system_info_interval_task - @classmethod async def exit( cls: type[Actor], @@ -330,13 +270,6 @@ async def _exit_internal( self.log.info('Exiting actor', extra={'exit_code': exit_code}) async def finalize() -> None: - await self._cancel_event_emitting_intervals() - - # Send final persist state event - if not self._was_final_persist_state_emitted: - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': False}) - self._was_final_persist_state_emitted = True - if status_message is not None: await 
self.set_status_message(status_message, is_terminal=True) @@ -1164,10 +1097,7 @@ async def _reboot_internal( if not custom_after_sleep: custom_after_sleep = self._configuration.metamorph_after_sleep - await self._cancel_event_emitting_intervals() - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': True}) - self._was_final_persist_state_emitted = True await self._event_manager.close( event_listeners_timeout_secs=int(event_listeners_timeout.total_seconds()) if event_listeners_timeout is not None else None, From 1620dfeb37bda34181ae7f5788eda119f3675f35 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 12 Jun 2024 17:27:55 +0200 Subject: [PATCH 13/68] Implement PlatformEventManager --- pyproject.toml | 2 +- src/apify/actor.py | 30 ++-- src/apify/event_manager.py | 279 ++++++++++++++----------------------- 3 files changed, 123 insertions(+), 188 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 65e1c0f1..663afd4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", - "crawlee >= 0.0.5b4", + "crawlee >= 0.0.5b8", "cryptography >= 39.0.0", "httpx >= 0.24.0", "psutil >= 5.9.0", diff --git a/src/apify/actor.py b/src/apify/actor.py index b23842d5..61510288 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -10,6 +10,7 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorEventTypes, ActorExitCodes, ApifyEnvVars, WebhookEventType from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value +from crawlee.events.types import Event, EventPersistStateData from crawlee.storage_client_manager import StorageClientManager from apify._crypto import decrypt_input_secrets, load_private_key @@ -17,7 +18,7 @@ from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration from apify.consts import EVENT_LISTENERS_TIMEOUT -from apify.event_manager import EventManager +from apify.event_manager import EventManager, PlatformEventManager from apify.log import logger from apify.proxy_configuration import ProxyConfiguration from apify.storages import Dataset, KeyValueStore, RequestQueue @@ -27,6 +28,7 @@ from collections.abc import Awaitable from types import TracebackType + T = TypeVar('T') MainReturnType = TypeVar('MainReturnType') @@ -62,7 +64,6 @@ class Actor(metaclass=_ActorContextManager): _default_instance: Actor | None = None _apify_client: ApifyClientAsync _configuration: Configuration - _event_manager: EventManager _is_exiting = False def __init__(self: Actor, config: Configuration | None = None) -> None: @@ -111,7 +112,11 @@ def __init__(self: Actor, config: Configuration | None = None) -> None: self._configuration = config or Configuration() self._apify_client = self.new_client() - self._event_manager = EventManager(config=self._configuration) + + if self._configuration.is_at_home: + self._event_manager = PlatformEventManager(config=self._configuration) + else: + self._event_manager = EventManager() self._is_initialized = False @@ -219,7 +224,7 @@ async def _init_internal(self: Actor) -> None: if self._configuration.token: StorageClientManager.set_cloud_client(ApifyStorageClient(configuration=self._configuration)) - await self._event_manager.init() + await self._event_manager.__aenter__() self._is_initialized = True @@ -276,7 +281,7 @@ async def finalize() -> None: # Sleep for a bit so that the listeners have a chance to trigger await asyncio.sleep(0.1) - await 
self._event_manager.close(event_listeners_timeout_secs=event_listeners_timeout.total_seconds() if event_listeners_timeout else None) + await self._event_manager.__aexit__(None, None, None) await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) self._is_initialized = False @@ -675,10 +680,11 @@ def on(cls: type[Actor], event_name: ActorEventTypes, listener: Callable) -> Cal """ return cls._get_default_instance().on(event_name, listener) - def _on_internal(self: Actor, event_name: ActorEventTypes, listener: Callable) -> Callable: + def _on_internal(self: Actor, event_name: Event, listener: Callable) -> Callable: self._raise_if_not_initialized() - return self._event_manager.on(event_name, listener) + self._event_manager.on(event=event_name, listener=listener) + return listener @classmethod def off(cls: type[Actor], event_name: ActorEventTypes, listener: Callable | None = None) -> None: @@ -690,10 +696,10 @@ def off(cls: type[Actor], event_name: ActorEventTypes, listener: Callable | None """ return cls._get_default_instance().off(event_name, listener) - def _off_internal(self: Actor, event_name: ActorEventTypes, listener: Callable | None = None) -> None: + def _off_internal(self: Actor, event_name: Event, listener: Callable | None = None) -> None: self._raise_if_not_initialized() - return self._event_manager.off(event_name, listener) + self._event_manager.off(event=event_name, listener=listener) @classmethod def is_at_home(cls: type[Actor]) -> bool: @@ -1097,11 +1103,9 @@ async def _reboot_internal( if not custom_after_sleep: custom_after_sleep = self._configuration.metamorph_after_sleep - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': True}) + self._event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True)) - await self._event_manager.close( - event_listeners_timeout_secs=int(event_listeners_timeout.total_seconds()) if event_listeners_timeout is not None else None, - ) + await self._event_manager.__aexit__(None, None, None) assert self._configuration.actor_run_id is not None # noqa: S101 await self._apify_client.run(self._configuration.actor_run_id).reboot() diff --git a/src/apify/event_manager.py b/src/apify/event_manager.py index edb2595f..1cc7967b 100644 --- a/src/apify/event_manager.py +++ b/src/apify/event_manager.py @@ -1,28 +1,91 @@ from __future__ import annotations import asyncio -import contextlib -import inspect -import json -from collections import defaultdict -from typing import TYPE_CHECKING, Any, Callable, Coroutine, Union +from typing import TYPE_CHECKING, Annotated, Any, Literal, Union import websockets.client -from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value, parse_date_fields -from pyee.asyncio import AsyncIOEventEmitter +from apify_shared.utils import ignore_docs +from crawlee.events.event_manager import EventManager, EventManagerOptions +from crawlee.events.types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData +from pydantic import BaseModel, Discriminator, Field, TypeAdapter +from typing_extensions import Self, Unpack, override from apify.log import logger if TYPE_CHECKING: - from apify_shared.consts import ActorEventTypes + from types import TracebackType from apify.config import Configuration -ListenerType = Union[Callable[[], None], Callable[[Any], None], Callable[[], Coroutine[Any, Any, None]], Callable[[Any], Coroutine[Any, Any, None]]] + +__all__ = ['EventManager', 'PlatformEventManager'] + + +class 
PersistStateEvent(BaseModel): + name: Literal[Event.PERSIST_STATE] + data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))] + + +class SystemInfoEvent(BaseModel): + name: Literal[Event.SYSTEM_INFO] + data: EventSystemInfoData + + +class MigratingEvent(BaseModel): + name: Literal[Event.MIGRATING] + data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)] + + +class AbortingEvent(BaseModel): + name: Literal[Event.ABORTING] + data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)] + + +class ExitEvent(BaseModel): + name: Literal[Event.EXIT] + data: Annotated[EventExitData, Field(default_factory=EventExitData)] + + +class EventWithoutData(BaseModel): + name: Literal[ + Event.SESSION_RETIRED, + Event.BROWSER_LAUNCHED, + Event.BROWSER_RETIRED, + Event.BROWSER_CLOSED, + Event.PAGE_CREATED, + Event.PAGE_CLOSED, + ] + data: Any = None + + +class UnknownEvent(BaseModel): + name: str + data: Annotated[dict[str, Any], Field(default_factory=dict)] + + +EventMessage = Union[ + PersistStateEvent, + SystemInfoEvent, + MigratingEvent, + AbortingEvent, + ExitEvent, + EventWithoutData, +] + + +event_data_adapter: TypeAdapter[EventMessage | UnknownEvent] = TypeAdapter( + Union[ + Annotated[ + EventMessage, + Discriminator('name'), + ], + UnknownEvent, + ] +) @ignore_docs -class EventManager: +class PlatformEventManager(EventManager): """A class for managing actor events. You shouldn't use this class directly, @@ -31,204 +94,72 @@ class EventManager: _platform_events_websocket: websockets.client.WebSocketClientProtocol | None = None _process_platform_messages_task: asyncio.Task | None = None - _send_persist_state_interval_task: asyncio.Task | None = None _send_system_info_interval_task: asyncio.Task | None = None - _listener_tasks: set[asyncio.Task] - _listeners_to_wrappers: dict[ActorEventTypes, dict[Callable, list[Callable]]] - _connected_to_platform_websocket: asyncio.Future | None = None + _connected_to_platform_websocket: asyncio.Future = asyncio.Future() - def __init__(self: EventManager, config: Configuration) -> None: + def __init__(self, config: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None: """Create an instance of the EventManager. Args: config (Configuration): The actor configuration to be used in this event manager. + kwargs (EventManagerOptions): Event manager options - forwarded to the base class """ + super().__init__(**kwargs) + self._config = config - self._event_emitter = AsyncIOEventEmitter() - self._initialized = False self._listener_tasks = set() - self._listeners_to_wrappers = defaultdict(lambda: defaultdict(list)) - - async def init(self: EventManager) -> None: - """Initialize the event manager. + self._connected_to_platform_websocket = asyncio.Future[bool]() - When running this on the Apify Platform, this will start processing events - send by the platform to the events websocket and emitting them as events - that can be listened to by the `Actor.on()` method. 
- """ - if self._initialized: - raise RuntimeError('EventManager was already initialized!') + @override + async def __aenter__(self) -> Self: + await super().__aenter__() + self._connected_to_platform_websocket = asyncio.Future() # Run tasks but don't await them if self._config.actor_events_ws_url: - self._connected_to_platform_websocket = asyncio.Future() - self._process_platform_messages_task = asyncio.create_task(self._process_platform_messages()) + self._process_platform_messages_task = asyncio.create_task(self._process_platform_messages(self._config.actor_events_ws_url)) is_connected = await self._connected_to_platform_websocket if not is_connected: raise RuntimeError('Error connecting to platform events websocket!') else: logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.') - self._initialized = True - - async def close(self: EventManager, event_listeners_timeout_secs: float | None = None) -> None: - """Initialize the event manager. - - This will stop listening for the platform events, - and it will wait for all the event listeners to finish. - - Args: - event_listeners_timeout_secs (float, optional): Optional timeout after which the pending event listeners are canceled. - """ - if not self._initialized: - raise RuntimeError('EventManager was not initialized!') + return self + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: if self._platform_events_websocket: await self._platform_events_websocket.close() if self._process_platform_messages_task: await self._process_platform_messages_task - await self.wait_for_all_listeners_to_complete(timeout_secs=event_listeners_timeout_secs) - - self._event_emitter.remove_all_listeners() - - self._initialized = False - - def on(self: EventManager, event_name: ActorEventTypes, listener: ListenerType) -> Callable: - """Add an event listener to the event manager. - - Args: - event_name (ActorEventTypes): The actor event for which to listen to. - listener (Callable): The function which is to be called when the event is emitted (can be async). - Must accept either zero or one arguments (the first argument will be the event data). - """ - if not self._initialized: - raise RuntimeError('EventManager was not initialized!') - - # Detect whether the listener will accept the event_data argument - try: - signature = inspect.signature(listener) - except (ValueError, TypeError): - # If we can't determine the listener argument count (e.g. 
for the built-in `print` function), - # let's assume the listener will accept the argument - listener_argument_count = 1 - else: - try: - dummy_event_data: dict = {} - signature.bind(dummy_event_data) - listener_argument_count = 1 - except TypeError: - try: - signature.bind() - listener_argument_count = 0 - except TypeError as err: - raise ValueError('The "listener" argument must be a callable which accepts 0 or 1 arguments!') from err - - event_name = maybe_extract_enum_member_value(event_name) - - async def inner_wrapper(event_data: Any) -> None: - if inspect.iscoroutinefunction(listener): - if listener_argument_count == 0: - await listener() - else: - await listener(event_data) - elif listener_argument_count == 0: - listener() # type: ignore[call-arg] - else: - listener(event_data) # type: ignore[call-arg] - - async def outer_wrapper(event_data: Any) -> None: - listener_task = asyncio.create_task(inner_wrapper(event_data)) - self._listener_tasks.add(listener_task) - try: - await listener_task - except asyncio.CancelledError: - raise - except Exception: - # We need to swallow the exception and just log it here, since it could break the event emitter otherwise - logger.exception('Exception in event listener', extra={'event_name': event_name, 'listener_name': listener.__name__}) - finally: - self._listener_tasks.remove(listener_task) - - self._listeners_to_wrappers[event_name][listener].append(outer_wrapper) - - return self._event_emitter.add_listener(event_name, outer_wrapper) - - def off(self: EventManager, event_name: ActorEventTypes, listener: Callable | None = None) -> None: - """Remove a listener, or all listeners, from an actor event. - - Args: - event_name (ActorEventTypes): The actor event for which to remove listeners. - listener (Callable, optional): The listener which is supposed to be removed. If not passed, all listeners of this event are removed. - """ - if not self._initialized: - raise RuntimeError('EventManager was not initialized!') - - event_name = maybe_extract_enum_member_value(event_name) - - if listener: - for listener_wrapper in self._listeners_to_wrappers[event_name][listener]: - self._event_emitter.remove_listener(event_name, listener_wrapper) - self._listeners_to_wrappers[event_name][listener] = [] - else: - self._listeners_to_wrappers[event_name] = defaultdict(list) - self._event_emitter.remove_all_listeners(event_name) - - def emit(self: EventManager, event_name: ActorEventTypes, data: Any) -> None: - """Emit an actor event manually. - - Args: - event_name (ActorEventTypes): The actor event which should be emitted. - data (Any): The data that should be emitted with the event. - """ - event_name = maybe_extract_enum_member_value(event_name) - - self._event_emitter.emit(event_name, data) - - async def wait_for_all_listeners_to_complete(self: EventManager, *, timeout_secs: float | None = None) -> None: - """Wait for all event listeners which are currently being executed to complete. - - Args: - timeout_secs (float, optional): Timeout for the wait. If the event listeners don't finish until the timeout, they will be canceled. 
- """ - - async def _wait_for_listeners() -> None: - results = await asyncio.gather(*self._listener_tasks, return_exceptions=True) - for result in results: - if result is Exception: - logger.exception('Event manager encountered an exception in one of the event listeners', exc_info=result) - - if timeout_secs: - _, pending = await asyncio.wait([asyncio.create_task(_wait_for_listeners())], timeout=timeout_secs) - if pending: - logger.warning('Timed out waiting for event listeners to complete, unfinished event listeners will be canceled') - for pending_task in pending: - pending_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await pending_task - else: - await _wait_for_listeners() - - async def _process_platform_messages(self: EventManager) -> None: - # This should be called only on the platform, where we have the ACTOR_EVENTS_WS_URL configured - assert self._config.actor_events_ws_url is not None # noqa: S101 - assert self._connected_to_platform_websocket is not None # noqa: S101 + await super().__aexit__(exc_type, exc_value, exc_traceback) + async def _process_platform_messages(self, ws_url: str) -> None: try: - async with websockets.client.connect(self._config.actor_events_ws_url) as websocket: + async with websockets.client.connect(ws_url) as websocket: self._platform_events_websocket = websocket self._connected_to_platform_websocket.set_result(True) + async for message in websocket: try: - parsed_message = json.loads(message) - assert isinstance(parsed_message, dict) # noqa: S101 - parsed_message = parse_date_fields(parsed_message) - event_name = parsed_message['name'] - event_data = parsed_message.get('data') # 'data' can be missing + parsed_message = event_data_adapter.validate_json(message) + + if isinstance(parsed_message, UnknownEvent): + logger.info(f'Unknown message received: event_name={parsed_message.name}, event_data={parsed_message.data}') + continue - self._event_emitter.emit(event_name, event_data) + self.emit(event=parsed_message.name, event_data=parsed_message.data) + if parsed_message.name == Event.MIGRATING: + await self._emit_persist_state_event_rec_task.stop() + self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True)) except Exception: logger.exception('Cannot parse actor event', extra={'message': message}) except Exception: From fd20bba6e99f11ce349da74f37d3ab09ae09b2fe Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 13 Jun 2024 08:55:44 +0200 Subject: [PATCH 14/68] Use LocalEventManager when not on platform --- src/apify/actor.py | 5 +++-- src/apify/event_manager.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 61510288..f8f13bb2 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -18,7 +18,7 @@ from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration from apify.consts import EVENT_LISTENERS_TIMEOUT -from apify.event_manager import EventManager, PlatformEventManager +from apify.event_manager import EventManager, LocalEventManager, PlatformEventManager from apify.log import logger from apify.proxy_configuration import ProxyConfiguration from apify.storages import Dataset, KeyValueStore, RequestQueue @@ -113,10 +113,11 @@ def __init__(self: Actor, config: Configuration | None = None) -> None: self._configuration = config or Configuration() self._apify_client = self.new_client() + self._event_manager: EventManager if self._configuration.is_at_home: self._event_manager = 
PlatformEventManager(config=self._configuration) else: - self._event_manager = EventManager() + self._event_manager = LocalEventManager() self._is_initialized = False diff --git a/src/apify/event_manager.py b/src/apify/event_manager.py index 1cc7967b..d203d280 100644 --- a/src/apify/event_manager.py +++ b/src/apify/event_manager.py @@ -6,6 +6,7 @@ import websockets.client from apify_shared.utils import ignore_docs from crawlee.events.event_manager import EventManager, EventManagerOptions +from crawlee.events.local_event_manager import LocalEventManager from crawlee.events.types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData from pydantic import BaseModel, Discriminator, Field, TypeAdapter from typing_extensions import Self, Unpack, override @@ -18,7 +19,7 @@ from apify.config import Configuration -__all__ = ['EventManager', 'PlatformEventManager'] +__all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager'] class PersistStateEvent(BaseModel): From ee0e959ca4dfe98d6e6b5ce9f0d6b3a3e6448384 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 13 Jun 2024 10:04:45 +0200 Subject: [PATCH 15/68] Remove dual properties and methods --- src/apify/_utils.py | 103 +--------- src/apify/actor.py | 478 +++++--------------------------------------- 2 files changed, 54 insertions(+), 527 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 22823e13..eb87f6a2 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -1,19 +1,13 @@ from __future__ import annotations import builtins -import functools import sys -from collections import OrderedDict -from collections.abc import MutableMapping from hashlib import sha256 from importlib import metadata from logging import getLogger -from typing import Any, Callable, Generic, ItemsView, Iterator, TypeVar, ValuesView, cast -from typing import OrderedDict as OrderedDictType +from typing import TypeVar from urllib.parse import parse_qsl, urlencode, urlparse -from apify_shared.utils import ignore_docs - T = TypeVar('T') logger = getLogger(__name__) @@ -34,101 +28,6 @@ def get_system_info() -> dict: return system_info -DualPropertyType = TypeVar('DualPropertyType') -DualPropertyOwner = TypeVar('DualPropertyOwner') - - -@ignore_docs -class dualproperty(Generic[DualPropertyType]): # noqa: N801 - """Descriptor combining `property` and `classproperty`. - - When accessing the decorated attribute on an instance, it calls the getter with the instance as the first argument, - and when accessing it on a class, it calls the getter with the class as the first argument. - """ - - def __init__(self: dualproperty, getter: Callable[..., DualPropertyType]) -> None: - """Initialize the dualproperty. - - Args: - getter (Callable): The getter of the property. - It should accept either an instance or a class as its first argument. - """ - self.getter = getter - - def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualPropertyOwner]) -> DualPropertyType: - """Call the getter with the right object. - - Args: - obj (T | None): The instance of class T on which the getter will be called - owner (type[T]): The class object of class T on which the getter will be called, if obj is None - - Returns: - The result of the getter. 
- """ - val = self.getter(obj or owner) - return cast(DualPropertyType, val) - - -ImplementationType = TypeVar('ImplementationType', bound=Callable) -MetadataType = TypeVar('MetadataType', bound=Callable) - - -def wrap_internal(implementation: ImplementationType, metadata_source: MetadataType) -> MetadataType: - @functools.wraps(metadata_source) - def wrapper(*args: Any, **kwargs: Any) -> Any: - return implementation(*args, **kwargs) - - return cast(MetadataType, wrapper) - - -@ignore_docs -class LRUCache(MutableMapping, Generic[T]): - """Attempt to reimplement LRUCache from `@apify/datastructures` using `OrderedDict`.""" - - _cache: OrderedDictType[str, T] - - _max_length: int - - def __init__(self: LRUCache, max_length: int) -> None: - """Create a LRUCache with a specific max_length.""" - self._cache = OrderedDict() - self._max_length = max_length - - def __getitem__(self: LRUCache, key: str) -> T: - """Get an item from the cache. Move it to the end if present.""" - val = self._cache[key] - # No 'key in cache' condition since the previous line would raise KeyError - self._cache.move_to_end(key) - return cast(T, val) - - # Sadly TS impl returns bool indicating whether the key was already present or not - def __setitem__(self: LRUCache, key: str, value: T) -> None: - """Add an item to the cache. Remove least used item if max_length exceeded.""" - self._cache[key] = value - if len(self._cache) > self._max_length: - self._cache.popitem(last=False) - - def __delitem__(self: LRUCache, key: str) -> None: - """Remove an item from the cache.""" - del self._cache[key] - - def __iter__(self: LRUCache) -> Iterator[str]: - """Iterate over the keys of the cache in order of insertion.""" - return self._cache.__iter__() - - def __len__(self: LRUCache) -> int: - """Get the number of items in the cache.""" - return len(self._cache) - - def values(self: LRUCache) -> ValuesView[T]: # Needed so we don't mutate the cache by __getitem__ - """Iterate over the values in the cache in order of insertion.""" - return self._cache.values() - - def items(self: LRUCache) -> ItemsView[str, T]: # Needed so we don't mutate the cache by __getitem__ - """Iterate over the pairs of (key, value) in the cache in order of insertion.""" - return self._cache.items() - - def is_running_in_ipython() -> bool: return getattr(builtins, '__IPYTHON__', False) diff --git a/src/apify/actor.py b/src/apify/actor.py index f8f13bb2..092f2c07 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -8,13 +8,14 @@ from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast from apify_client import ApifyClientAsync -from apify_shared.consts import ActorEnvVars, ActorEventTypes, ActorExitCodes, ApifyEnvVars, WebhookEventType +from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value from crawlee.events.types import Event, EventPersistStateData from crawlee.storage_client_manager import StorageClientManager +from typing_extensions import Self from apify._crypto import decrypt_input_secrets, load_private_key -from apify._utils import dualproperty, get_system_info, is_running_in_ipython, wrap_internal +from apify._utils import get_system_info, is_running_in_ipython from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration from apify.consts import EVENT_LISTENERS_TIMEOUT @@ -29,44 +30,17 @@ from types import TracebackType -T = TypeVar('T') MainReturnType = 
TypeVar('MainReturnType') -# This metaclass is needed so you can do `async with Actor: ...` instead of `async with Actor() as a: ...` -# and have automatic `Actor.init()` and `Actor.exit()` +class _ActorType: + """The class of `Actor`. Only make a new instance if you're absolutely sure you need to.""" -class _ActorContextManager(type): - @staticmethod - async def __aenter__() -> type[Actor]: - await Actor.init() - return Actor - - @staticmethod - async def __aexit__( - _exc_type: type[BaseException] | None, - exc_value: BaseException | None, - _exc_traceback: TracebackType | None, - ) -> None: - if not Actor._get_default_instance()._is_exiting: - if exc_value: - await Actor.fail( - exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value, - exception=exc_value, - ) - else: - await Actor.exit() - - -class Actor(metaclass=_ActorContextManager): - """The main class of the SDK, through which all the actor operations should be done.""" - - _default_instance: Actor | None = None _apify_client: ApifyClientAsync _configuration: Configuration _is_exiting = False - def __init__(self: Actor, config: Configuration | None = None) -> None: + def __init__(self, config: Configuration | None = None) -> None: """Create an Actor instance. Note that you don't have to do this, all the methods on this class function as classmethods too, @@ -75,41 +49,6 @@ def __init__(self: Actor, config: Configuration | None = None) -> None: Args: config (Configuration, optional): The actor configuration to be used. If not passed, a new Configuration instance will be created. """ - # To have methods which work the same as classmethods and instance methods, - # so you can do both Actor.xxx() and Actor().xxx(), - # we need to have an `_xxx_internal` instance method which contains the actual implementation of the method, - # and then in the instance constructor overwrite the `xxx` classmethod with the `_xxx_internal` instance method, - # while copying the annotations, types and so on. 
- self.init = wrap_internal(self._init_internal, self.init) # type: ignore - self.exit = wrap_internal(self._exit_internal, self.exit) # type: ignore - self.fail = wrap_internal(self._fail_internal, self.fail) # type: ignore - self.main = wrap_internal(self._main_internal, self.main) # type: ignore - self.new_client = wrap_internal(self._new_client_internal, self.new_client) # type: ignore - - self.open_dataset = wrap_internal(self._open_dataset_internal, self.open_dataset) # type: ignore - self.open_key_value_store = wrap_internal(self._open_key_value_store_internal, self.open_key_value_store) # type: ignore - self.open_request_queue = wrap_internal(self._open_request_queue_internal, self.open_request_queue) # type: ignore - self.push_data = wrap_internal(self._push_data_internal, self.push_data) # type: ignore - self.get_input = wrap_internal(self._get_input_internal, self.get_input) # type: ignore - self.get_value = wrap_internal(self._get_value_internal, self.get_value) # type: ignore - self.set_value = wrap_internal(self._set_value_internal, self.set_value) # type: ignore - - self.on = wrap_internal(self._on_internal, self.on) # type: ignore - self.off = wrap_internal(self._off_internal, self.off) # type: ignore - - self.is_at_home = wrap_internal(self._is_at_home_internal, self.is_at_home) # type: ignore - self.get_env = wrap_internal(self._get_env_internal, self.get_env) # type: ignore - - self.start = wrap_internal(self._start_internal, self.start) # type: ignore - self.call = wrap_internal(self._call_internal, self.call) # type: ignore - self.call_task = wrap_internal(self._call_task_internal, self.call_task) # type: ignore - self.abort = wrap_internal(self._abort_internal, self.abort) # type: ignore - self.metamorph = wrap_internal(self._metamorph_internal, self.metamorph) # type: ignore - self.reboot = wrap_internal(self._reboot_internal, self.reboot) # type: ignore - self.add_webhook = wrap_internal(self._add_webhook_internal, self.add_webhook) # type: ignore - self.set_status_message = wrap_internal(self._set_status_message_internal, self.set_status_message) # type: ignore - self.create_proxy_configuration = wrap_internal(self._create_proxy_configuration_internal, self.create_proxy_configuration) # type: ignore - self._configuration = config or Configuration() self._apify_client = self.new_client() @@ -122,7 +61,7 @@ def __init__(self: Actor, config: Configuration | None = None) -> None: self._is_initialized = False @ignore_docs - async def __aenter__(self: Actor) -> Actor: + async def __aenter__(self) -> Self: """Initialize the Actor. Automatically initializes the Actor instance when you use it in an `async with ...` statement. 
@@ -136,7 +75,7 @@ async def __aenter__(self: Actor) -> Actor: @ignore_docs async def __aexit__( - self: Actor, + self, _exc_type: type[BaseException] | None, exc_value: BaseException | None, _exc_traceback: TracebackType | None, @@ -156,46 +95,31 @@ async def __aexit__( else: await self.exit() - @classmethod - def _get_default_instance(cls: type[Actor]) -> Actor: - if not cls._default_instance: - cls._default_instance = cls(config=Configuration.get_global_configuration()) - - return cls._default_instance - - @dualproperty - def apify_client(self_or_cls: type[Actor] | Actor) -> ApifyClientAsync: # noqa: N805 + @property + def apify_client(self) -> ApifyClientAsync: """The ApifyClientAsync instance the Actor instance uses.""" - if isinstance(self_or_cls, type): - return self_or_cls._get_default_instance()._apify_client - return self_or_cls._apify_client + return self._apify_client - @dualproperty - def config(self_or_cls: type[Actor] | Actor) -> Configuration: # noqa: N805 + @property + def config(self) -> Configuration: """The Configuration instance the Actor instance uses.""" - if isinstance(self_or_cls, type): - return self_or_cls._get_default_instance()._configuration - return self_or_cls._configuration + return self._configuration - @dualproperty - def event_manager(self_or_cls: type[Actor] | Actor) -> EventManager: # noqa: N805 + @property + def event_manager(self) -> EventManager: """The EventManager instance the Actor instance uses.""" - if isinstance(self_or_cls, type): - return self_or_cls._get_default_instance()._event_manager + return self._event_manager - return self_or_cls._event_manager - - @dualproperty - def log(_self_or_cls: type[Actor] | Actor) -> logging.Logger: # noqa: N805 + @property + def log(self) -> logging.Logger: """The logging.Logger instance the Actor uses.""" return logger - def _raise_if_not_initialized(self: Actor) -> None: + def _raise_if_not_initialized(self) -> None: if not self._is_initialized: raise RuntimeError('The actor was not initialized!') - @classmethod - async def init(cls: type[Actor]) -> None: + async def init(self) -> None: """Initialize the actor instance. This initializes the Actor instance. @@ -207,9 +131,6 @@ async def init(cls: type[Actor]) -> None: This method should be called immediately before performing any additional actor actions, and it should be called only once. """ - return await cls._get_default_instance().init() - - async def _init_internal(self: Actor) -> None: if self._is_initialized: raise RuntimeError('The actor was already initialized!') @@ -229,9 +150,8 @@ async def _init_internal(self: Actor) -> None: self._is_initialized = True - @classmethod async def exit( - cls: type[Actor], + self, *, exit_code: int = 0, event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, @@ -252,21 +172,6 @@ async def exit( status_message (str, optional): The final status message that the actor should display. cleanup_timeout (timedelta, optional): How long we should wait for event listeners. 
""" - return await cls._get_default_instance().exit( - exit_code=exit_code, - event_listeners_timeout=event_listeners_timeout, - status_message=status_message, - cleanup_timeout=cleanup_timeout, - ) - - async def _exit_internal( - self: Actor, - *, - exit_code: int = 0, - event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, - status_message: str | None = None, - cleanup_timeout: timedelta = timedelta(seconds=30), - ) -> None: self._raise_if_not_initialized() self._is_exiting = True @@ -296,9 +201,8 @@ async def finalize() -> None: else: sys.exit(exit_code) - @classmethod async def fail( - cls: type[Actor], + self, *, exit_code: int = 1, exception: BaseException | None = None, @@ -314,19 +218,6 @@ async def fail( exception (BaseException, optional): The exception with which the actor failed. status_message (str, optional): The final status message that the actor should display. """ - return await cls._get_default_instance().fail( - exit_code=exit_code, - exception=exception, - status_message=status_message, - ) - - async def _fail_internal( - self: Actor, - *, - exit_code: int = 1, - exception: BaseException | None = None, - status_message: str | None = None, - ) -> None: self._raise_if_not_initialized() # In IPython, we don't run `sys.exit()` during actor exits, @@ -336,8 +227,7 @@ async def _fail_internal( await self.exit(exit_code=exit_code, status_message=status_message) - @classmethod - async def main(cls: type[Actor], main_actor_function: Callable[[], MainReturnType]) -> MainReturnType | None: + async def main(self, main_actor_function: Callable[[], MainReturnType]) -> MainReturnType | None: """Initialize the actor, run the passed function and finish the actor cleanly. **The `Actor.main()` function is optional** and is provided merely for your convenience. 
@@ -358,11 +248,6 @@ async def main(cls: type[Actor], main_actor_function: Callable[[], MainReturnTyp Args: main_actor_function (Callable): The user function which should be run in the actor """ - return await cls._get_default_instance().main( - main_actor_function=main_actor_function, - ) - - async def _main_internal(self: Actor, main_actor_function: Callable[[], MainReturnType]) -> MainReturnType | None: if not inspect.isfunction(main_actor_function): raise TypeError(f'First argument passed to Actor.main() must be a function, but instead it was {type(main_actor_function)}') @@ -381,9 +266,8 @@ async def _main_internal(self: Actor, main_actor_function: Callable[[], MainRetu ) return None - @classmethod def new_client( - cls: type[Actor], + self, *, token: str | None = None, api_url: str | None = None, @@ -407,23 +291,6 @@ def new_client( (increases exponentially from this value) timeout (timedelta, optional): The socket timeout of the HTTP requests sent to the Apify API """ - return cls._get_default_instance().new_client( - token=token, - api_url=api_url, - max_retries=max_retries, - min_delay_between_retries=min_delay_between_retries, - timeout=timeout, - ) - - def _new_client_internal( - self: Actor, - *, - token: str | None = None, - api_url: str | None = None, - max_retries: int | None = None, - min_delay_between_retries: timedelta | None = None, - timeout: timedelta | None = None, - ) -> ApifyClientAsync: token = token or self._configuration.token api_url = api_url or self._configuration.api_base_url return ApifyClientAsync( @@ -434,12 +301,8 @@ def _new_client_internal( timeout_secs=int(timeout.total_seconds()) if timeout else None, ) - def _get_storage_client(self: Actor, force_cloud: bool) -> ApifyClientAsync | None: # noqa: FBT001 - return self._apify_client if force_cloud else None - - @classmethod async def open_dataset( - cls: type[Actor], + self, *, id: str | None = None, # noqa: A002 name: str | None = None, @@ -463,15 +326,6 @@ async def open_dataset( Dataset: An instance of the `Dataset` class for the given ID or name. """ - return await cls._get_default_instance().open_dataset(id=id, name=name, force_cloud=force_cloud) - - async def _open_dataset_internal( - self: Actor, - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> Dataset: self._raise_if_not_initialized() configuration_updates = {} @@ -480,9 +334,8 @@ async def _open_dataset_internal( return await Dataset.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) - @classmethod async def open_key_value_store( - cls: type[Actor], + self, *, id: str | None = None, # noqa: A002 name: str | None = None, @@ -505,15 +358,6 @@ async def open_key_value_store( Returns: KeyValueStore: An instance of the `KeyValueStore` class for the given ID or name. 
""" - return await cls._get_default_instance().open_key_value_store(id=id, name=name, force_cloud=force_cloud) - - async def _open_key_value_store_internal( - self: Actor, - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> KeyValueStore: self._raise_if_not_initialized() configuration_updates = {} @@ -522,9 +366,8 @@ async def _open_key_value_store_internal( return await KeyValueStore.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) - @classmethod async def open_request_queue( - cls: type[Actor], + self, *, id: str | None = None, # noqa: A002 name: str | None = None, @@ -548,15 +391,6 @@ async def open_request_queue( Returns: RequestQueue: An instance of the `RequestQueue` class for the given ID or name. """ - return await cls._get_default_instance().open_request_queue(id=id, name=name, force_cloud=force_cloud) - - async def _open_request_queue_internal( - self: Actor, - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> RequestQueue: self._raise_if_not_initialized() configuration_updates = {} @@ -565,16 +399,12 @@ async def _open_request_queue_internal( return await RequestQueue.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) - @classmethod - async def push_data(cls: type[Actor], data: Any) -> None: + async def push_data(self, data: Any) -> None: """Store an object or a list of objects to the default dataset of the current actor run. Args: data (object or list of objects, optional): The data to push to the default dataset. """ - return await cls._get_default_instance().push_data(data=data) - - async def _push_data_internal(self: Actor, data: Any) -> None: self._raise_if_not_initialized() if not data: @@ -583,12 +413,8 @@ async def _push_data_internal(self: Actor, data: Any) -> None: dataset = await self.open_dataset() await dataset.push_data(data) - @classmethod - async def get_input(cls: type[Actor]) -> Any: + async def get_input(self) -> Any: """Get the actor input value from the default key-value store associated with the current actor run.""" - return await cls._get_default_instance().get_input() - - async def _get_input_internal(self: Actor) -> Any: self._raise_if_not_initialized() input_value = await self.get_value(self._configuration.input_key) @@ -603,25 +429,20 @@ async def _get_input_internal(self: Actor) -> Any: return input_value - @classmethod - async def get_value(cls: type[Actor], key: str, default_value: Any = None) -> Any: + async def get_value(self, key: str, default_value: Any = None) -> Any: """Get a value from the default key-value store associated with the current actor run. Args: key (str): The key of the record which to retrieve. default_value (Any, optional): Default value returned in case the record does not exist. """ - return await cls._get_default_instance().get_value(key=key, default_value=default_value) - - async def _get_value_internal(self: Actor, key: str, default_value: Any = None) -> Any: self._raise_if_not_initialized() key_value_store = await self.open_key_value_store() return await key_value_store.get_value(key, default_value) - @classmethod async def set_value( - cls: type[Actor], + self, key: str, value: Any, *, @@ -634,26 +455,12 @@ async def set_value( value (any): The value of the record which to set, or None, if the record should be deleted. content_type (str, optional): The content type which should be set to the value. 
""" - return await cls._get_default_instance().set_value( - key=key, - value=value, - content_type=content_type, - ) - - async def _set_value_internal( - self: Actor, - key: str, - value: Any, - *, - content_type: str | None = None, - ) -> None: self._raise_if_not_initialized() key_value_store = await self.open_key_value_store() return await key_value_store.set_value(key, value, content_type=content_type) - @classmethod - def on(cls: type[Actor], event_name: ActorEventTypes, listener: Callable) -> Callable: + def on(self, event_name: Event, listener: Callable) -> Callable: """Add an event listener to the actor's event manager. The following events can be emitted: @@ -679,57 +486,41 @@ def on(cls: type[Actor], event_name: ActorEventTypes, listener: Callable) -> Cal event_name (ActorEventTypes): The actor event for which to listen to. listener (Callable): The function which is to be called when the event is emitted (can be async). """ - return cls._get_default_instance().on(event_name, listener) - - def _on_internal(self: Actor, event_name: Event, listener: Callable) -> Callable: self._raise_if_not_initialized() self._event_manager.on(event=event_name, listener=listener) return listener - @classmethod - def off(cls: type[Actor], event_name: ActorEventTypes, listener: Callable | None = None) -> None: + def off(self, event_name: Event, listener: Callable | None = None) -> None: """Remove a listener, or all listeners, from an actor event. Args: event_name (ActorEventTypes): The actor event for which to remove listeners. listener (Callable, optional): The listener which is supposed to be removed. If not passed, all listeners of this event are removed. """ - return cls._get_default_instance().off(event_name, listener) - - def _off_internal(self: Actor, event_name: Event, listener: Callable | None = None) -> None: self._raise_if_not_initialized() self._event_manager.off(event=event_name, listener=listener) - @classmethod - def is_at_home(cls: type[Actor]) -> bool: + def is_at_home(self) -> bool: """Return `True` when the actor is running on the Apify platform, and `False` otherwise (for example when running locally).""" - return cls._get_default_instance().is_at_home() - - def _is_at_home_internal(self: Actor) -> bool: return self._configuration.is_at_home - @classmethod - def get_env(cls: type[Actor]) -> dict: + def get_env(self) -> dict: """Return a dictionary with information parsed from all the `APIFY_XXX` environment variables. For a list of all the environment variables, see the [Actor documentation](https://docs.apify.com/actors/development/environment-variables). If some variables are not defined or are invalid, the corresponding value in the resulting dictionary will be None. 
""" - return cls._get_default_instance().get_env() - - def _get_env_internal(self: Actor) -> dict: self._raise_if_not_initialized() config = self._configuration.model_dump(by_alias=True) env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]} return {option_name: config[env_var] for env_var, option_name in env_vars} - @classmethod async def start( - cls: type[Actor], + self, actor_id: str, run_input: Any = None, *, @@ -737,7 +528,7 @@ async def start( content_type: str | None = None, build: str | None = None, memory_mbytes: int | None = None, - timeout: int | None = None, + timeout: timedelta | None = None, wait_for_finish: int | None = None, webhooks: list[dict] | None = None, ) -> dict: @@ -770,31 +561,6 @@ async def start( Returns: dict: Info about the started actor run """ - return await cls._get_default_instance().start( - actor_id=actor_id, - run_input=run_input, - token=token, - content_type=content_type, - build=build, - memory_mbytes=memory_mbytes, - timeout=timeout, - wait_for_finish=wait_for_finish, - webhooks=webhooks, - ) - - async def _start_internal( - self: Actor, - actor_id: str, - run_input: Any = None, - *, - token: str | None = None, - content_type: str | None = None, - build: str | None = None, - memory_mbytes: int | None = None, - timeout: timedelta | None = None, - wait_for_finish: int | None = None, - webhooks: list[dict] | None = None, - ) -> dict: self._raise_if_not_initialized() client = self.new_client(token=token) if token else self._apify_client @@ -809,12 +575,12 @@ async def _start_internal( webhooks=webhooks, ) - @classmethod async def abort( - cls: type[Actor], + self, run_id: str, *, token: str | None = None, + status_message: str | None = None, gracefully: bool | None = None, ) -> dict: """Abort given actor run on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable). @@ -822,6 +588,7 @@ async def abort( Args: run_id (str): The ID of the actor run to be aborted. token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). + status_message (str, optional): Status message of the actor to be set on the platform. gracefully (bool, optional): If True, the actor run will abort gracefully. It will send ``aborting`` and ``persistStates`` events into the run and force-stop the run after 30 seconds. It is helpful in cases where you plan to resurrect the run later. 
@@ -829,20 +596,6 @@ async def abort( Returns: dict: Info about the aborted actor run """ - return await cls._get_default_instance().abort( - run_id=run_id, - token=token, - gracefully=gracefully, - ) - - async def _abort_internal( - self: Actor, - run_id: str, - *, - token: str | None = None, - status_message: str | None = None, - gracefully: bool | None = None, - ) -> dict: self._raise_if_not_initialized() client = self.new_client(token=token) if token else self._apify_client @@ -852,9 +605,8 @@ async def _abort_internal( return await client.run(run_id).abort(gracefully=gracefully) - @classmethod async def call( - cls: type[Actor], + self, actor_id: str, run_input: Any = None, *, @@ -889,31 +641,6 @@ async def call( Returns: dict: Info about the started actor run """ - return await cls._get_default_instance().call( - actor_id=actor_id, - token=token, - run_input=run_input, - content_type=content_type, - build=build, - memory_mbytes=memory_mbytes, - timeout=timeout, - webhooks=webhooks, - wait=wait, - ) - - async def _call_internal( - self: Actor, - actor_id: str, - run_input: Any = None, - *, - token: str | None = None, - content_type: str | None = None, - build: str | None = None, - memory_mbytes: int | None = None, - timeout: timedelta | None = None, - webhooks: list[dict] | None = None, - wait: timedelta | None = None, - ) -> dict | None: self._raise_if_not_initialized() client = self.new_client(token=token) if token else self._apify_client @@ -928,9 +655,8 @@ async def _call_internal( wait_secs=int(wait.total_seconds()) if wait is not None else None, ) - @classmethod async def call_task( - cls: type[Actor], + self, task_id: str, task_input: dict | None = None, *, @@ -967,29 +693,6 @@ async def call_task( Returns: dict: Info about the started actor run """ - return await cls._get_default_instance().call_task( - task_id=task_id, - task_input=task_input, - token=token, - build=build, - memory_mbytes=memory_mbytes, - timeout=timeout, - webhooks=webhooks, - wait=wait, - ) - - async def _call_task_internal( - self: Actor, - task_id: str, - task_input: dict | None = None, - *, - build: str | None = None, - memory_mbytes: int | None = None, - timeout: timedelta | None = None, - webhooks: list[dict] | None = None, - wait: timedelta | None = None, - token: str | None = None, - ) -> dict | None: self._raise_if_not_initialized() client = self.new_client(token=token) if token else self._apify_client @@ -1003,9 +706,8 @@ async def _call_task_internal( wait_secs=int(wait.total_seconds()) if wait is not None else None, ) - @classmethod async def metamorph( - cls: type[Actor], + self, target_actor_id: str, run_input: Any = None, *, @@ -1030,23 +732,6 @@ async def metamorph( Returns: dict: The actor run data. 
""" - return await cls._get_default_instance().metamorph( - target_actor_id=target_actor_id, - target_actor_build=target_actor_build, - run_input=run_input, - content_type=content_type, - custom_after_sleep=custom_after_sleep, - ) - - async def _metamorph_internal( - self: Actor, - target_actor_id: str, - run_input: Any = None, - *, - target_actor_build: str | None = None, - content_type: str | None = None, - custom_after_sleep: timedelta | None = None, - ) -> None: self._raise_if_not_initialized() if not self.is_at_home(): @@ -1069,9 +754,8 @@ async def _metamorph_internal( if custom_after_sleep: await asyncio.sleep(custom_after_sleep.total_seconds()) - @classmethod async def reboot( - cls: type[Actor], + self, *, event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, custom_after_sleep: timedelta | None = None, @@ -1084,17 +768,6 @@ async def reboot( event_listeners_timeout (timedelta, optional): How long should the actor wait for actor event listeners to finish before exiting custom_after_sleep (timedelta, optional): How long to sleep for after the reboot, to wait for the container to be stopped. """ - return await cls._get_default_instance().reboot( - event_listeners_timeout=event_listeners_timeout, - custom_after_sleep=custom_after_sleep, - ) - - async def _reboot_internal( - self: Actor, - *, - event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, - custom_after_sleep: timedelta | None = None, - ) -> None: self._raise_if_not_initialized() if not self.is_at_home(): @@ -1114,9 +787,8 @@ async def _reboot_internal( if custom_after_sleep: await asyncio.sleep(custom_after_sleep.total_seconds()) - @classmethod async def add_webhook( - cls: type[Actor], + self, *, event_types: list[WebhookEventType], request_url: str, @@ -1124,7 +796,7 @@ async def add_webhook( ignore_ssl_errors: bool | None = None, do_not_retry: bool | None = None, idempotency_key: str | None = None, - ) -> dict: + ) -> dict | None: """Create an ad-hoc webhook for the current actor run. This webhook lets you receive a notification when the actor run finished or failed. 
@@ -1147,25 +819,6 @@ async def add_webhook( Returns: dict: The created webhook """ - return await cls._get_default_instance().add_webhook( - event_types=event_types, - request_url=request_url, - payload_template=payload_template, - ignore_ssl_errors=ignore_ssl_errors, - do_not_retry=do_not_retry, - idempotency_key=idempotency_key, - ) - - async def _add_webhook_internal( - self: Actor, - *, - event_types: list[WebhookEventType], - request_url: str, - payload_template: str | None = None, - ignore_ssl_errors: bool | None = None, - do_not_retry: bool | None = None, - idempotency_key: str | None = None, - ) -> dict | None: self._raise_if_not_initialized() if not self.is_at_home(): @@ -1185,9 +838,8 @@ async def _add_webhook_internal( idempotency_key=idempotency_key, ) - @classmethod async def set_status_message( - cls: type[Actor], + self, status_message: str, *, is_terminal: bool | None = None, @@ -1201,14 +853,6 @@ async def set_status_message( Returns: dict: The updated actor run object """ - return await cls._get_default_instance().set_status_message(status_message=status_message, is_terminal=is_terminal) - - async def _set_status_message_internal( - self: Actor, - status_message: str, - *, - is_terminal: bool | None = None, - ) -> dict | None: self._raise_if_not_initialized() if not self.is_at_home(): @@ -1223,9 +867,8 @@ async def _set_status_message_internal( status_message=status_message, is_status_message_terminal=is_terminal ) - @classmethod async def create_proxy_configuration( - cls: type[Actor], + self, *, actor_proxy_input: dict | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here password: str | None = None, @@ -1254,25 +897,6 @@ async def create_proxy_configuration( ProxyConfiguration, optional: ProxyConfiguration object with the passed configuration, or None, if no proxy should be used based on the configuration. 
""" - return await cls._get_default_instance().create_proxy_configuration( - password=password, - groups=groups, - country_code=country_code, - proxy_urls=proxy_urls, - new_url_function=new_url_function, - actor_proxy_input=actor_proxy_input, - ) - - async def _create_proxy_configuration_internal( - self: Actor, - *, - actor_proxy_input: dict | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here - password: str | None = None, - groups: list[str] | None = None, - country_code: str | None = None, - proxy_urls: list[str] | None = None, - new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None, - ) -> ProxyConfiguration | None: self._raise_if_not_initialized() if actor_proxy_input is not None: @@ -1297,3 +921,7 @@ async def _create_proxy_configuration_internal( await proxy_configuration.initialize() return proxy_configuration + + +Actor = _ActorType() +"""The entry point of the SDK, through which all the actor operations should be done.""" From a084bcf80daa68932746981184b1b36ccd7f97e3 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 13 Jun 2024 10:22:19 +0200 Subject: [PATCH 16/68] Use crypto utils from crawlee where possible --- src/apify/_crypto.py | 8 +- src/apify/_utils.py | 112 -------------------- src/apify/scrapy/requests.py | 5 +- src/apify/scrapy/scheduler.py | 3 +- tests/integration/_utils.py | 2 +- tests/integration/test_actor_api_helpers.py | 3 +- tests/integration/test_fixtures.py | 3 +- 7 files changed, 11 insertions(+), 125 deletions(-) diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index 237bc8cd..075247e3 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -1,10 +1,10 @@ from __future__ import annotations import base64 -import secrets from typing import Any from apify_shared.utils import ignore_docs +from crawlee._utils.crypto import crypto_random_object_id from cryptography.exceptions import InvalidTag as InvalidTagException from cryptography.hazmat.primitives import hashes, serialization from cryptography.hazmat.primitives.asymmetric import padding, rsa @@ -125,12 +125,6 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey: return public_key -def crypto_random_object_id(length: int = 17) -> str: - """Python reimplementation of cryptoRandomObjectId from `@apify/utilities`.""" - chars = 'abcdefghijklmnopqrstuvwxyzABCEDFGHIJKLMNOPQRSTUVWXYZ0123456789' - return ''.join(secrets.choice(chars) for _ in range(length)) - - def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input: Any) -> Any: # noqa: A002 """Decrypt input secrets.""" if not isinstance(input, dict): diff --git a/src/apify/_utils.py b/src/apify/_utils.py index eb87f6a2..ff1426fd 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -2,11 +2,9 @@ import builtins import sys -from hashlib import sha256 from importlib import metadata from logging import getLogger from typing import TypeVar -from urllib.parse import parse_qsl, urlencode, urlparse T = TypeVar('T') logger = getLogger(__name__) @@ -30,113 +28,3 @@ def get_system_info() -> dict: def is_running_in_ipython() -> bool: return getattr(builtins, '__IPYTHON__', False) - - -def compute_short_hash(data: bytes, *, length: int = 8) -> str: - """Computes a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it. - - Args: - data: The binary data to be hashed. - length: The length of the hash to be returned. - - Returns: - A substring (prefix) of the hexadecimal hash of the data. 
- """ - hash_object = sha256(data) - return hash_object.hexdigest()[:length] - - -def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str: - """Normalizes a URL. - - This function cleans and standardizes a URL by removing leading and trailing whitespaces, - converting the scheme and netloc to lower case, stripping unwanted tracking parameters - (specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically, - and optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally - identical but differ in trivial ways (such as parameter order or casing) are treated as the same. - - Args: - url: The URL to be normalized. - keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained. - - Returns: - A string containing the normalized URL. - """ - # Parse the URL - parsed_url = urlparse(url.strip()) - search_params = dict(parse_qsl(parsed_url.query)) # Convert query to a dict - - # Remove any 'utm_' parameters - search_params = {k: v for k, v in search_params.items() if not k.startswith('utm_')} - - # Construct the new query string - sorted_keys = sorted(search_params.keys()) - sorted_query = urlencode([(k, search_params[k]) for k in sorted_keys]) - - # Construct the final URL - new_url = ( - parsed_url._replace( - query=sorted_query, - scheme=parsed_url.scheme, - netloc=parsed_url.netloc, - path=parsed_url.path.rstrip('/'), - ) - .geturl() - .lower() - ) - - # Retain the URL fragment if required - if not keep_url_fragment: - new_url = new_url.split('#')[0] - - return new_url - - -def compute_unique_key( - url: str, - method: str = 'GET', - payload: bytes | None = None, - *, - keep_url_fragment: bool = False, - use_extended_unique_key: bool = False, -) -> str: - """Computes a unique key for caching & deduplication of requests. - - This function computes a unique key by normalizing the provided URL and method. - If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and - included in the key. Otherwise, the unique key is just the normalized URL. - - Args: - url: The request URL. - method: The HTTP method, defaults to 'GET'. - payload: The request payload, defaults to None. - keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False. - use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False. - - Returns: - A string representing the unique key for the request. - """ - # Normalize the URL and method. - try: - normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment) - except Exception as exc: - logger.warning(f'Failed to normalize URL: {exc}') - normalized_url = url - - normalized_method = method.upper() - - # Compute and return the extended unique key if required. - if use_extended_unique_key: - payload_hash = compute_short_hash(payload) if payload else '' - return f'{normalized_method}({payload_hash}):{normalized_url}' - - # Log information if there is a non-GET request with a payload. - if normalized_method != 'GET' and payload: - logger.info( - f'We have encountered a {normalized_method} Request with a payload. This is fine. Just letting you know ' - 'that if your requests point to the same URL and differ only in method and payload, you should consider ' - 'using the "use_extended_unique_key" option.' - ) - - # Return the normalized URL as the unique key. 
- return normalized_url diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 688c2bc4..dd527497 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -12,8 +12,9 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify._crypto import crypto_random_object_id -from apify._utils import compute_unique_key +from crawlee._utils.crypto import crypto_random_object_id +from crawlee._utils.requests import compute_unique_key + from apify.actor import Actor diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 4b280d8a..03e8b78c 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -12,7 +12,8 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify._crypto import crypto_random_object_id +from crawlee._utils.crypto import crypto_random_object_id + from apify.actor import Actor from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client diff --git a/tests/integration/_utils.py b/tests/integration/_utils.py index b69d6d58..cbea845d 100644 --- a/tests/integration/_utils.py +++ b/tests/integration/_utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from apify._crypto import crypto_random_object_id +from crawlee._utils.crypto import crypto_random_object_id def generate_unique_resource_name(label: str) -> str: diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index 2d9e96e0..d1937aca 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -4,9 +4,10 @@ import json from typing import TYPE_CHECKING +from crawlee._utils.crypto import crypto_random_object_id + from ._utils import generate_unique_resource_name from apify import Actor -from apify._crypto import crypto_random_object_id if TYPE_CHECKING: from apify_client import ApifyClientAsync diff --git a/tests/integration/test_fixtures.py b/tests/integration/test_fixtures.py index c5c67a4d..93ff5588 100644 --- a/tests/integration/test_fixtures.py +++ b/tests/integration/test_fixtures.py @@ -3,8 +3,9 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING +from crawlee._utils.crypto import crypto_random_object_id + from apify import Actor -from apify._crypto import crypto_random_object_id if TYPE_CHECKING: from apify_client import ApifyClientAsync From 4b3b5811abde13b9bd17d2d0a3d58e8bf447ef33 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 13 Jun 2024 10:25:53 +0200 Subject: [PATCH 17/68] Remove obsolete tests --- tests/unit/test_lru_cache.py | 59 ------ tests/unit/test_utils.py | 368 ----------------------------------- 2 files changed, 427 deletions(-) delete mode 100644 tests/unit/test_lru_cache.py delete mode 100644 tests/unit/test_utils.py diff --git a/tests/unit/test_lru_cache.py b/tests/unit/test_lru_cache.py deleted file mode 100644 index fe298ae6..00000000 --- a/tests/unit/test_lru_cache.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -import pytest - -from apify._utils import LRUCache - - -@pytest.fixture() -def lru_cache() -> LRUCache[int]: - cache = LRUCache[int](3) - cache['a'] = 1 - cache['c'] = 3 - cache['b'] = 2 - return cache - - -def test_get(lru_cache: LRUCache[int]) -> None: - # Key error with non-existent key - with pytest.raises(KeyError): - _ = 
lru_cache['non-existent-key'] - # None when using .get instead - assert lru_cache.get('non-existent-key') is None - # Should return correct value for existing key - assert lru_cache['c'] == 3 - # Check if order of keys changed based on LRU rule - for actual, target in zip(lru_cache, ['a', 'b', 'c']): - assert actual == target - - -def test_set(lru_cache: LRUCache[int]) -> None: - assert len(lru_cache) == 3 - lru_cache['d'] = 4 - # Check if max_length is not exceeded - assert len(lru_cache) == 3 - # Check if oldest key is removed - assert 'a' not in lru_cache - # Check if the newest addition is at the end - assert list(lru_cache.items())[-1] == ('d', 4) - - -def test_del(lru_cache: LRUCache[int]) -> None: - # Key error on non-existent key - with pytest.raises(KeyError): - del lru_cache['non-existent-key'] - # No error with existing key - len_before_del = len(lru_cache) - del lru_cache['a'] - assert len(lru_cache) == len_before_del - 1 - assert 'a' not in lru_cache - - -def test_len(lru_cache: LRUCache[int]) -> None: - assert len(lru_cache) == len(lru_cache._cache) - lru_cache.clear() - assert len(lru_cache) == 0 - - -def test_iter(lru_cache: LRUCache[int]) -> None: - assert list(lru_cache) == ['a', 'c', 'b'] diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py deleted file mode 100644 index ac9b3567..00000000 --- a/tests/unit/test_utils.py +++ /dev/null @@ -1,368 +0,0 @@ -from __future__ import annotations - -import asyncio -import contextlib -import os -import time -from collections import OrderedDict -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -import pytest -from aiofiles.os import mkdir -from apify_shared.consts import ActorEnvVars, ApifyEnvVars - -from apify._utils import ( - budget_ow, - compute_short_hash, - compute_unique_key, - fetch_and_parse_env_var, - force_remove, - force_rename, - get_cpu_usage_percent, - get_memory_usage_bytes, - guess_file_extension, - maybe_parse_bool, - maybe_parse_datetime, - maybe_parse_int, - normalize_url, - raise_on_duplicate_storage, - raise_on_non_existing_storage, - run_func_at_interval_async, - unique_key_to_request_id, -) -from apify.consts import StorageTypes - -if TYPE_CHECKING: - from pathlib import Path - - -def test__fetch_and_parse_env_var(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'True') - monkeypatch.setenv(ActorEnvVars.MEMORY_MBYTES, '1024') - monkeypatch.setenv(ApifyEnvVars.META_ORIGIN, 'API') - monkeypatch.setenv(ActorEnvVars.STARTED_AT, '2022-12-02T15:19:34.907Z') - monkeypatch.setenv('DUMMY_BOOL', '1') - monkeypatch.setenv('DUMMY_DATETIME', '2022-12-02T15:19:34.907Z') - monkeypatch.setenv('DUMMY_INT', '1') - monkeypatch.setenv('DUMMY_STRING', 'DUMMY') - - assert fetch_and_parse_env_var(ApifyEnvVars.IS_AT_HOME) is True - assert fetch_and_parse_env_var(ActorEnvVars.MEMORY_MBYTES) == 1024 - assert fetch_and_parse_env_var(ApifyEnvVars.META_ORIGIN) == 'API' - assert fetch_and_parse_env_var(ActorEnvVars.STARTED_AT) == datetime(2022, 12, 2, 15, 19, 34, 907000, tzinfo=timezone.utc) - - assert fetch_and_parse_env_var('DUMMY_BOOL') == '1' # type: ignore - assert fetch_and_parse_env_var('DUMMY_DATETIME') == '2022-12-02T15:19:34.907Z' # type: ignore - assert fetch_and_parse_env_var('DUMMY_INT') == '1' # type: ignore - assert fetch_and_parse_env_var('DUMMY_STRING') == 'DUMMY' # type: ignore - assert fetch_and_parse_env_var('NONEXISTENT_ENV_VAR') is None # type: ignore - assert fetch_and_parse_env_var('NONEXISTENT_ENV_VAR', 'default') == 'default' # type: 
ignore - - -def test__get_cpu_usage_percent() -> None: - assert get_cpu_usage_percent() >= 0 - assert get_cpu_usage_percent() <= 100 - - -def test__get_memory_usage_bytes() -> None: - assert get_memory_usage_bytes() >= 0 - assert get_memory_usage_bytes() <= 1024 * 1024 * 1024 * 1024 - - -def test__maybe_parse_bool() -> None: - assert maybe_parse_bool('True') is True - assert maybe_parse_bool('true') is True - assert maybe_parse_bool('1') is True - assert maybe_parse_bool('False') is False - assert maybe_parse_bool('false') is False - assert maybe_parse_bool('0') is False - assert maybe_parse_bool(None) is False - assert maybe_parse_bool('bflmpsvz') is False - - -def test__maybe_parse_datetime() -> None: - assert maybe_parse_datetime('2022-12-02T15:19:34.907Z') == datetime(2022, 12, 2, 15, 19, 34, 907000, tzinfo=timezone.utc) - assert maybe_parse_datetime('2022-12-02T15:19:34.907') == '2022-12-02T15:19:34.907' - assert maybe_parse_datetime('anything') == 'anything' - - -def test__maybe_parse_int() -> None: - assert maybe_parse_int('0') == 0 - assert maybe_parse_int('1') == 1 - assert maybe_parse_int('-1') == -1 - assert maybe_parse_int('136749825') == 136749825 - assert maybe_parse_int('') is None - assert maybe_parse_int('abcd') is None - - -async def test__run_func_at_interval_async__sync_function() -> None: - # Test that it works with a synchronous functions - interval = 1.0 - initial_delay = 0.5 - increments = 3 - - test_var = 0 - - def sync_increment() -> None: - nonlocal test_var - test_var += 1 - - started_at = time.perf_counter() - sync_increment_task = asyncio.create_task(run_func_at_interval_async(sync_increment, interval)) - - try: - await asyncio.sleep(initial_delay) - - for i in range(increments): - assert test_var == i - - now = time.perf_counter() - sleep_until = started_at + initial_delay + (i + 1) * interval - sleep_for_secs = sleep_until - now - await asyncio.sleep(sleep_for_secs) - - assert test_var == increments - finally: - sync_increment_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await sync_increment_task - - await asyncio.sleep(1.5) - assert test_var == increments - - -async def test__run_func_at_interval_async_async__function() -> None: - # Test that it works with an asynchronous functions - interval = 1.0 - initial_delay = 0.5 - increments = 3 - - test_var = 0 - - async def async_increment() -> None: - nonlocal test_var - await asyncio.sleep(0.1) - test_var += 1 - - started_at = time.perf_counter() - async_increment_task = asyncio.create_task(run_func_at_interval_async(async_increment, interval)) - - try: - await asyncio.sleep(initial_delay) - - for i in range(increments): - assert test_var == i - - now = time.perf_counter() - sleep_until = started_at + initial_delay + (i + 1) * interval - sleep_for_secs = sleep_until - now - await asyncio.sleep(sleep_for_secs) - - assert test_var == increments - finally: - async_increment_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await async_increment_task - - await asyncio.sleep(1.5) - assert test_var == increments - - -async def test__force_remove(tmp_path: Path) -> None: - test_file_path = os.path.join(tmp_path, 'test.txt') - # Does not crash/raise when the file does not exist - assert os.path.exists(test_file_path) is False - await force_remove(test_file_path) - assert os.path.exists(test_file_path) is False - - # Removes the file if it exists - with open(test_file_path, 'a', encoding='utf-8'): # noqa: ASYNC101 - pass - assert os.path.exists(test_file_path) is True - await 
force_remove(test_file_path) - assert os.path.exists(test_file_path) is False - - -def test__raise_on_non_existing_storage() -> None: - with pytest.raises(ValueError, match='Dataset with id "kckxQw6j6AtrgyA09" does not exist.'): - raise_on_non_existing_storage(StorageTypes.DATASET, 'kckxQw6j6AtrgyA09') - - -def test__raise_on_duplicate_storage() -> None: - with pytest.raises(ValueError, match='Dataset with name "test" already exists.'): - raise_on_duplicate_storage(StorageTypes.DATASET, 'name', 'test') - - -def test__guess_file_extension() -> None: - # Can guess common types properly - assert guess_file_extension('application/json') == 'json' - assert guess_file_extension('application/xml') == 'xml' - assert guess_file_extension('text/plain') == 'txt' - - # Can handle unusual formats - assert guess_file_extension(' application/json ') == 'json' - assert guess_file_extension('APPLICATION/JSON') == 'json' - assert guess_file_extension('application/json;charset=utf-8') == 'json' - - # Returns None for non-existent content types - assert guess_file_extension('clearly not a content type') is None - assert guess_file_extension('') is None - - -def test__unique_key_to_request_id() -> None: - # Right side from `uniqueKeyToRequestId` in Crawlee - assert unique_key_to_request_id('abc') == 'ungWv48BzpBQUDe' - assert unique_key_to_request_id('test') == 'n4bQgYhMfWWaLqg' - - -async def test__force_rename(tmp_path: Path) -> None: - src_dir = os.path.join(tmp_path, 'src') - dst_dir = os.path.join(tmp_path, 'dst') - src_file = os.path.join(src_dir, 'src_dir.txt') - dst_file = os.path.join(dst_dir, 'dst_dir.txt') - # Won't crash if source directory does not exist - assert os.path.exists(src_dir) is False - await force_rename(src_dir, dst_dir) - - # Will remove dst_dir if it exists (also covers normal case) - # Create the src_dir with a file in it - await mkdir(src_dir) - with open(src_file, 'a', encoding='utf-8'): # noqa: ASYNC101 - pass - # Create the dst_dir with a file in it - await mkdir(dst_dir) - with open(dst_file, 'a', encoding='utf-8'): # noqa: ASYNC101 - pass - assert os.path.exists(src_file) is True - assert os.path.exists(dst_file) is True - await force_rename(src_dir, dst_dir) - assert os.path.exists(src_dir) is False - assert os.path.exists(dst_file) is False - # src_dir.txt should exist in dst_dir - assert os.path.exists(os.path.join(dst_dir, 'src_dir.txt')) is True - - -def test__budget_ow() -> None: - budget_ow( - { - 'a': 123, - 'b': 'string', - 'c': datetime.now(timezone.utc), - }, - { - 'a': (int, True), - 'b': (str, False), - 'c': (datetime, True), - }, - ) - with pytest.raises(ValueError, match='required'): - budget_ow({}, {'id': (str, True)}) - with pytest.raises(ValueError, match='must be of type'): - budget_ow({'id': 123}, {'id': (str, True)}) - # Check if subclasses pass the check - budget_ow( - { - 'ordered_dict': OrderedDict(), - }, - { - 'ordered_dict': (dict, False), - }, - ) - - -def test_get_short_base64_hash_with_known_input() -> None: - data = b'Hello world!' 
- expected_hash = 'c0535e4b' - assert compute_short_hash(data) == expected_hash, 'The hash does not match the expected output' - - -def test_get_short_base64_hash_with_empty_input() -> None: - data = b'' - expected_hash = 'e3b0c442' - assert compute_short_hash(data) == expected_hash, 'The hash for an empty input should follow the expected pattern' - - -def test_get_short_base64_hash_output_length() -> None: - data = b'some random data' - assert len(compute_short_hash(data)) == 8, 'The output hash should be 8 characters long' - - -def test_get_short_base64_hash_differentiates_input() -> None: - data1 = b'input 1' - data2 = b'input 2' - assert compute_short_hash(data1) != compute_short_hash(data2), 'Different inputs should produce different hashes' - - -@pytest.mark.parametrize( - ('url', 'expected_output', 'keep_url_fragment'), - [ - ('https://example.com/?utm_source=test&utm_medium=test&key=value', 'https://example.com?key=value', False), - ('http://example.com/?key=value&another_key=another_value', 'http://example.com?another_key=another_value&key=value', False), - ('HTTPS://EXAMPLE.COM/?KEY=VALUE', 'https://example.com?key=value', False), - ('', '', False), - ('http://example.com/#fragment', 'http://example.com#fragment', True), - ('http://example.com/#fragment', 'http://example.com', False), - (' https://example.com/ ', 'https://example.com', False), - ('http://example.com/?b=2&a=1', 'http://example.com?a=1&b=2', False), - ], - ids=[ - 'remove_utm_params', - 'retain_sort_non_utm_params', - 'convert_scheme_netloc_to_lowercase', - 'handle_empty_url', - 'retain_fragment', - 'remove_fragment', - 'trim_whitespace', - 'sort_query_params', - ], -) -def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: bool) -> None: - output = normalize_url(url, keep_url_fragment=keep_url_fragment) - assert output == expected_output - - -@pytest.mark.parametrize( - ('url', 'method', 'payload', 'keep_url_fragment', 'use_extended_unique_key', 'expected_output'), - [ - ('http://example.com', 'GET', None, False, False, 'http://example.com'), - ('http://example.com', 'POST', None, False, False, 'http://example.com'), - ('http://example.com', 'GET', b'data', False, False, 'http://example.com'), - ('http://example.com', 'GET', b'data', False, True, 'GET(3a6eb079):http://example.com'), - ('http://example.com', 'POST', b'data', False, True, 'POST(3a6eb079):http://example.com'), - ('http://example.com#fragment', 'GET', None, True, False, 'http://example.com#fragment'), - ('http://example.com#fragment', 'GET', None, False, False, 'http://example.com'), - ('http://example.com', 'DELETE', b'test', False, True, 'DELETE(9f86d081):http://example.com'), - ('https://example.com?utm_content=test', 'GET', None, False, False, 'https://example.com'), - ('https://example.com?utm_content=test', 'GET', None, True, False, 'https://example.com'), - ], - ids=[ - 'simple_get', - 'simple_post', - 'get_with_payload', - 'get_with_payload_extended', - 'post_with_payload_extended', - 'get_with_fragment', - 'get_remove_fragment', - 'delete_with_payload_extended', - 'get_remove_utm', - 'get_keep_utm_fragment', - ], -) -def test_compute_unique_key( - url: str, - method: str, - payload: bytes | None, - *, - keep_url_fragment: bool, - use_extended_unique_key: bool, - expected_output: str, -) -> None: - output = compute_unique_key( - url, - method, - payload, - keep_url_fragment=keep_url_fragment, - use_extended_unique_key=use_extended_unique_key, - ) - assert output == expected_output From 
47c2c13c9b7075e4a2b1d2e8083daf9caa809ea9 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 13 Jun 2024 13:17:33 +0200 Subject: [PATCH 18/68] Migrate ProxyConfiguration --- src/apify/actor.py | 5 +- src/apify/proxy_configuration.py | 222 +++++++++++-------------------- 2 files changed, 82 insertions(+), 145 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 092f2c07..794be11a 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -26,9 +26,10 @@ if TYPE_CHECKING: import logging - from collections.abc import Awaitable from types import TracebackType + from crawlee.proxy_configuration import NewUrlFunction + MainReturnType = TypeVar('MainReturnType') @@ -875,7 +876,7 @@ async def create_proxy_configuration( groups: list[str] | None = None, country_code: str | None = None, proxy_urls: list[str] | None = None, - new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None, + new_url_function: NewUrlFunction | None = None, ) -> ProxyConfiguration | None: """Create a ProxyConfiguration object with the passed proxy configuration. diff --git a/src/apify/proxy_configuration.py b/src/apify/proxy_configuration.py index c8c84510..9ff534b2 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/proxy_configuration.py @@ -1,21 +1,25 @@ from __future__ import annotations -import inspect import ipaddress import re -from typing import TYPE_CHECKING, Any, Awaitable, Callable, Pattern, TypedDict +from dataclasses import dataclass, field +from re import Pattern +from typing import TYPE_CHECKING, Any from urllib.parse import urljoin, urlparse import httpx from apify_shared.consts import ApifyEnvVars from apify_shared.utils import ignore_docs +from crawlee.proxy_configuration import NewUrlFunction +from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration +from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo from apify.config import Configuration from apify.log import logger if TYPE_CHECKING: from apify_client import ApifyClientAsync - from typing_extensions import NotRequired + from crawlee.models import Request APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$') COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$') @@ -62,30 +66,16 @@ def _check( raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}') -class ProxyInfo(TypedDict): +@dataclass +class ProxyInfo(CrawleeProxyInfo): """Provides information about a proxy connection that is used for requests.""" - url: str - """The URL of the proxy.""" - - hostname: str - """The hostname of the proxy.""" - - port: int - """The proxy port.""" - - username: NotRequired[str] - """The username for the proxy.""" - - password: str - """The password for the proxy.""" - - groups: NotRequired[list[str]] + groups: list[str] = field(default_factory=list) """An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy). If not provided, the proxy will select the groups automatically. """ - country_code: NotRequired[str] + country_code: str | None = None """If set and relevant proxies are available in your Apify account, all proxied requests will use IP addresses that are geolocated to the specified country. For example `GB` for IPs from Great Britain. Note that online services often have their own rules for handling @@ -96,11 +86,8 @@ class ProxyInfo(TypedDict): This parameter is optional, by default, the proxy uses all available proxy servers from all countries. 
""" - session_id: NotRequired[str] - """The identifier of the used proxy session, if used. Using the same session ID guarantees getting the same proxy URL.""" - -class ProxyConfiguration: +class ProxyConfiguration(CrawleeProxyConfiguration): """Configures a connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or blacklists. @@ -112,30 +99,18 @@ class ProxyConfiguration: Your list of proxy URLs will be rotated by the configuration, if this option is provided. """ - is_man_in_the_middle = False - - _next_custom_url_index = 0 - _proxy_urls: list[str] - _used_proxy_urls: dict[str, str] - _new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None - _groups: list[str] - _country_code: str | None = None - _password: str | None = None - _hostname: str - _port: int - _uses_apify_proxy: bool | None = None - _actor_config: Configuration - _apify_client: ApifyClientAsync | None = None + _configuration: Configuration @ignore_docs def __init__( - self: ProxyConfiguration, + self, *, password: str | None = None, groups: list[str] | None = None, country_code: str | None = None, proxy_urls: list[str] | None = None, - new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None, + new_url_function: NewUrlFunction | None = None, + tiered_proxy_urls: list[list[str]] | None = None, _actor_config: Configuration | None = None, _apify_client: ApifyClientAsync | None = None, ) -> None: @@ -147,52 +122,48 @@ def __init__( country_code (str, optional): Country which the Apify Proxy should use, if provided. proxy_urls (list of str, optional): Custom proxy server URLs which should be rotated through. new_url_function (Callable, optional): Function which returns a custom proxy URL to be used. + tiered_proxy_urls (list of list of str, optional): Proxy URLs arranged into tiers """ + _actor_config = _actor_config or Configuration.get_global_configuration() + + super().__init__(proxy_urls=proxy_urls, new_url_function=new_url_function, tiered_proxy_urls=tiered_proxy_urls, configuration=_actor_config) + self._configuration = _actor_config + if groups: groups = [str(group) for group in groups] for group in groups: _check(group, label='groups', pattern=APIFY_PROXY_VALUE_REGEX) + if country_code: country_code = str(country_code) _check(country_code, label='country_code', pattern=COUNTRY_CODE_REGEX) - if proxy_urls: - for i, url in enumerate(proxy_urls): - if not is_url(url): - raise ValueError(f'proxy_urls[{i}] ("{url}") is not a valid URL') - # Validation - if proxy_urls and new_url_function: - raise ValueError('Cannot combine custom proxies in "proxy_urls" with custom generating function in "new_url_function".') - - if (proxy_urls or new_url_function) and (groups or country_code): + if (proxy_urls or new_url_function or tiered_proxy_urls) and (groups or country_code): raise ValueError( 'Cannot combine custom proxies with Apify Proxy!' ' It is not allowed to set "proxy_urls" or "new_url_function" combined with' ' "groups" or "country_code".' ) - # mypy has a bug with narrowing types for filter (https://github.com/python/mypy/issues/12682) - if proxy_urls and next(filter(lambda url: 'apify.com' in url, proxy_urls), None): # type: ignore + if proxy_urls and any('apify.com' in url for url in proxy_urls): logger.warning( 'Some Apify proxy features may work incorrectly. 
Please consider setting up Apify properties instead of `proxy_urls`.\n' 'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration' ) - self._actor_config = _actor_config or Configuration.get_global_configuration() + self.is_man_in_the_middle = False + self._apify_client = _apify_client - self._hostname = self._actor_config.proxy_hostname - self._port = self._actor_config.proxy_port - self._password = password or self._actor_config.proxy_password + self._hostname = self._configuration.proxy_hostname + self._port = self._configuration.proxy_port + self._password = password or self._configuration.proxy_password - self._proxy_urls = list(proxy_urls) if proxy_urls else [] - self._used_proxy_urls = {} - self._new_url_function = new_url_function self._groups = list(groups) if groups else [] self._country_code = country_code self._uses_apify_proxy = not (proxy_urls or new_url_function) - async def initialize(self: ProxyConfiguration) -> None: + async def initialize(self) -> None: """Load the Apify Proxy password if the API token is provided and check access to Apify Proxy and provided proxy groups. Only called if Apify Proxy configuration is used. @@ -205,50 +176,12 @@ async def initialize(self: ProxyConfiguration) -> None: await self._maybe_fetch_password() await self._check_access() - async def new_url(self: ProxyConfiguration, session_id: int | str | None = None) -> str: - """Return a new proxy URL based on provided configuration options and the `sessionId` parameter. - - Args: - session_id (int or str, optional): Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). - All the HTTP requests going through the proxy with the same session identifier - will use the same target proxy server (i.e. the same IP address). - The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. - - Returns: - str: A string with a proxy URL, including authentication credentials and port number. - For example, `http://bob:password123@proxy.example.com:8000` - """ - if session_id is not None: - session_id = f'{session_id}' - _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX) - - if self._new_url_function: - try: - res = self._new_url_function(session_id) - if inspect.isawaitable(res): - res = await res - return str(res) - except Exception as exc: - raise ValueError('The provided "new_url_function" did not return a valid URL') from exc - - if self._proxy_urls: - if not session_id: - index = self._next_custom_url_index - self._next_custom_url_index = (self._next_custom_url_index + 1) % len(self._proxy_urls) - return self._proxy_urls[index] - - if session_id not in self._used_proxy_urls: - index = self._next_custom_url_index - self._next_custom_url_index = (self._next_custom_url_index + 1) % len(self._proxy_urls) - self._used_proxy_urls[session_id] = self._proxy_urls[index] - - return self._used_proxy_urls[session_id] - - username = self._get_username(session_id) - - return f'http://{username}:{self._password}@{self._hostname}:{self._port}' - - async def new_proxy_info(self: ProxyConfiguration, session_id: int | str | None = None) -> ProxyInfo: + async def new_proxy_info( + self, + session_id: str | None = None, + request: Request | None = None, + proxy_tier: int | None = None, + ) -> ProxyInfo | None: """Create a new ProxyInfo object. Use it if you want to work with a rich representation of a proxy URL. 
@@ -256,49 +189,48 @@ async def new_proxy_info(self: ProxyConfiguration, session_id: int | str | None Args: session_id (int or str, optional): Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). - All the HTTP requests going through the proxy with the same session identifier - will use the same target proxy server (i.e. the same IP address). - The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. + All the HTTP requests going through the proxy with the same session identifier + will use the same target proxy server (i.e. the same IP address). + The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. + request (Request, optional): request for which the proxy info is being issued, used in proxy tier handling + proxy_tier (int, optional): allows forcing the proxy tier to be used Returns: ProxyInfo: Dictionary that represents information about the proxy and its configuration. """ if session_id is not None: - session_id = f'{session_id}' _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX) - url = await self.new_url(session_id) - res: ProxyInfo + proxy_info = await super().new_proxy_info(session_id=session_id, request=request, proxy_tier=proxy_tier) + + if proxy_info is None: + return None + if self._uses_apify_proxy: - res = { - 'url': url, - 'hostname': self._hostname, - 'port': self._port, - 'username': self._get_username(session_id), - 'password': self._password or '', - 'groups': self._groups, - } - if self._country_code: - res['country_code'] = self._country_code - if session_id is not None: - res['session_id'] = session_id - return res - - parsed_url = urlparse(url) - assert parsed_url.hostname is not None # noqa: S101 - assert parsed_url.port is not None # noqa: S101 - res = { - 'url': url, - 'hostname': parsed_url.hostname, - 'port': parsed_url.port, - 'password': parsed_url.password or '', - } - if parsed_url.username: - res['username'] = parsed_url.username - return res - - async def _maybe_fetch_password(self: ProxyConfiguration) -> None: - token = self._actor_config.token + return ProxyInfo( + url=proxy_info.url, + hostname=proxy_info.hostname, + port=proxy_info.port, + username=self._get_username(session_id), + password=self._password or '', + session_id=proxy_info.session_id, + proxy_tier=proxy_info.proxy_tier, + groups=self._groups, + country_code=self._country_code or None, + ) + + return ProxyInfo( + url=proxy_info.url, + hostname=proxy_info.hostname, + port=proxy_info.port, + username=proxy_info.username, + password=proxy_info.password, + session_id=proxy_info.session_id, + proxy_tier=proxy_info.proxy_tier, + ) + + async def _maybe_fetch_password(self) -> None: + token = self._configuration.token if token and self._apify_client: user_info = await self._apify_client.user().get() @@ -321,11 +253,15 @@ async def _maybe_fetch_password(self: ProxyConfiguration) -> None: f' If you add the "{ApifyEnvVars.TOKEN}" environment variable, the password will be automatically inferred.' 
) - async def _check_access(self: ProxyConfiguration) -> None: - proxy_status_url = f'{self._actor_config.proxy_status_url}/?format=json' + async def _check_access(self) -> None: + proxy_status_url = f'{self._configuration.proxy_status_url}/?format=json' + proxy_info = await self.new_proxy_info() + + if proxy_info is None: + return status = None - async with httpx.AsyncClient(proxies=await self.new_url()) as client: + async with httpx.AsyncClient(proxies=proxy_info.url) as client: for _ in range(2): try: response = await client.get(proxy_status_url) @@ -346,7 +282,7 @@ async def _check_access(self: ProxyConfiguration) -> None: "If you see some, it most likely means you don't have access to either all or some of the proxies you're trying to use." ) - def _get_username(self: ProxyConfiguration, session_id: int | str | None = None) -> str: + def _get_username(self, session_id: int | str | None = None) -> str: if session_id is not None: session_id = f'{session_id}' From 7abbe38250c9993f716e5d9de73843fa47a3ce27 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 18 Jun 2024 12:38:53 +0200 Subject: [PATCH 19/68] Use correct types in RequestQueueClient --- .../request_queue_client.py | 73 ++++++++++++------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py index 5db8077b..5ca4729c 100644 --- a/src/apify/apify_storage_client/request_queue_client.py +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -3,10 +3,21 @@ from typing import TYPE_CHECKING from crawlee.base_storage_client.base_request_queue_client import BaseRequestQueueClient -from crawlee.models import Request, RequestQueueHead, RequestQueueMetadata, RequestQueueOperationInfo +from crawlee.models import ( + BatchRequestsOperationResponse, + ProcessedRequest, + ProlongRequestLockResponse, + Request, + RequestListResponse, + RequestQueueHead, + RequestQueueHeadWithLocks, + RequestQueueMetadata, +) from typing_extensions import override if TYPE_CHECKING: + from collections.abc import Sequence + from apify_client.clients import RequestQueueClientAsync @@ -46,10 +57,12 @@ async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: ) @override - async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> dict: - return await self._client.list_and_lock_head( - lock_secs=lock_secs, - limit=limit, + async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = None) -> RequestQueueHeadWithLocks: + return RequestQueueHeadWithLocks.model_validate( + await self._client.list_and_lock_head( + lock_secs=lock_secs, + limit=limit, + ) ) @override @@ -58,8 +71,8 @@ async def add_request( request: Request, *, forefront: bool = False, - ) -> RequestQueueOperationInfo: - return RequestQueueOperationInfo.model_validate( + ) -> ProcessedRequest: + return ProcessedRequest.model_validate( await self._client.add_request( request=request.model_dump(by_alias=True), forefront=forefront, @@ -77,8 +90,8 @@ async def update_request( request: Request, *, forefront: bool = False, - ) -> RequestQueueOperationInfo: - return RequestQueueOperationInfo.model_validate( + ) -> ProcessedRequest: + return ProcessedRequest.model_validate( await self._client.update_request( request=request.model_dump(by_alias=True), forefront=forefront, @@ -96,11 +109,13 @@ async def prolong_request_lock( *, forefront: bool = False, lock_secs: int, - ) -> dict: - return await self._client.prolong_request_lock( - 
request_id=request_id, - forefront=forefront, - lock_secs=lock_secs, + ) -> ProlongRequestLockResponse: + return ProlongRequestLockResponse.model_validate( + await self._client.prolong_request_lock( + request_id=request_id, + forefront=forefront, + lock_secs=lock_secs, + ) ) @override @@ -118,19 +133,23 @@ async def delete_request_lock( @override async def batch_add_requests( self, - requests: list[Request], + requests: Sequence[Request], *, forefront: bool = False, - ) -> dict: - return await self._client.batch_add_requests( - requests=[r.model_dump(by_alias=True) for r in requests], - forefront=forefront, + ) -> BatchRequestsOperationResponse: + return BatchRequestsOperationResponse.model_validate( + await self._client.batch_add_requests( + requests=[r.model_dump(by_alias=True) for r in requests], + forefront=forefront, + ) ) @override - async def batch_delete_requests(self, requests: list[Request]) -> dict: - return await self._client.batch_delete_requests( - requests=[r.model_dump(by_alias=True) for r in requests], + async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsOperationResponse: + return BatchRequestsOperationResponse.model_validate( + await self._client.batch_delete_requests( + requests=[r.model_dump(by_alias=True) for r in requests], + ) ) @override @@ -139,8 +158,10 @@ async def list_requests( *, limit: int | None = None, exclusive_start_id: str | None = None, - ) -> dict: # TODO type - return await self._client.list_requests( - limit=limit, - exclusive_start_id=exclusive_start_id, + ) -> RequestListResponse: + return RequestListResponse.model_validate( + await self._client.list_requests( + limit=limit, + exclusive_start_id=exclusive_start_id, + ) ) From b5765d4315ee7583ff0db245defa9e520164cad1 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 27 Jun 2024 17:38:42 +0200 Subject: [PATCH 20/68] Update Actor class --- pyproject.toml | 2 ++ src/apify/actor.py | 56 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 663afd4e..7d8ac064 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "sortedcollections >= 2.0.0", "typing-extensions >= 4.1.0", "websockets >= 10.1", + "werkzeug >= 3.0.0", ] [project.optional-dependencies] @@ -90,6 +91,7 @@ line-length = 150 select = ["ALL"] ignore = [ "ANN101", # Missing type annotation for `self` in method + "ANN102", # Missing type annotation for `cls` in method "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename} "BLE001", # Do not catch blind exception "C901", # `{name}` is too complex diff --git a/src/apify/actor.py b/src/apify/actor.py index 794be11a..b6a2fc0d 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -12,7 +12,9 @@ from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value from crawlee.events.types import Event, EventPersistStateData from crawlee.storage_client_manager import StorageClientManager +from pydantic import AliasChoices from typing_extensions import Self +from werkzeug.local import LocalProxy from apify._crypto import decrypt_input_secrets, load_private_key from apify._utils import get_system_info, is_running_in_ipython @@ -50,14 +52,20 @@ def __init__(self, config: Configuration | None = None) -> None: Args: config (Configuration, optional): The actor configuration to be used. If not passed, a new Configuration instance will be created. 
""" - self._configuration = config or Configuration() + self._configuration = config or Configuration.get_global_configuration() self._apify_client = self.new_client() self._event_manager: EventManager if self._configuration.is_at_home: - self._event_manager = PlatformEventManager(config=self._configuration) + self._event_manager = PlatformEventManager( + config=self._configuration, + persist_state_interval=self._configuration.persist_state_interval, + ) else: - self._event_manager = LocalEventManager() + self._event_manager = LocalEventManager( + system_info_interval=self._configuration.system_info_interval, + persist_state_interval=self._configuration.persist_state_interval, + ) self._is_initialized = False @@ -96,6 +104,12 @@ async def __aexit__( else: await self.exit() + def __repr__(self) -> str: + if self is _default_instance: + return '' + + return super().__repr__() + @property def apify_client(self) -> ApifyClientAsync: """The ApifyClientAsync instance the Actor instance uses.""" @@ -516,9 +530,25 @@ def get_env(self) -> dict: """ self._raise_if_not_initialized() - config = self._configuration.model_dump(by_alias=True) + config = dict[str, Any]() + for field_name, field in Configuration.model_fields.items(): + if field.deprecated: + continue + + if field.alias: + aliases = [field.alias] + elif isinstance(field.validation_alias, str): + aliases = [field.validation_alias] + elif isinstance(field.validation_alias, AliasChoices): + aliases = cast(list[str], field.validation_alias.choices) + else: + aliases = [field_name] + + for alias in aliases: + config[alias] = getattr(self._configuration, field_name) + env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]} - return {option_name: config[env_var] for env_var, option_name in env_vars} + return {option_name: config[env_var] for env_var, option_name in env_vars.items() if env_var in config} async def start( self, @@ -886,7 +916,7 @@ async def create_proxy_configuration( For more details and code examples, see the `ProxyConfiguration` class. Args: - actor_proxy_input (dict, optional): Proxy configuration field from the actor input, if actor has such input field. + actor_proxy_input (dict, optional): Proxy configuration field from the actor input, if input has such input field. If you pass this argument, all the other arguments will be inferred from it. password (str, optional): Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. groups (list of str, optional): Proxy groups which the Apify Proxy should use, if provided. 
@@ -924,5 +954,17 @@ async def create_proxy_configuration( return proxy_configuration -Actor = _ActorType() +_default_instance: _ActorType | None = None + + +def _get_default_instance() -> _ActorType: + global _default_instance # noqa: PLW0603 + + if not _default_instance: + _default_instance = _ActorType() + + return _default_instance + + +Actor = cast(_ActorType, LocalProxy(_get_default_instance)) """The entry point of the SDK, through which all the actor operations should be done.""" From 23b822d87cbd4a9cb6bc68a25d2d76f24f016f6c Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 27 Jun 2024 17:39:03 +0200 Subject: [PATCH 21/68] Update ProxyConfiguration class --- src/apify/proxy_configuration.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/apify/proxy_configuration.py b/src/apify/proxy_configuration.py index 9ff534b2..553088ee 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/proxy_configuration.py @@ -126,9 +126,6 @@ def __init__( """ _actor_config = _actor_config or Configuration.get_global_configuration() - super().__init__(proxy_urls=proxy_urls, new_url_function=new_url_function, tiered_proxy_urls=tiered_proxy_urls, configuration=_actor_config) - self._configuration = _actor_config - if groups: groups = [str(group) for group in groups] for group in groups: @@ -151,6 +148,16 @@ def __init__( 'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration' ) + self._uses_apify_proxy = not (proxy_urls or new_url_function or tiered_proxy_urls) + + super().__init__( + proxy_urls=[f'http://{_actor_config.proxy_hostname}:{_actor_config.proxy_port}'] if self._uses_apify_proxy else proxy_urls, + new_url_function=new_url_function, + tiered_proxy_urls=tiered_proxy_urls, + configuration=_actor_config, + ) + self._configuration = _actor_config + self.is_man_in_the_middle = False self._apify_client = _apify_client @@ -161,7 +168,6 @@ def __init__( self._groups = list(groups) if groups else [] self._country_code = country_code - self._uses_apify_proxy = not (proxy_urls or new_url_function) async def initialize(self) -> None: """Load the Apify Proxy password if the API token is provided and check access to Apify Proxy and provided proxy groups. 
@@ -207,11 +213,14 @@ async def new_proxy_info( return None if self._uses_apify_proxy: + parsed_url = httpx.URL(proxy_info.url) + username = self._get_username(session_id) + return ProxyInfo( - url=proxy_info.url, + url=f'http://{username}:{self._password or ""}@{parsed_url.host}:{parsed_url.port}', hostname=proxy_info.hostname, port=proxy_info.port, - username=self._get_username(session_id), + username=username, password=self._password or '', session_id=proxy_info.session_id, proxy_tier=proxy_info.proxy_tier, From 2988a15656ee14d14a4a20dc388d7a6af45db408 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 27 Jun 2024 17:40:10 +0200 Subject: [PATCH 22/68] Add all known and supported config options --- src/apify/config.py | 179 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 30 deletions(-) diff --git a/src/apify/config.py b/src/apify/config.py index 8234e892..38fad707 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -2,11 +2,11 @@ from __future__ import annotations from datetime import datetime, timedelta -from typing import Annotated +from typing import Annotated, cast from crawlee._utils.models import timedelta_ms from crawlee.configuration import Configuration as CrawleeConfiguration -from pydantic import Field +from pydantic import AliasChoices, Field from typing_extensions import Self @@ -17,45 +17,158 @@ class Configuration(CrawleeConfiguration): or it can be specific to each `Actor` instance on the `actor.config` property. """ - actor_id: Annotated[str | None, Field(alias='actor_id')] = None - actor_run_id: Annotated[str | None, Field(alias='actor_run_id')] = None - actor_build_id: Annotated[str | None, Field()] = None - actor_build_number: Annotated[str | None, Field()] = None - actor_task_id: Annotated[str | None, Field(alias='actor_task_id')] = None - actor_events_ws_url: Annotated[str | None, Field(alias='actor_events_websocket_url')] = None + actor_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_id', + 'apify_actor_id', + 'apify_act_id', + ) + ), + ] = None + + actor_run_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_run_id', + 'apify_actor_run_id', + 'apify_act_run_id', + ) + ), + ] = None + + actor_build_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_build_id', + 'apify_actor_build_id', + ) + ), + ] = None + + actor_build_number: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_build_number', + 'apify_actor_build_number', + ) + ), + ] = None + + actor_task_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_task_id', + 'apify_actor_task_id', + ) + ), + ] = None + + actor_events_ws_url: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_events_websocket_url', + 'apify_actor_events_ws_url', + ) + ), + ] = None + api_base_url: Annotated[str, Field(alias='apify_api_base_url')] = 'https://api.apify.com' + api_public_base_url: Annotated[str, Field(alias='apify_api_public_base_url')] = 'https://api.apify.com' - default_dataset_id: Annotated[str, Field(alias='actor_default_dataset_id')] = 'default' - default_key_value_store_id: Annotated[str, Field(alias='actor_default_key_value_store_id')] = 'default' - default_request_queue_id: Annotated[str, Field(alias='actor_default_request_queue_id')] = 'default' - disable_browser_sandbox: Annotated[bool, Field(alias='apify_disable_browser_sandbox')] = False - headless: Annotated[bool, Field(alias='apify_headless')] = True - 
input_key: Annotated[str, Field(alias='actor_input_key')] = 'INPUT' + + dedicated_cpus: Annotated[float | None, Field(alias='apify_dedicated_cpus')] = None + + disable_outdated_warning: Annotated[bool, Field(alias='apify_disable_outdated_warning')] = False + + fact: Annotated[str | None, Field(alias='apify_fact')] = None + + input_key: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_input_key', + 'apify_input_key', + 'crawlee_input_key', + ) + ), + ] = 'INPUT' + input_secrets_private_key_file: Annotated[str | None, Field(alias='apify_input_secrets_private_key_file')] = None + input_secrets_private_key_passphrase: Annotated[str | None, Field(alias='apify_input_secrets_private_key_passphrase')] = None + is_at_home: Annotated[bool, Field(alias='apify_is_at_home')] = False + + latest_sdk_version: Annotated[str | None, Field(alias='apify_sdk_latest_version', deprecated=True)] = None + + log_format: Annotated[str | None, Field(alias='apify_log_format', deprecated=True)] = None + max_paid_dataset_items: Annotated[int | None, Field(alias='actor_max_paid_dataset_items')] = None - memory_mbytes: Annotated[int | None, Field(alias='actor_memory_mbytes')] = None + meta_origin: Annotated[str | None, Field(alias='apify_meta_origin')] = None - metamorph_after_sleep: Annotated[timedelta_ms, Field('apify_metamorph_after_sleep_millis')] = timedelta(minutes=5) - persist_state_interval: Annotated[timedelta_ms, Field('apify_persist_state_interval_millis')] = timedelta(minutes=1) - persist_storage: Annotated[bool, Field(alias='apify_persist_storage')] = True + + metamorph_after_sleep: Annotated[timedelta_ms, Field(alias='apify_metamorph_after_sleep_millis')] = timedelta(minutes=5) + proxy_hostname: Annotated[str, Field(alias='apify_proxy_hostname')] = 'proxy.apify.com' + proxy_password: Annotated[str | None, Field(alias='apify_proxy_password')] = None + proxy_port: Annotated[int, Field(alias='apify_proxy_port')] = 8000 + proxy_status_url: Annotated[str, Field(alias='apify_proxy_status_url')] = 'http://proxy.apify.com' - purge_on_start: Annotated[bool, Field(alias='apify_purge_on_start')] = False - started_at: Annotated[datetime | None, Field(alias='actor_started_at')] = None - timeout_at: Annotated[datetime | None, Field(alias='actor_timeout_at')] = None + + started_at: Annotated[ + datetime | None, + Field( + validation_alias=AliasChoices( + 'actor_started_at', + 'apify_started_at', + ) + ), + ] = None + + timeout_at: Annotated[ + datetime | None, + Field( + validation_alias=AliasChoices( + 'actor_timeout_at', + 'apify_timeout_at', + ) + ), + ] = None + token: Annotated[str | None, Field(alias='apify_token')] = None + user_id: Annotated[str | None, Field(alias='apify_user_id')] = None - web_server_port: Annotated[int, Field(alias='actor_web_server_port')] = 4321 - web_server_url: Annotated[str, Field(alias='actor_web_server_url')] = 'http://localhost:4321' - xvfb: Annotated[bool, Field(alias='apify_xvfb')] = False - system_info_interval: Annotated[timedelta_ms, Field(alias='apify_system_info_interval_millis')] = timedelta(minutes=1) - # TODO chrome_executable_path, container_port, container_url, dedicated_cpus, default_browser_path, - # disable_browser_sandbox, input_secrets_private_key_file, input_secrets_private_key_passphrase, max_used_cpu_ratio + web_server_port: Annotated[ + int, + Field( + validation_alias=AliasChoices( + 'actor_web_server_port', + 'apify_container_port', + ) + ), + ] = 4321 + + web_server_url: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 
'actor_web_server_url', + 'apify_container_url', + ) + ), + ] = 'http://localhost:4321' + + workflow_key: Annotated[str | None, Field(alias='apify_workflow_key')] = None @classmethod def get_global_configuration(cls) -> Self: @@ -64,7 +177,13 @@ def get_global_configuration(cls) -> Self: The global configuration applies when you call actor methods via their static versions, e.g. `Actor.init()`. Also accessible via `Actor.config`. """ - if cls._default_instance is None: - cls._default_instance = cls() + if CrawleeConfiguration._default_instance is None: + import os + print(os.environ.get('APIFY_IS_AT_HOME')) + CrawleeConfiguration._default_instance = cls() + + return cast(Self, CrawleeConfiguration._default_instance) + - return cls._default_instance +# Monkey-patch the base class so that it works with the extended configuration +CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration From 697087e33d41bd87c308d78c4fe070e4046079f3 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 27 Jun 2024 17:40:22 +0200 Subject: [PATCH 23/68] Update tests --- tests/integration/conftest.py | 7 +- tests/integration/test_actor_api_helpers.py | 4 +- tests/integration/test_request_queue.py | 4 +- tests/unit/actor/test_actor_dataset.py | 5 +- tests/unit/actor/test_actor_env_helpers.py | 67 ++- tests/unit/actor/test_actor_helpers.py | 63 ++- .../unit/actor/test_actor_key_value_store.py | 6 +- tests/unit/actor/test_actor_lifecycle.py | 82 ++-- tests/unit/actor/test_actor_log.py | 4 +- .../actor/test_actor_memory_storage_e2e.py | 57 ++- tests/unit/conftest.py | 31 +- tests/unit/test_config.py | 86 ---- tests/unit/test_event_manager.py | 381 +++++------------- tests/unit/test_proxy_configuration.py | 106 +++-- 14 files changed, 394 insertions(+), 509 deletions(-) delete mode 100644 tests/unit/test_config.py diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 37c5a654..380a5b1a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,8 +6,9 @@ import subprocess import sys import textwrap +from collections.abc import AsyncIterator, Awaitable, Mapping from pathlib import Path -from typing import TYPE_CHECKING, AsyncIterator, Awaitable, Callable, Mapping, Protocol +from typing import TYPE_CHECKING, Callable, Protocol import pytest from apify_client import ApifyClientAsync @@ -15,8 +16,8 @@ from crawlee.storage_client_manager import StorageClientManager from filelock import FileLock +import apify.actor from ._utils import generate_unique_resource_name -from apify import Actor from apify.config import Configuration if TYPE_CHECKING: @@ -31,9 +32,9 @@ # We also patch the default storage client with a tmp_path @pytest.fixture(autouse=True) def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(Actor, '_default_instance', None) monkeypatch.setattr(Configuration, '_default_instance', None) monkeypatch.setattr(StorageClientManager, '_cloud_client', None) + apify.actor._default_instance = None # TODO StorageClientManager local client purge diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index d1937aca..3f05b9ba 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -375,12 +375,12 @@ async def main_server() -> None: async with Actor: class WebhookHandler(BaseHTTPRequestHandler): - def do_GET(self) -> None: # noqa: N802, ANN101 + def do_GET(self) -> None: # noqa: N802 
self.send_response(200) self.end_headers() self.wfile.write(bytes('Hello, world!', encoding='utf-8')) - def do_POST(self) -> None: # noqa: N802, ANN101 + def do_POST(self) -> None: # noqa: N802 nonlocal webhook_body content_length = self.headers.get('content-length') length = int(content_length) if content_length else 0 diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index 9e81aa43..46afa2ab 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -19,14 +19,14 @@ async def main() -> None: # Add some requests for i in range(desired_request_count): print(f'Adding request {i}...') - await rq.add_request({'url': f'https://example.com/{i}'}) + await rq.add_request(f'https://example.com/{i}') handled_request_count = 0 while next_request := await rq.fetch_next_request(): print('Fetching next request...') queue_operation_info = await rq.mark_request_as_handled(next_request) assert queue_operation_info is not None - assert queue_operation_info['wasAlreadyHandled'] is False + assert queue_operation_info.was_already_handled is False handled_request_count += 1 assert handled_request_count == desired_request_count diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index c03911df..d64d3e81 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -52,7 +52,7 @@ async def test_open_datatset_based_env_var( class TestActorPushData: async def test_push_data(self: TestActorPushData) -> None: - async with Actor() as my_actor: + async with Actor as my_actor: dataset = await my_actor.open_dataset() desired_item_count = 100 await dataset.push_data([{'id': i} for i in range(desired_item_count)]) @@ -61,5 +61,4 @@ async def test_push_data(self: TestActorPushData) -> None: assert dataset_info is not None list_page = await dataset.get_data(limit=desired_item_count) - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 + assert {item['id'] for item in list_page.items} == set(range(desired_item_count)) diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index a6e3d6fd..d1f46d35 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -2,10 +2,11 @@ import random import string -from datetime import datetime, timezone +from datetime import datetime, timedelta from typing import TYPE_CHECKING, Any from apify_shared.consts import BOOL_ENV_VARS, DATETIME_ENV_VARS, FLOAT_ENV_VARS, INTEGER_ENV_VARS, STRING_ENV_VARS, ActorEnvVars, ApifyEnvVars +from pydantic_core import TzInfo from apify import Actor @@ -14,12 +15,13 @@ class TestIsAtHome: - async def test_is_at_home_local(self: TestIsAtHome) -> None: + async def test_is_at_home_local(self) -> None: async with Actor as actor: is_at_home = actor.is_at_home() assert is_at_home is False - async def test_is_at_home_on_apify(self: TestIsAtHome, monkeypatch: pytest.MonkeyPatch) -> None: + async def test_is_at_home_on_apify(self, monkeypatch: pytest.MonkeyPatch) -> None: + print('setenv') monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true') async with Actor as actor: is_at_home = actor.is_at_home() @@ -27,39 +29,92 @@ async def test_is_at_home_on_apify(self: TestIsAtHome, monkeypatch: pytest.Monke class TestGetEnv: - async def test_get_env_use_env_vars(self: TestGetEnv, monkeypatch: pytest.MonkeyPatch) -> None: + async def test_get_env_use_env_vars(self, monkeypatch: 
pytest.MonkeyPatch) -> None: # noqa: PLR0912 + ignored_env_vars = { + ApifyEnvVars.INPUT_KEY, + ApifyEnvVars.MEMORY_MBYTES, + ApifyEnvVars.STARTED_AT, + ApifyEnvVars.TIMEOUT_AT, + ApifyEnvVars.DEFAULT_DATASET_ID, + ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID, + ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID, + ApifyEnvVars.SDK_LATEST_VERSION, + ApifyEnvVars.LOG_FORMAT, + } + + legacy_env_vars = { + ApifyEnvVars.ACT_ID: ActorEnvVars.ID, + ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID, + ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID, + ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID, + ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER, + ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID, + ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID, + ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL, + ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT, + } + # Set up random env vars expected_get_env: dict[str, Any] = {} for int_env_var in INTEGER_ENV_VARS: + if int_env_var in ignored_env_vars: + continue + int_get_env_var = int_env_var.name.lower() expected_get_env[int_get_env_var] = random.randint(1, 99999) monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}') for float_env_var in FLOAT_ENV_VARS: + if float_env_var in ignored_env_vars: + continue + float_get_env_var = float_env_var.name.lower() expected_get_env[float_get_env_var] = random.random() monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}') for bool_env_var in BOOL_ENV_VARS: + if bool_env_var in ignored_env_vars: + continue + bool_get_env_var = bool_env_var.name.lower() expected_get_env[bool_get_env_var] = random.choice([True, False]) monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}') for datetime_env_var in DATETIME_ENV_VARS: + if datetime_env_var in ignored_env_vars: + continue + datetime_get_env_var = datetime_env_var.name.lower() - expected_get_env[datetime_get_env_var] = datetime.now(timezone.utc) + expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0)) # type: ignore monkeypatch.setenv(datetime_env_var, expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ')) for string_env_var in STRING_ENV_VARS: + if string_env_var in ignored_env_vars: + continue + string_get_env_var = string_env_var.name.lower() expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10)) monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var]) # We need this override so that the actor doesn't fail when connecting to the platform events websocket monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL) + monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL) expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None + expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None + + # Adjust expectations for timedelta fields + for env_name, env_value in expected_get_env.items(): + if env_name.endswith('_millis'): + expected_get_env[env_name] = timedelta(milliseconds=env_value) + + # Convert dedicated_cpus to float + expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]) + + # Update expectations for legacy configuration + for old_name, new_name in legacy_env_vars.items(): + expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()] await Actor.init() - assert expected_get_env == Actor.get_env() + assert Actor.get_env() == expected_get_env await Actor.exit() diff --git 
a/tests/unit/actor/test_actor_helpers.py b/tests/unit/actor/test_actor_helpers.py index 92d0716d..34613f4f 100644 --- a/tests/unit/actor/test_actor_helpers.py +++ b/tests/unit/actor/test_actor_helpers.py @@ -5,7 +5,7 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars, WebhookEventType -from apify import Actor +from apify.actor import Actor, _ActorType if TYPE_CHECKING: import pytest @@ -17,7 +17,7 @@ class TestActorNewClient: async def test_actor_new_client_config(self: TestActorNewClient, monkeypatch: pytest.MonkeyPatch) -> None: token = 'my-token' monkeypatch.setenv(ApifyEnvVars.TOKEN, token) - my_actor = Actor() + my_actor = _ActorType() await my_actor.init() client = my_actor.new_client() @@ -39,71 +39,64 @@ async def test_actor_call( ) -> None: apify_client_async_patcher.patch('actor', 'call', return_value=None) actor_id = 'some-actor-id' - my_actor = Actor() - await my_actor.init() - await my_actor.call(actor_id) + async with Actor: + await Actor.call(actor_id) + assert len(apify_client_async_patcher.calls['actor']['call']) == 1 # The first argument is ActorClientAsync, which was called, let's check its id. assert apify_client_async_patcher.calls['actor']['call'][0][0][0].resource_id == actor_id - await my_actor.exit() - async def test_actor_call_task( self: TestActorCallStartAbortActor, apify_client_async_patcher: ApifyClientAsyncPatcher, ) -> None: apify_client_async_patcher.patch('task', 'call', return_value=None) task_id = 'some-task-id' - my_actor = Actor() - await my_actor.init() - await my_actor.call_task(task_id) + async with Actor: + await Actor.call_task(task_id) + assert len(apify_client_async_patcher.calls['task']['call']) == 1 assert apify_client_async_patcher.calls['task']['call'][0][0][0].resource_id == task_id - await my_actor.exit() - async def test_actor_start( self: TestActorCallStartAbortActor, apify_client_async_patcher: ApifyClientAsyncPatcher, ) -> None: apify_client_async_patcher.patch('actor', 'start', return_value=None) actor_id = 'some-id' - my_actor = Actor() - await my_actor.init() - await my_actor.start(actor_id) + async with Actor: + await Actor.start(actor_id) + assert len(apify_client_async_patcher.calls['actor']['start']) == 1 assert apify_client_async_patcher.calls['actor']['start'][0][0][0].resource_id == actor_id - await my_actor.exit() - async def test_actor_abort( self: TestActorCallStartAbortActor, apify_client_async_patcher: ApifyClientAsyncPatcher, ) -> None: apify_client_async_patcher.patch('run', 'abort', return_value=None) run_id = 'some-run-id' - my_actor = Actor() - await my_actor.init() - await my_actor.abort(run_id) + async with Actor: + await Actor.abort(run_id) + assert len(apify_client_async_patcher.calls['run']['abort']) == 1 assert apify_client_async_patcher.calls['run']['abort'][0][0][0].resource_id == run_id - await my_actor.exit() - class TestActorMethodsWorksOnlyOnPlatform: - # NOTE: These medhods will be tested properly using integrations tests. + # NOTE: These methods will be tested properly using integrations tests. async def test_actor_metamorpth_not_work_locally( self: TestActorMethodsWorksOnlyOnPlatform, caplog: pytest.LogCaptureFixture, ) -> None: - async with Actor() as my_actor: - await my_actor.metamorph('random-id') + async with Actor: + await Actor.metamorph('random-id') + assert len(caplog.records) == 1 assert caplog.records[0].levelname == 'ERROR' assert 'Actor.metamorph() is only supported when running on the Apify platform.' 
in caplog.records[0].message @@ -112,8 +105,9 @@ async def test_actor_reboot_not_work_locally( self: TestActorMethodsWorksOnlyOnPlatform, caplog: pytest.LogCaptureFixture, ) -> None: - async with Actor() as my_actor: - await my_actor.reboot() + async with Actor: + await Actor.reboot() + assert len(caplog.records) == 1 assert caplog.records[0].levelname == 'ERROR' assert 'Actor.reboot() is only supported when running on the Apify platform.' in caplog.records[0].message @@ -122,8 +116,9 @@ async def test_actor_add_webhook_not_work_locally( self: TestActorMethodsWorksOnlyOnPlatform, caplog: pytest.LogCaptureFixture, ) -> None: - async with Actor() as my_actor: - await my_actor.add_webhook(event_types=[WebhookEventType.ACTOR_BUILD_ABORTED], request_url='https://example.com') + async with Actor: + await Actor.add_webhook(event_types=[WebhookEventType.ACTOR_BUILD_ABORTED], request_url='https://example.com') + assert len(caplog.records) == 1 assert caplog.records[0].levelname == 'ERROR' assert 'Actor.add_webhook() is only supported when running on the Apify platform.' in caplog.records[0].message @@ -133,8 +128,9 @@ async def test_actor_set_status_message_mock_locally( caplog: pytest.LogCaptureFixture, ) -> None: caplog.set_level('INFO') - async with Actor() as my_actor: - await my_actor.set_status_message('test-status-message') + async with Actor: + await Actor.set_status_message('test-status-message') + matching_records = [record for record in caplog.records if 'test-status-message' in record.message] assert len(matching_records) == 1 assert matching_records[0].levelname == 'INFO' @@ -145,8 +141,9 @@ async def test_actor_set_status_message_terminal_mock_locally( caplog: pytest.LogCaptureFixture, ) -> None: caplog.set_level('INFO') - async with Actor() as my_actor: - await my_actor.fail(status_message='test-terminal-message') + async with Actor: + await Actor.fail(status_message='test-terminal-message') + matching_records = [record for record in caplog.records if 'test-terminal-message' in record.message] assert len(matching_records) == 1 assert matching_records[0].levelname == 'INFO' diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index e341318a..17955dcc 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -44,7 +44,7 @@ async def test_get_set_value(self: TestKeyValueStoreOnActor) -> None: test_key = 'test_key' test_value = 'test_value' test_content_type = 'text/plain' - async with Actor() as my_actor: + async with Actor as my_actor: await my_actor.set_value(key=test_key, value=test_value, content_type=test_content_type) value = await my_actor.get_value(key=test_key) assert value == test_value @@ -60,7 +60,7 @@ async def test_get_input(self: TestKeyValueStoreOnActor, memory_storage_client: content_type='application/json', ) - async with Actor() as my_actor: + async with Actor as my_actor: input = await my_actor.get_input() # noqa: A001 assert input['foo'] == test_input['foo'] @@ -87,7 +87,7 @@ async def test_get_input_with_secrets( content_type='application/json', ) - async with Actor() as my_actor: + async with Actor as my_actor: input = await my_actor.get_input() # noqa: A001 assert input['foo'] == input_with_secret['foo'] assert input['secret'] == secret_string diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index ef187f47..9983bd45 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ 
b/tests/unit/actor/test_actor_lifecycle.py @@ -2,24 +2,28 @@ import asyncio import contextlib -from datetime import datetime +import json from typing import Any, Callable from unittest.mock import AsyncMock import pytest -from apify_shared.consts import ActorEventTypes, ApifyEnvVars +import websockets.server +from apify_shared.consts import ApifyEnvVars +from crawlee.events.types import Event, EventPersistStateData -from apify import Actor +import apify.actor +from apify.actor import Actor, _ActorType class TestActorInit: async def test_async_with_actor_properly_initialize(self: TestActorInit) -> None: async with Actor: - assert Actor._get_default_instance()._is_initialized - assert Actor._get_default_instance()._is_initialized is False + assert apify.actor._default_instance is not None + assert apify.actor._default_instance._is_initialized + assert not apify.actor._default_instance._is_initialized async def test_actor_init(self: TestActorInit) -> None: - my_actor = Actor() + my_actor = _ActorType() await my_actor.init() assert my_actor._is_initialized is True @@ -28,7 +32,7 @@ async def test_actor_init(self: TestActorInit) -> None: assert my_actor._is_initialized is False async def test_double_init(self: TestActorInit) -> None: - my_actor = Actor() + my_actor = _ActorType() await my_actor.init() with pytest.raises(RuntimeError): @@ -48,20 +52,20 @@ async def test_with_actor_exit(self: TestActorExit, monkeypatch: pytest.MonkeyPa on_persist = [] on_system_info = [] - def on_event(event_type: ActorEventTypes) -> Callable: + def on_event(event_type: Event) -> Callable: nonlocal on_persist nonlocal on_system_info - if event_type == ActorEventTypes.PERSIST_STATE: + if event_type == Event.PERSIST_STATE: return lambda data: on_persist.append(data) - if event_type == ActorEventTypes.SYSTEM_INFO: + if event_type == Event.SYSTEM_INFO: return lambda data: on_system_info.append(data) return lambda data: print(data) - my_actor = Actor() + my_actor = _ActorType() async with my_actor: assert my_actor._is_initialized - my_actor.on(ActorEventTypes.PERSIST_STATE, on_event(ActorEventTypes.PERSIST_STATE)) - my_actor.on(ActorEventTypes.SYSTEM_INFO, on_event(ActorEventTypes.SYSTEM_INFO)) + my_actor.on(Event.PERSIST_STATE, on_event(Event.PERSIST_STATE)) + my_actor.on(Event.SYSTEM_INFO, on_event(Event.SYSTEM_INFO)) await asyncio.sleep(1) on_persist_count = len(on_persist) @@ -73,26 +77,28 @@ def on_event(event_type: ActorEventTypes) -> Callable: await asyncio.sleep(0.2) assert on_persist_count == len(on_persist) assert on_system_info_count == len(on_system_info) - # Check `createdAt` is a datetime (so it's the same locally and on platform) - assert isinstance(on_system_info[0]['createdAt'], datetime) - async def test_raise_on_exit_witout_init(self: TestActorExit) -> None: + async def test_raise_on_exit_without_init(self: TestActorExit) -> None: with pytest.raises(RuntimeError): await Actor.exit() class TestActorFail: async def test_with_actor_fail(self: TestActorFail) -> None: - async with Actor() as my_actor: + async with _ActorType() as my_actor: assert my_actor._is_initialized await my_actor.fail() assert my_actor._is_initialized is False async def test_with_actor_failed(self: TestActorFail) -> None: + my_actor = None + with contextlib.suppress(Exception): - async with Actor() as my_actor: + async with _ActorType() as my_actor: assert my_actor._is_initialized raise Exception('Failed') # noqa: TRY002 + + assert my_actor is not None assert my_actor._is_initialized is False async def 
test_raise_on_fail_without_init(self: TestActorFail) -> None: @@ -106,7 +112,7 @@ async def test_actor_reboot_not_work_locally(self: TestActorFail) -> None: class TestActorMainMethod: async def test_actor_main_method(self: TestActorMainMethod) -> None: - my_actor = Actor() + my_actor = _ActorType() main_was_called = False async def actor_function() -> None: @@ -119,7 +125,7 @@ async def actor_function() -> None: assert main_was_called async def test_actor_main_method_throw_exception(self: TestActorMainMethod) -> None: - my_actor = Actor() + my_actor = _ActorType() err = Exception('Failed') my_actor.fail = AsyncMock() # type: ignore @@ -135,7 +141,7 @@ async def actor_function() -> None: await my_actor.exit() async def test_actor_main_method_raise_return_value(self: TestActorMainMethod) -> None: - my_actor = Actor() + my_actor = _ActorType() expected_string = 'Hello world' async def actor_function() -> str: @@ -151,6 +157,7 @@ async def test_migrating_event(self: TestMigratingEvent, monkeypatch: pytest.Mon # This should test whether when you get a MIGRATING event, # the actor automatically emits the PERSIST_STATE event with data `{'isMigrating': True}` monkeypatch.setenv(ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS, '500') + monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, '1') persist_state_events_data = [] @@ -158,19 +165,38 @@ def log_persist_state(data: Any) -> None: nonlocal persist_state_events_data persist_state_events_data.append(data) - async with Actor: - Actor.on(ActorEventTypes.PERSIST_STATE, log_persist_state) - await asyncio.sleep(2) - Actor._get_default_instance()._event_manager.emit(ActorEventTypes.MIGRATING, None) - await asyncio.sleep(1) + async def handler(websocket: websockets.server.WebSocketServerProtocol) -> None: + await websocket.wait_closed() + + async with websockets.server.serve(handler, host='localhost') as ws_server: + port: int = ws_server.sockets[0].getsockname()[1] # type: ignore[index] + monkeypatch.setenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL, f'ws://localhost:{port}') + + async with Actor: + Actor.on(Event.PERSIST_STATE, log_persist_state) + await asyncio.sleep(2) + + for socket in ws_server.websockets: + await socket.send( + json.dumps( + { + 'name': 'migrating', + 'data': { + 'isMigrating': True, + }, + } + ) + ) + + await asyncio.sleep(1) assert len(persist_state_events_data) >= 3 print(persist_state_events_data) # Check if the last event is from the migration - assert persist_state_events_data.pop() == {'isMigrating': True} + assert persist_state_events_data.pop() == EventPersistStateData(is_migrating=True) # Check if all the other events are regular persist state events for event_data in persist_state_events_data: - assert event_data == {'isMigrating': False} + assert event_data == EventPersistStateData(is_migrating=False) diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index 3bfe153c..62c72368 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -15,8 +15,10 @@ class TestActorLog: - async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture) -> None: + async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch) -> None: caplog.set_level(logging.DEBUG, logger='apify') + monkeypatch.setenv('APIFY_IS_AT_HOME', '1') + with contextlib.suppress(RuntimeError): async with Actor: # Test Actor.log diff --git a/tests/unit/actor/test_actor_memory_storage_e2e.py b/tests/unit/actor/test_actor_memory_storage_e2e.py index 
31268bbc..330236b2 100644 --- a/tests/unit/actor/test_actor_memory_storage_e2e.py +++ b/tests/unit/actor/test_actor_memory_storage_e2e.py @@ -5,6 +5,7 @@ import pytest from apify_shared.consts import ApifyEnvVars +from crawlee.models import Request from crawlee.storage_client_manager import StorageClientManager from apify import Actor @@ -42,12 +43,13 @@ async def test_actor_memory_storage_client_key_value_store_e2e( non_default_kvs = await Actor.open_key_value_store(name='non-default') assert non_default_kvs is not old_non_default_kvs default_value = await default_kvs.get_value('test') - non_default_value = await non_default_kvs.get_value('test') + if purge_on_start: assert default_value is None else: assert default_value == 'default value' - assert non_default_value == 'non-default value' + + assert await non_default_kvs.get_value('test') == 'non-default value' @pytest.mark.parametrize('purge_on_start', [True, False]) @@ -69,11 +71,11 @@ async def test_actor_memory_storage_client_request_queue_e2e( forefront = i % 3 == 1 was_handled = i % 3 == 2 await default_queue.add_request( - { - 'uniqueKey': str(i), - 'url': request_url, - 'handledAt': datetime.now(timezone.utc) if was_handled else None, - }, + Request.from_url( + unique_key=str(i), + url=request_url, + handled_at=datetime.now(timezone.utc) if was_handled else None, + ), forefront=forefront, ) @@ -89,11 +91,11 @@ async def test_actor_memory_storage_client_request_queue_e2e( forefront = i % 3 == 1 was_handled = i % 3 == 2 await default_queue.add_request( - { - 'uniqueKey': str(i), - 'url': request_url, - 'handledAt': datetime.now(timezone.utc) if was_handled else None, - }, + Request.from_url( + unique_key=str(i), + url=request_url, + handled_at=datetime.now(timezone.utc) if was_handled else None, + ), forefront=forefront, ) @@ -102,29 +104,20 @@ async def test_actor_memory_storage_client_request_queue_e2e( # If the queue was purged between the runs, only the requests from the second run should be present, in the right order if purge_on_start: - assert queue_info.get('totalRequestCount') == 6 - assert queue_info.get('handledRequestCount') == 2 + assert queue_info.total_request_count == 6 + assert queue_info.handled_request_count == 2 expected_pending_request_order = [10, 7, 6, 9] - for request_number in expected_pending_request_order: - next_request = await default_queue.fetch_next_request() - assert next_request is not None - assert next_request.get('uniqueKey') == f'{request_number}' - assert next_request.get('url') == f'http://example.com/{request_number}' - - next_request = await default_queue.fetch_next_request() - assert next_request is None # If the queue was NOT purged between the runs, all the requests should be in the queue in the right order else: - assert queue_info.get('totalRequestCount') == 12 - assert queue_info.get('handledRequestCount') == 4 + assert queue_info.total_request_count == 12 + assert queue_info.handled_request_count == 4 expected_pending_request_order = [10, 7, 4, 1, 0, 3, 6, 9] - for request_number in expected_pending_request_order: - next_request = await default_queue.fetch_next_request() - assert next_request is not None - assert next_request.get('uniqueKey') == f'{request_number}' - assert next_request.get('url') == f'http://example.com/{request_number}' - - next_request = await default_queue.fetch_next_request() - assert next_request is None + + actual_requests = list[Request]() + while req := await default_queue.fetch_next_request(): + actual_requests.append(req) + + assert [int(req.unique_key) for 
req in actual_requests] == expected_pending_request_order + assert [req.url for req in actual_requests] == [f'http://example.com/{req.unique_key}' for req in actual_requests] diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 976c331e..6bc9b296 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -9,12 +9,11 @@ import pytest from apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars +from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient from crawlee.storage_client_manager import StorageClientManager -from crawlee.configuration import Configuration as CrawleeConfiguration -from apify import Actor -from apify.config import Configuration +import apify.actor if TYPE_CHECKING: from pathlib import Path @@ -23,9 +22,27 @@ @pytest.fixture() def reset_default_instances(monkeypatch: pytest.MonkeyPatch) -> Callable[[], None]: def reset() -> None: - monkeypatch.setattr(Actor, '_default_instance', None) - monkeypatch.setattr(Configuration, '_default_instance', None) + from crawlee.storages._creation_management import ( + _cache_dataset_by_id, + _cache_dataset_by_name, + _cache_kvs_by_id, + _cache_kvs_by_name, + _cache_rq_by_id, + _cache_rq_by_name, + ) + + _cache_dataset_by_id.clear() + _cache_dataset_by_name.clear() + _cache_kvs_by_id.clear() + _cache_kvs_by_name.clear() + _cache_rq_by_id.clear() + _cache_rq_by_name.clear() + + monkeypatch.setattr(CrawleeConfiguration, '_default_instance', None) monkeypatch.setattr(StorageClientManager, '_cloud_client', None) + monkeypatch.setattr(StorageClientManager, '_local_client', MemoryStorageClient()) + + apify.actor._default_instance = None # TODO StorageClientManager local client purge return reset @@ -35,11 +52,11 @@ def reset() -> None: # We also set the MemoryStorageClient to use a temp path @pytest.fixture(autouse=True) def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch, tmp_path: Path, reset_default_instances: Callable[[], None]) -> None: - reset_default_instances() - # This forces the MemoryStorageClient to use tmp_path for its storage dir monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) + reset_default_instances() + # This class is used to patch the ApifyClientAsync methods to return a fixed value or be replaced with another method. 
class ApifyClientAsyncPatcher: diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py deleted file mode 100644 index b7770e38..00000000 --- a/tests/unit/test_config.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -from apify_shared.consts import ActorEnvVars, ApifyEnvVars - -from apify.config import Configuration - -if TYPE_CHECKING: - import pytest - - -class TestConfiguration: - # Test that some config properties have some reasonable defaults - def test_configuration_defaults(self: TestConfiguration) -> None: - config = Configuration() - assert config.token is None - assert config.proxy_password is None - assert config.api_base_url == 'https://api.apify.com' - assert config.proxy_hostname == 'proxy.apify.com' - assert config.default_dataset_id == 'default' - assert config.default_key_value_store_id == 'default' - assert config.default_request_queue_id == 'default' - assert config.is_at_home is False - assert config.proxy_port == 8000 - assert config.memory_mbytes is None - assert config.started_at is None - - # Test that defining properties via env vars works - def test_configuration_from_env_vars(self: TestConfiguration, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv(ApifyEnvVars.TOKEN, 'DUMMY_TOKEN') - monkeypatch.setenv(ApifyEnvVars.PROXY_PASSWORD, 'DUMMY_PROXY_PASSWORD') - monkeypatch.setenv(ApifyEnvVars.API_BASE_URL, 'DUMMY_API_BASE_URL') - monkeypatch.setenv(ApifyEnvVars.PROXY_HOSTNAME, 'DUMMY_PROXY_HOSTNAME') - monkeypatch.setenv(ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID, 'DUMMY_DEFAULT_KEY_VALUE_STORE_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID, 'DUMMY_DEFAULT_REQUEST_QUEUE_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, 'DUMMY_DEFAULT_DATASET_ID') - monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, '1') - monkeypatch.setenv(ApifyEnvVars.PROXY_PORT, '1234') - monkeypatch.setenv(ActorEnvVars.MEMORY_MBYTES, '1024') - monkeypatch.setenv(ActorEnvVars.STARTED_AT, '2023-01-01T12:34:56.789Z') - - config = Configuration() - assert config.token == 'DUMMY_TOKEN' - assert config.proxy_password == 'DUMMY_PROXY_PASSWORD' - assert config.api_base_url == 'DUMMY_API_BASE_URL' - assert config.proxy_hostname == 'DUMMY_PROXY_HOSTNAME' - assert config.default_dataset_id == 'DUMMY_DEFAULT_DATASET_ID' - assert config.default_key_value_store_id == 'DUMMY_DEFAULT_KEY_VALUE_STORE_ID' - assert config.default_request_queue_id == 'DUMMY_DEFAULT_REQUEST_QUEUE_ID' - assert config.is_at_home is True - assert config.proxy_port == 1234 - assert config.memory_mbytes == 1024 - assert config.started_at == datetime(2023, 1, 1, 12, 34, 56, 789000, tzinfo=timezone.utc) - - # Test that constructor arguments take precedence over env vars - def test_configuration_from_constructor_arguments(self: TestConfiguration, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv(ApifyEnvVars.TOKEN, 'DUMMY_TOKEN') - monkeypatch.setenv(ApifyEnvVars.PROXY_PASSWORD, 'DUMMY_PROXY_PASSWORD') - monkeypatch.setenv(ApifyEnvVars.API_BASE_URL, 'DUMMY_API_BASE_URL') - monkeypatch.setenv(ApifyEnvVars.PROXY_HOSTNAME, 'DUMMY_PROXY_HOSTNAME') - monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, 'DUMMY_DEFAULT_DATASET_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID, 'DUMMY_DEFAULT_KEY_VALUE_STORE_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID, 'DUMMY_DEFAULT_REQUEST_QUEUE_ID') - monkeypatch.setenv(ApifyEnvVars.PROXY_PORT, '1234') - - config = 
Configuration( - token='TOKEN_FROM_CONSTRUCTOR', - proxy_password='PROXY_PASSWORD_FROM_CONSTRUCTOR', - proxy_hostname='PROXY_HOSTNAME_FROM_CONSTRUCTOR', - api_base_url='API_BASE_URL_FROM_CONSTRUCTOR', - default_dataset_id='DEFAULT_DATASET_ID_FROM_CONSTRUCTOR', - default_key_value_store_id='DEFAULT_KEY_VALUE_STORE_ID_FROM_CONSTRUCTOR', - default_request_queue_id='DEFAULT_REQUEST_QUEUE_ID_FROM_CONSTRUCTOR', - proxy_port=5678, - ) - - assert config.token == 'TOKEN_FROM_CONSTRUCTOR' - assert config.proxy_password == 'PROXY_PASSWORD_FROM_CONSTRUCTOR' - assert config.api_base_url == 'API_BASE_URL_FROM_CONSTRUCTOR' - assert config.proxy_hostname == 'PROXY_HOSTNAME_FROM_CONSTRUCTOR' - assert config.default_dataset_id == 'DEFAULT_DATASET_ID_FROM_CONSTRUCTOR' - assert config.default_key_value_store_id == 'DEFAULT_KEY_VALUE_STORE_ID_FROM_CONSTRUCTOR' - assert config.default_request_queue_id == 'DEFAULT_REQUEST_QUEUE_ID_FROM_CONSTRUCTOR' - assert config.proxy_port == 5678 diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 539bd472..ecc3a451 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -3,275 +3,115 @@ import asyncio import json import logging -import time from collections import defaultdict -from pprint import pprint from typing import Any, Callable +from unittest.mock import Mock import pytest import websockets import websockets.server -from apify_shared.consts import ActorEnvVars, ActorEventTypes +from apify_shared.consts import ActorEnvVars +from crawlee.events.types import Event, EventSystemInfoData from apify.config import Configuration -from apify.event_manager import EventManager +from apify.event_manager import EventManager, PlatformEventManager class TestEventManagerLocal: - async def test_lifecycle_local(self: TestEventManagerLocal, caplog: pytest.LogCaptureFixture) -> None: + async def test_lifecycle_local(self, caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.DEBUG, logger='apify') - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - assert event_manager._initialized is True + async with PlatformEventManager(Configuration.get_global_configuration()): + pass assert len(caplog.records) == 1 assert caplog.records[0].levelno == logging.DEBUG assert caplog.records[0].message == 'APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.' 
- with pytest.raises(RuntimeError, match='EventManager was already initialized!'): - await event_manager.init() - - await event_manager.close() - - with pytest.raises(RuntimeError, match='EventManager was not initialized!'): - await event_manager.close() - - assert event_manager._initialized is False - - async def test_event_handling_local(self: TestEventManagerLocal) -> None: - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - - event_calls = defaultdict(list) - - def on_event(event: ActorEventTypes, id: int | None = None) -> Callable: # noqa: A002 - def event_handler(data: Any) -> None: - nonlocal event_calls - event_calls[event].append((id, data)) - - return event_handler - - handler_system_info = on_event(ActorEventTypes.SYSTEM_INFO) - - # Basic test with just one handler on event - # Test adding the handler - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_system_info) - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.SYSTEM_INFO] == [(None, 'DUMMY_SYSTEM_INFO')] - event_calls[ActorEventTypes.SYSTEM_INFO].clear() - - # Test removing the handler - event_manager.off(ActorEventTypes.SYSTEM_INFO, handler_system_info) - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO_2') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.SYSTEM_INFO] == [] - - # Complicated test with multiple handlers - # Add three handlers - handler_persist_state_1 = on_event(ActorEventTypes.PERSIST_STATE, 1) - handler_persist_state_2 = on_event(ActorEventTypes.PERSIST_STATE, 2) - handler_persist_state_3 = on_event(ActorEventTypes.PERSIST_STATE, 3) - event_manager.on(ActorEventTypes.PERSIST_STATE, handler_persist_state_1) - event_manager.on(ActorEventTypes.PERSIST_STATE, handler_persist_state_2) - event_manager.on(ActorEventTypes.PERSIST_STATE, handler_persist_state_3) - - # Test that they all work, and that they're called in order - event_manager.emit(ActorEventTypes.PERSIST_STATE, 'DUMMY_PERSIST_STATE') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.PERSIST_STATE] == [ - (1, 'DUMMY_PERSIST_STATE'), - (2, 'DUMMY_PERSIST_STATE'), - (3, 'DUMMY_PERSIST_STATE'), - ] - event_calls[ActorEventTypes.PERSIST_STATE].clear() - - # Test that if you remove one, the others stay - event_manager.off(ActorEventTypes.PERSIST_STATE, handler_persist_state_3) - event_manager.emit(ActorEventTypes.PERSIST_STATE, 'DUMMY_PERSIST_STATE') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.PERSIST_STATE] == [ - (1, 'DUMMY_PERSIST_STATE'), - (2, 'DUMMY_PERSIST_STATE'), - ] - event_calls[ActorEventTypes.PERSIST_STATE].clear() - - # Test that removing all in bulk works - event_manager.off(ActorEventTypes.PERSIST_STATE) - event_manager.emit(ActorEventTypes.PERSIST_STATE, 'DUMMY_PERSIST_STATE') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.PERSIST_STATE] == [] - - await event_manager.close() - - async def test_event_handler_argument_counts_local(self: TestEventManagerLocal) -> None: - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - - event_calls = [] - - def sync_no_arguments() -> None: - nonlocal event_calls - event_calls.append(('sync_no_arguments', None)) - - async def async_no_arguments() -> None: - nonlocal event_calls - event_calls.append(('async_no_arguments', None)) - - def sync_one_argument(event_data: Any) -> None: - nonlocal event_calls - event_calls.append(('sync_one_argument', 
event_data)) - - async def async_one_argument(event_data: Any) -> None: - nonlocal event_calls - event_calls.append(('async_one_argument', event_data)) - - def sync_two_arguments(_arg1: Any, _arg2: Any) -> None: - pass - - async def async_two_arguments(_arg1: Any, _arg2: Any) -> None: - pass - - def sync_two_arguments_one_default(event_data: Any, _arg2: Any = 'default_value') -> None: - nonlocal event_calls - event_calls.append(('sync_two_arguments_one_default', event_data)) + async def test_event_handling_local(self) -> None: + async with EventManager() as event_manager: + event_calls = defaultdict(list) - async def async_two_arguments_one_default(event_data: Any, _arg2: Any = 'default_value') -> None: - nonlocal event_calls - event_calls.append(('async_two_arguments_one_default', event_data)) + def on_event(event: Event, id: int | None = None) -> Callable: # noqa: A002 + def event_handler(data: Any) -> None: + nonlocal event_calls + event_calls[event].append((id, data)) - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_no_arguments) - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_no_arguments) - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_one_argument) - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_one_argument) - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_two_arguments_one_default) - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_two_arguments_one_default) + return event_handler - # built-in functions should work too - event_manager.on(ActorEventTypes.SYSTEM_INFO, print) + handler_system_info = on_event(Event.SYSTEM_INFO) + dummy_system_info = Mock() + dummy_system_info_2 = Mock() - # functions from the standard library should work too - event_manager.on(ActorEventTypes.SYSTEM_INFO, pprint) - - with pytest.raises(ValueError, match='The "listener" argument must be a callable which accepts 0 or 1 arguments!'): - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_two_arguments) # type: ignore[arg-type] - with pytest.raises(ValueError, match='The "listener" argument must be a callable which accepts 0 or 1 arguments!'): - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_two_arguments) # type: ignore[arg-type] - - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - - assert len(event_calls) == 6 - assert ('sync_no_arguments', None) in event_calls - assert ('async_no_arguments', None) in event_calls - assert ('sync_one_argument', 'DUMMY_SYSTEM_INFO') in event_calls - assert ('async_one_argument', 'DUMMY_SYSTEM_INFO') in event_calls - assert ('sync_two_arguments_one_default', 'DUMMY_SYSTEM_INFO') in event_calls - assert ('async_two_arguments_one_default', 'DUMMY_SYSTEM_INFO') in event_calls - - async def test_event_async_handling_local(self: TestEventManagerLocal) -> None: - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - - event_calls = [] + # Basic test with just one handler on event + # Test adding the handler + event_manager.on(event=Event.SYSTEM_INFO, listener=handler_system_info) + event_manager.emit(event=Event.SYSTEM_INFO, event_data=dummy_system_info) + await asyncio.sleep(0.1) + assert event_calls[Event.SYSTEM_INFO] == [(None, dummy_system_info)] + event_calls[Event.SYSTEM_INFO].clear() - async def event_handler(data: Any) -> None: - nonlocal event_calls - await asyncio.sleep(2) - event_calls.append(data) + # Test removing the handler + event_manager.off(event=Event.SYSTEM_INFO, listener=handler_system_info) + event_manager.emit(event=Event.SYSTEM_INFO, 
event_data=dummy_system_info_2) + await asyncio.sleep(0.1) + assert event_calls[Event.SYSTEM_INFO] == [] - # Test that async event handlers work, and that they don't block the main thread - event_manager.on(ActorEventTypes.SYSTEM_INFO, event_handler) - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(1) - assert event_calls == [] - await asyncio.sleep(2) - assert event_calls == ['DUMMY_SYSTEM_INFO'] + # Complicated test with multiple handlers + # Add three handlers + handler_persist_state_1 = on_event(Event.PERSIST_STATE, 1) + handler_persist_state_2 = on_event(Event.PERSIST_STATE, 2) + handler_persist_state_3 = on_event(Event.PERSIST_STATE, 3) + event_manager.on(event=Event.PERSIST_STATE, listener=handler_persist_state_1) + event_manager.on(event=Event.PERSIST_STATE, listener=handler_persist_state_2) + event_manager.on(event=Event.PERSIST_STATE, listener=handler_persist_state_3) - await event_manager.close() + dummy_persist_state = Mock() - async def test_wait_for_all_listeners_to_complete( - self: TestEventManagerLocal, - caplog: pytest.LogCaptureFixture, - ) -> None: - config = Configuration() - event_manager = EventManager(config) + # Test that they all work, and that they're called in order + event_manager.emit(event=Event.PERSIST_STATE, event_data=dummy_persist_state) + await asyncio.sleep(0.1) + assert event_calls[Event.PERSIST_STATE] == [ + (1, dummy_persist_state), + (2, dummy_persist_state), + (3, dummy_persist_state), + ] + event_calls[Event.PERSIST_STATE].clear() + + # Test that if you remove one, the others stay + event_manager.off(event=Event.PERSIST_STATE, listener=handler_persist_state_3) + event_manager.emit(event=Event.PERSIST_STATE, event_data=dummy_persist_state) + await asyncio.sleep(0.1) + assert event_calls[Event.PERSIST_STATE] == [ + (1, dummy_persist_state), + (2, dummy_persist_state), + ] + event_calls[Event.PERSIST_STATE].clear() + + # Test that removing all in bulk works + event_manager.off(event=Event.PERSIST_STATE) + event_manager.emit(event=Event.PERSIST_STATE, event_data=dummy_persist_state) + await asyncio.sleep(0.1) + assert event_calls[Event.PERSIST_STATE] == [] - await event_manager.init() + async def test_event_async_handling_local(self) -> None: + dummy_system_info = Mock() - event_calls = [] + async with EventManager() as event_manager: + event_calls = [] - def on_event(sleep_secs: int | None = None) -> Callable: async def event_handler(data: Any) -> None: nonlocal event_calls - if sleep_secs: - await asyncio.sleep(sleep_secs) + await asyncio.sleep(2) event_calls.append(data) - return event_handler - - # Create three handlers, all with a different sleep time, and add them - handler_1 = on_event(1) - handler_2 = on_event(2) - handler_3 = on_event(3) - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_1) - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_2) - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_3) - - # Emit the event, record the emitting time - emmitted_at = time.perf_counter() - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - - # Wait for all of the handlers to finish and check that it took the right amount of time - await event_manager.wait_for_all_listeners_to_complete() - - duration = time.perf_counter() - emmitted_at - assert duration > 2.8 - assert duration < 4 - assert event_calls == ['DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO'] - event_calls.clear() - - # Emit the event again, record the emitting time - 
emmitted_at = time.perf_counter() - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - - # Wait for all of the handlers to finish and check that it took the right amount of time - # This time add a timeout so that only 1 handler should have time to finish - await event_manager.wait_for_all_listeners_to_complete(timeout_secs=1.5) - - duration = time.perf_counter() - emmitted_at - assert duration > 1.3 - assert duration < 2 - assert event_calls == ['DUMMY_SYSTEM_INFO'] - await asyncio.sleep(2) - assert event_calls == ['DUMMY_SYSTEM_INFO'] - event_calls.clear() - - assert caplog.records[0].levelno == logging.WARNING - assert caplog.records[0].message == 'Timed out waiting for event listeners to complete, unfinished event listeners will be canceled' - - # Emit the event again, test that closing the event manager waits for the handlers to complete - emmitted_at = time.perf_counter() - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - - await event_manager.close() - - duration = time.perf_counter() - emmitted_at - assert duration > 2.8 - assert duration < 4 - assert event_calls == ['DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO'] + # Test that async event handlers work, and that they don't block the main thread + event_manager.on(event=Event.SYSTEM_INFO, listener=event_handler) + event_manager.emit(event=Event.SYSTEM_INFO, event_data=dummy_system_info) + await asyncio.sleep(1) + assert event_calls == [] + await asyncio.sleep(2) + assert event_calls == [dummy_system_info] class TestEventManagerOnPlatform: @@ -280,14 +120,11 @@ async def test_lifecycle_on_platform_without_websocket( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, 'ws://localhost:56565') - - config = Configuration() - event_manager = EventManager(config) + event_manager = PlatformEventManager(Configuration.get_global_configuration()) with pytest.raises(RuntimeError, match='Error connecting to platform events websocket!'): - await event_manager.init() - - assert event_manager._initialized is False + async with event_manager: + pass async def test_lifecycle_on_platform(self: TestEventManagerOnPlatform, monkeypatch: pytest.MonkeyPatch) -> None: connected_ws_clients: set[websockets.server.WebSocketServerProtocol] = set() @@ -305,17 +142,8 @@ async def handler(websocket: websockets.server.WebSocketServerProtocol) -> None: port: int = ws_server.sockets[0].getsockname()[1] # type: ignore[index] monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, f'ws://localhost:{port}') - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - assert event_manager._initialized is True - - assert len(connected_ws_clients) == 1 - - await event_manager.close() - - assert event_manager._initialized is False + async with PlatformEventManager(Configuration.get_global_configuration()): + assert len(connected_ws_clients) == 1 async def test_event_handling_on_platform( self: TestEventManagerOnPlatform, @@ -330,8 +158,8 @@ async def handler(websocket: websockets.server.WebSocketServerProtocol) -> None: finally: connected_ws_clients.remove(websocket) - async def send_platform_event(event_name: ActorEventTypes, data: Any = None) -> None: - message: dict[str, Any] = {'name': event_name} + async def send_platform_event(event_name: Event, data: Any = None) -> None: + message: dict[str, Any] = {'name': event_name.value} if data: message['data'] = data @@ -343,23 +171,26 
@@ async def send_platform_event(event_name: ActorEventTypes, data: Any = None) -> port: int = ws_server.sockets[0].getsockname()[1] # type: ignore[index] monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, f'ws://localhost:{port}') - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - - event_calls = [] - event_manager.on(ActorEventTypes.SYSTEM_INFO, lambda data: event_calls.append(data)) - - # Test sending event with data - await send_platform_event(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - assert event_calls == ['DUMMY_SYSTEM_INFO'] - event_calls.clear() - - # Test sending event without data - await send_platform_event(ActorEventTypes.SYSTEM_INFO) - await asyncio.sleep(0.1) - assert event_calls == [None] - - await event_manager.close() + dummy_system_info = { + 'cpuInfo': {'usedRatio': 0.66, 'createdAt': '2024-04-04T12:44:00Z'}, + 'memoryInfo': { + 'currentSize': 11, + 'totalSize': 42, + 'createdAt': '2024-04-04T12:44:00Z', + }, + } + EventSystemInfoData.model_validate(dummy_system_info) + + async with PlatformEventManager(Configuration.get_global_configuration()) as event_manager: + event_calls = [] + + def listener(data: Any) -> None: + event_calls.append(json.loads(data.model_dump_json(by_alias=True)) if data else None) + + event_manager.on(event=Event.SYSTEM_INFO, listener=listener) + + # Test sending event with data + await send_platform_event(Event.SYSTEM_INFO, dummy_system_info) + await asyncio.sleep(0.1) + assert event_calls == [dummy_system_info] + event_calls.clear() diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index 03dc8b41..b13b3a55 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -1,8 +1,10 @@ +# ruff: noqa: ARG001 ARG005 from __future__ import annotations import asyncio import re -from typing import TYPE_CHECKING +from dataclasses import asdict +from typing import TYPE_CHECKING, Any import httpx import pytest @@ -63,17 +65,17 @@ def test__fails_with_invalid_arguments(self: TestProxyConfiguration) -> None: with pytest.raises(ValueError, match=re.escape(str(invalid_country_code))): ProxyConfiguration(country_code=invalid_country_code) # type: ignore - with pytest.raises(ValueError, match='Cannot combine custom proxies in "proxy_urls" with custom generating function in "new_url_function".'): - ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], new_url_function=lambda _: 'http://proxy.com:2222') + with pytest.raises(ValueError, match='Exactly one of .* must be specified'): + ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222') with pytest.raises(ValueError, match='Cannot combine custom proxies with Apify Proxy'): ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], groups=['GROUP1']) - with pytest.raises(ValueError, match=re.escape('proxy_urls[0] ("http://bad-url") is not a valid URL')): - ProxyConfiguration(proxy_urls=['http://bad-url']) + with pytest.raises(ValueError, match=re.escape('bad-url')): + ProxyConfiguration(proxy_urls=['bad-url']) with pytest.raises(ValueError, match='Cannot combine custom proxies with Apify Proxy'): - ProxyConfiguration(new_url_function=lambda _: 'http://proxy.com:2222', groups=['GROUP1']) + ProxyConfiguration(new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222', groups=['GROUP1']) class TestProxyConfigurationNewUrl: @@ -104,7 +106,7 @@ async def 
test_new_url_session_id(self: TestProxyConfigurationNewUrl) -> None: country_code=country_code, ) - session_ids: list[str | int] = [ + session_ids: list[str] = [ 'a', 'a_b', 'a_2', @@ -112,7 +114,7 @@ async def test_new_url_session_id(self: TestProxyConfigurationNewUrl) -> None: 'aaa~BBB', '1', '0.34252352', - 123456, + '123456', 'XXXXXXXXXXxxxxxxxxxxXXXXXXXXXXxxxxxxxxxxXXXXXXXXXX', ] for session_id in session_ids: @@ -171,7 +173,7 @@ async def test_custom_new_url_function(self: TestProxyConfigurationNewUrl) -> No 'http://proxy.com:6666', ] - def custom_new_url_function(_session_id: str | None) -> str: + def custom_new_url_function(session_id: str | None = None, request: Any = None) -> str: nonlocal custom_urls return custom_urls.pop() @@ -190,7 +192,7 @@ async def test_custom_new_url_function_async(self: TestProxyConfigurationNewUrl) 'http://proxy.com:6666', ] - async def custom_new_url_function(_session_id: str | None) -> str: + async def custom_new_url_function(session_id: str | None = None, request: Any = None) -> str: nonlocal custom_urls await asyncio.sleep(0.1) return custom_urls.pop() @@ -201,7 +203,7 @@ async def custom_new_url_function(_session_id: str | None) -> str: assert await proxy_configuration.new_url() == custom_url async def test_invalid_custom_new_url_function(self: TestProxyConfigurationNewUrl) -> None: - def custom_new_url_function(_session_id: str | None) -> str: + def custom_new_url_function(session_id: str | None = None, request: Any = None) -> str: raise ValueError proxy_configuration = ProxyConfiguration(new_url_function=custom_new_url_function) @@ -245,13 +247,15 @@ async def test_new_proxy_info_basic(self: TestProxyConfigurationNewProxyInfo) -> password=password, country_code=country_code, ) + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None expected_hostname = 'proxy.apify.com' expected_port = 8000 expected_username = f'groups-{"+".join(groups)},country-{country_code}' - assert proxy_info == { + assert asdict(proxy_info) == { 'url': f'http://{expected_username}:{password}@{expected_hostname}:{expected_port}', 'hostname': expected_hostname, 'port': expected_port, @@ -259,18 +263,37 @@ async def test_new_proxy_info_basic(self: TestProxyConfigurationNewProxyInfo) -> 'country_code': country_code, 'username': expected_username, 'password': password, + 'proxy_tier': None, + 'session_id': None, } async def test_new_proxy_info_rotates_urls(self: TestProxyConfigurationNewProxyInfo) -> None: proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[2] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[2] + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] + + proxy_info = await 
proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] async def test_new_proxy_info_rotates_urls_with_sessions(self: TestProxyConfigurationNewProxyInfo) -> None: sessions = ['sesssion_01', 'sesssion_02', 'sesssion_03', 'sesssion_04', 'sesssion_05', 'sesssion_06'] @@ -279,20 +302,47 @@ async def test_new_proxy_info_rotates_urls_with_sessions(self: TestProxyConfigur proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) # same session should use same proxy URL - assert (await proxy_configuration.new_proxy_info(sessions[0]))['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info(sessions[0]))['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info(sessions[0]))['url'] == proxy_urls[0] + proxy_info = await proxy_configuration.new_proxy_info(sessions[0]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[0]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[0]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] # different sessions should rotate different proxies - assert (await proxy_configuration.new_proxy_info(sessions[1]))['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info(sessions[2]))['url'] == proxy_urls[2] - assert (await proxy_configuration.new_proxy_info(sessions[3]))['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info(sessions[4]))['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info(sessions[5]))['url'] == proxy_urls[2] + proxy_info = await proxy_configuration.new_proxy_info(sessions[1]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[2]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[3]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[4]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[5]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] # already used sessions should be remembered - assert (await proxy_configuration.new_proxy_info(sessions[1]))['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info(sessions[3]))['url'] == proxy_urls[0] + proxy_info = await proxy_configuration.new_proxy_info(sessions[1]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[3]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] @pytest.fixture() From c565adbf068f4ff7fa5506dfbf7aaae7be0abe73 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 16 Jul 2024 15:44:34 +0200 Subject: [PATCH 24/68] Use crawlee 0.10 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7d8ac064..fdbd9a63 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", - "crawlee >= 0.0.5b8", + "crawlee >= 0.1.0", "cryptography >= 39.0.0", "httpx >= 0.24.0", "psutil >= 5.9.0", From 9743d4b5e116b1b288bef0cb0399ffa5cf74a149 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 16 Jul 2024 16:58:01 +0200 Subject: [PATCH 25/68] Use newer python in CI --- .github/workflows/check_version_availability.yaml | 2 +- .github/workflows/docs.yaml | 2 +- .github/workflows/integration_tests.yaml | 2 +- .github/workflows/lint_and_type_checks.yaml | 2 +- .github/workflows/unit_tests.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/check_version_availability.yaml b/.github/workflows/check_version_availability.yaml index 6a8caf80..977866f3 100644 --- a/.github/workflows/check_version_availability.yaml +++ b/.github/workflows/check_version_availability.yaml @@ -20,7 +20,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" - name: Install dependencies run: make install-dev diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index a11e2299..1c00e0a5 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -48,7 +48,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: 3.9 - name: Install Python dependencies run: make install-dev diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml index 1785ddcd..af2b7881 100644 --- a/.github/workflows/integration_tests.yaml +++ b/.github/workflows/integration_tests.yaml @@ -18,7 +18,7 @@ jobs: matrix: # Run integration tests only on the oldest and newest supported Python versions, # as these tests are time-consuming and these versions are the most likely to encounter issues. 
- python-version: ["3.8", "3.12"] + python-version: ["3.9", "3.12"] max-parallel: 1 # no concurrency on this level, to not overshoot the test user limits steps: diff --git a/.github/workflows/lint_and_type_checks.yaml b/.github/workflows/lint_and_type_checks.yaml index c55ed6d7..83ddd644 100644 --- a/.github/workflows/lint_and_type_checks.yaml +++ b/.github/workflows/lint_and_type_checks.yaml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: # We need to check out the head commit in case of PRs, diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml index 274bae9c..02828eed 100644 --- a/.github/workflows/unit_tests.yaml +++ b/.github/workflows/unit_tests.yaml @@ -9,7 +9,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, windows-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] runs-on: ${{ matrix.os }} steps: From 40577528bce246c131135b4a2327cf5fde193533 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 17 Jul 2024 13:10:37 +0200 Subject: [PATCH 26/68] Resolve lint errors --- pyproject.toml | 1 + src/apify/_crypto.py | 2 +- src/apify/actor.py | 10 +++++----- .../apify_storage_client/apify_storage_client.py | 4 ++-- src/apify/apify_storage_client/dataset_client.py | 4 +++- .../apify_storage_client/dataset_collection_client.py | 2 +- .../apify_storage_client/key_value_store_client.py | 11 +++++++++-- .../key_value_store_collection_client.py | 2 +- .../request_queue_collection_client.py | 4 ++-- tests/integration/conftest.py | 5 +++-- tests/unit/conftest.py | 2 +- tests/unit/test_event_manager.py | 2 +- 12 files changed, 30 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7752009c..21ef1a44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,7 @@ line-length = 150 [tool.ruff.lint] select = ["ALL"] ignore = [ + "A002", # Argument is shadowing a Python builtin "ANN101", # Missing type annotation for `self` in method "ANN102", # Missing type annotation for `cls` in method "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename} diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index 075247e3..830681b2 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -125,7 +125,7 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey: return public_key -def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input: Any) -> Any: # noqa: A002 +def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input: Any) -> Any: """Decrypt input secrets.""" if not isinstance(input, dict): return input diff --git a/src/apify/actor.py b/src/apify/actor.py index b6a2fc0d..a11f69f1 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -169,7 +169,7 @@ async def exit( self, *, exit_code: int = 0, - event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, # noqa: ARG002 status_message: str | None = None, cleanup_timeout: timedelta = timedelta(seconds=30), ) -> None: @@ -319,7 +319,7 @@ def new_client( async def open_dataset( self, *, - id: str | None = None, # noqa: A002 + id: str | None = None, name: str | None = None, force_cloud: bool = False, ) -> Dataset: @@ -352,7 +352,7 @@ async def open_dataset( async def open_key_value_store( self, *, - id: str | None = None, # noqa: A002 + id: str | None = None, name: str 
| None = None, force_cloud: bool = False, ) -> KeyValueStore: @@ -384,7 +384,7 @@ async def open_key_value_store( async def open_request_queue( self, *, - id: str | None = None, # noqa: A002 + id: str | None = None, name: str | None = None, force_cloud: bool = False, ) -> RequestQueue: @@ -788,7 +788,7 @@ async def metamorph( async def reboot( self, *, - event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, # noqa: ARG002 custom_after_sleep: timedelta | None = None, ) -> None: """Internally reboot this actor. diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/apify_storage_client.py index c00aed5c..885588ed 100644 --- a/src/apify/apify_storage_client/apify_storage_client.py +++ b/src/apify/apify_storage_client/apify_storage_client.py @@ -18,13 +18,13 @@ def __init__(self, *, configuration: Configuration) -> None: self._apify_client = ApifyClientAsync( token=configuration.token, api_url=configuration.api_base_url, - max_retries=8, # TODO + max_retries=8, min_delay_between_retries_millis=500, timeout_secs=360, ) @override - def dataset(self, id: str) -> DatasetClient: # noqa: A002 + def dataset(self, id: str) -> DatasetClient: return DatasetClient(self._apify_client.dataset(id)) @override diff --git a/src/apify/apify_storage_client/dataset_client.py b/src/apify/apify_storage_client/dataset_client.py index e2f5113a..6d817997 100644 --- a/src/apify/apify_storage_client/dataset_client.py +++ b/src/apify/apify_storage_client/dataset_client.py @@ -8,9 +8,11 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator + from contextlib import AbstractAsyncContextManager from apify_client.clients import DatasetClientAsync from crawlee.types import JSONSerializable + from httpx import Response class DatasetClient(BaseDatasetClient): @@ -157,7 +159,7 @@ async def stream_items( skip_hidden: bool = False, xml_root: str | None = None, xml_row: str | None = None, - ) -> AsyncIterator[dict]: # TODO incorrect type + ) -> AbstractAsyncContextManager[Response | None]: return self._client.stream_items( item_format=item_format, offset=offset, diff --git a/src/apify/apify_storage_client/dataset_collection_client.py b/src/apify/apify_storage_client/dataset_collection_client.py index 7bf3d200..7c0da6f5 100644 --- a/src/apify/apify_storage_client/dataset_collection_client.py +++ b/src/apify/apify_storage_client/dataset_collection_client.py @@ -20,7 +20,7 @@ def __init__(self, apify_dataset_collection_client: DatasetCollectionClientAsync async def get_or_create( self, *, - id: str | None = None, # noqa: A002 + id: str | None = None, name: str | None = None, schema: dict | None = None, ) -> DatasetMetadata: diff --git a/src/apify/apify_storage_client/key_value_store_client.py b/src/apify/apify_storage_client/key_value_store_client.py index 073412d9..6d94d661 100644 --- a/src/apify/apify_storage_client/key_value_store_client.py +++ b/src/apify/apify_storage_client/key_value_store_client.py @@ -1,5 +1,6 @@ from __future__ import annotations +from contextlib import asynccontextmanager from typing import TYPE_CHECKING, Any from crawlee.base_storage_client.base_key_value_store_client import BaseKeyValueStoreClient @@ -8,8 +9,10 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator + from contextlib import AbstractAsyncContextManager from apify_client.clients import KeyValueStoreClientAsync + from httpx import Response class KeyValueStoreClient(BaseKeyValueStoreClient): @@ 
-55,9 +58,13 @@ async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord | None: return KeyValueStoreRecord.model_validate(result) if result else None @override - async def stream_record(self, key: str) -> AsyncIterator[KeyValueStoreRecord | None]: # TODO incorrect type + async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]: + return self._stream_record_internal(key) + + @asynccontextmanager + async def _stream_record_internal(self, key: str) -> AsyncIterator[KeyValueStoreRecord[Response] | None]: async with self._client.stream_record(key) as response: - return KeyValueStoreRecord.model_validate(response) + yield KeyValueStoreRecord.model_validate(response) @override async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None: diff --git a/src/apify/apify_storage_client/key_value_store_collection_client.py b/src/apify/apify_storage_client/key_value_store_collection_client.py index 7b07019e..d0316d9d 100644 --- a/src/apify/apify_storage_client/key_value_store_collection_client.py +++ b/src/apify/apify_storage_client/key_value_store_collection_client.py @@ -20,7 +20,7 @@ def __init__(self, apify_dataset_collection_client: KeyValueStoreCollectionClien async def get_or_create( self, *, - id: str | None = None, # noqa: A002 + id: str | None = None, name: str | None = None, schema: dict | None = None, ) -> KeyValueStoreMetadata: diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/request_queue_collection_client.py index dad2f285..7b6adda4 100644 --- a/src/apify/apify_storage_client/request_queue_collection_client.py +++ b/src/apify/apify_storage_client/request_queue_collection_client.py @@ -20,9 +20,9 @@ def __init__(self, apify_request_queue_collection_client: RequestQueueCollection async def get_or_create( self, *, - id: str | None = None, # noqa: A002 + id: str | None = None, name: str | None = None, - schema: dict | None = None, # TODO unused + schema: dict | None = None, ) -> RequestQueueMetadata: return RequestQueueMetadata.model_validate( await self._client.get_or_create( diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 380a5b1a..93433db5 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,7 +6,6 @@ import subprocess import sys import textwrap -from collections.abc import AsyncIterator, Awaitable, Mapping from pathlib import Path from typing import TYPE_CHECKING, Callable, Protocol @@ -21,6 +20,8 @@ from apify.config import Configuration if TYPE_CHECKING: + from collections.abc import AsyncIterator, Awaitable, Mapping + from apify_client.clients.resource_clients import ActorClientAsync TOKEN_ENV_VAR = 'APIFY_TEST_USER_API_TOKEN' @@ -35,7 +36,7 @@ def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(Configuration, '_default_instance', None) monkeypatch.setattr(StorageClientManager, '_cloud_client', None) apify.actor._default_instance = None - # TODO StorageClientManager local client purge + # TODO: StorageClientManager local client purge # noqa: TD003 # This fixture can't be session-scoped, diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 6bc9b296..2b6b9d8c 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -43,7 +43,7 @@ def reset() -> None: monkeypatch.setattr(StorageClientManager, '_local_client', MemoryStorageClient()) apify.actor._default_instance = None - # TODO StorageClientManager 
local client purge + # TODO: StorageClientManager local client purge # noqa: TD003 return reset diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index ecc3a451..102e0e91 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -32,7 +32,7 @@ async def test_event_handling_local(self) -> None: async with EventManager() as event_manager: event_calls = defaultdict(list) - def on_event(event: Event, id: int | None = None) -> Callable: # noqa: A002 + def on_event(event: Event, id: int | None = None) -> Callable: def event_handler(data: Any) -> None: nonlocal event_calls event_calls[event].append((id, data)) From 6b9c93febbeb8ba35a5760f2cd3ac18366712799 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 17 Jul 2024 13:17:49 +0200 Subject: [PATCH 27/68] mypy: ignore assignment to method --- src/apify/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apify/config.py b/src/apify/config.py index 21936e22..a7f7498c 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -186,4 +186,4 @@ def get_global_configuration(cls) -> Self: # Monkey-patch the base class so that it works with the extended configuration -CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration +CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # mypy: ignore From 6b45143a897ee80d05fe12d4b007c4d501d49b57 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 6 Aug 2024 16:10:50 +0200 Subject: [PATCH 28/68] Ignore untyped imports from scrapy --- pyproject.toml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 21ef1a44..49d6600e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -184,8 +184,6 @@ warn_return_any = true warn_unreachable = true warn_unused_ignores = true -[tool.mypy-scrapy] -ignore_missing_imports = true - -[tool.mypy-sortedcollections] +[[tool.mypy.overrides]] +module = ['scrapy', 'scrapy.*', 'sortedcollections'] ignore_missing_imports = true From ea7b4184e7b3723876911421e7b6acb9133107dd Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 7 Aug 2024 11:26:43 +0200 Subject: [PATCH 29/68] Fix scrapy integration --- src/apify/config.py | 2 +- src/apify/scrapy/middlewares/apify_proxy.py | 2 +- src/apify/scrapy/requests.py | 98 ++++++------ src/apify/scrapy/scheduler.py | 9 +- src/apify/scrapy/utils.py | 17 +-- .../scrapy/requests/test_to_apify_request.py | 22 +-- .../scrapy/requests/test_to_scrapy_request.py | 143 +++++++----------- 7 files changed, 123 insertions(+), 170 deletions(-) diff --git a/src/apify/config.py b/src/apify/config.py index a7f7498c..da03e766 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -186,4 +186,4 @@ def get_global_configuration(cls) -> Self: # Monkey-patch the base class so that it works with the extended configuration -CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # mypy: ignore +CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index c2aeca65..4ab27166 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -144,4 +144,4 @@ async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult: # Get a new proxy URL and return it new_url = await proxy_cfg.new_url() - return urlparse(new_url) + return 
urlparse(str(new_url)) diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index dd527497..1942d4a5 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -2,6 +2,7 @@ import codecs import pickle +from typing import Any, cast try: from scrapy import Request, Spider @@ -13,7 +14,8 @@ ) from exc from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.requests import compute_unique_key +from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id +from crawlee.models import Request as CrawleeRequest from apify.actor import Actor @@ -26,7 +28,7 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times')) -def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: +def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None: """Convert a Scrapy request to an Apify request. Args: @@ -36,7 +38,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: Returns: The converted Apify request if the conversion was successful, otherwise None. """ - if not isinstance(scrapy_request, Request): + if not isinstance(cast(Any, scrapy_request), Request): Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.') return None @@ -44,39 +46,39 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...') try: - apify_request = { - 'url': scrapy_request.url, - 'method': scrapy_request.method, - 'payload': scrapy_request.body, - 'userData': scrapy_request.meta.get('userData', {}), - } - - # Convert Scrapy's headers to a dictionary and store them in the apify_request - if isinstance(scrapy_request.headers, Headers): - apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict()) - else: - Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}') - - # If the request was produced by the middleware (e.g. 
retry or redirect), we must compute the unique key here if _is_request_produced_by_middleware(scrapy_request): - apify_request['uniqueKey'] = compute_unique_key( + unique_key = compute_unique_key( url=scrapy_request.url, method=scrapy_request.method, payload=scrapy_request.body, use_extended_unique_key=True, ) - # Othwerwise, we can use the unique key (also the id) from the meta + elif scrapy_request.dont_filter: + unique_key = crypto_random_object_id(8) + elif scrapy_request.meta.get('apify_request_unique_key'): + unique_key = scrapy_request.meta['apify_request_unique_key'] else: - if scrapy_request.meta.get('apify_request_id'): - apify_request['id'] = scrapy_request.meta['apify_request_id'] + unique_key = crypto_random_object_id(8) - if scrapy_request.meta.get('apify_request_unique_key'): - apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key'] + if scrapy_request.meta.get('apify_request_id'): # noqa: SIM108 + request_id = scrapy_request.meta['apify_request_id'] + else: + request_id = unique_key_to_request_id(unique_key) + + apify_request = CrawleeRequest( + url=scrapy_request.url, + method=scrapy_request.method, + payload=scrapy_request.body, + user_data=scrapy_request.meta.get('userData', {}), + unique_key=unique_key, + id=request_id, + ) - # If the request's dont_filter field is set, we must generate a random `uniqueKey` to avoid deduplication - # of the request in the Request Queue. - if scrapy_request.dont_filter: - apify_request['uniqueKey'] = crypto_random_object_id(8) + # Convert Scrapy's headers to a dictionary and store them in the apify_request + if isinstance(scrapy_request.headers, Headers): + apify_request.headers = dict(scrapy_request.headers.to_unicode_dict()) + else: + Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}') # Serialize the Scrapy Request and store it in the apify_request. # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64, @@ -84,7 +86,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/. scrapy_request_dict = scrapy_request.to_dict(spider=spider) scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode() - apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded + apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded except Exception as exc: Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}') @@ -94,7 +96,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: return apify_request -def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: +def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: """Convert an Apify request to a Scrapy request. Args: @@ -102,32 +104,26 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: spider: The Scrapy spider that the request is associated with. Raises: - TypeError: If the apify_request is not a dictionary. + TypeError: If the apify_request is not a crawlee request. ValueError: If the apify_request does not contain the required keys. Returns: The converted Scrapy request. 
""" - if not isinstance(apify_request, dict): - raise TypeError('apify_request must be a dictionary') - - required_keys = ['url', 'method', 'id', 'uniqueKey'] - missing_keys = [key for key in required_keys if key not in apify_request] - - if missing_keys: - raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)') + if not isinstance(cast(Any, apify_request), CrawleeRequest): + raise TypeError('apify_request must be a crawlee.models.Request instance') call_id = crypto_random_object_id(8) Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...') # If the apify_request comes from the Scrapy - if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']: + if 'scrapy_request' in apify_request.user_data: # Deserialize the Scrapy Request from the apify_request. # - This process involves decoding the base64-encoded request data and reconstructing # the Scrapy Request object from its dictionary representation. Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...') - scrapy_request_dict_encoded = apify_request['userData']['scrapy_request'] + scrapy_request_dict_encoded = apify_request.user_data['scrapy_request'] if not isinstance(scrapy_request_dict_encoded, str): raise TypeError('scrapy_request_dict_encoded must be a string') @@ -143,7 +139,7 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: # Update the meta field with the meta field from the apify_request meta = scrapy_request.meta or {} - meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']}) + meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key}) scrapy_request._meta = meta # scrapy_request.meta is a property, so we have to set it like this # If the apify_request comes directly from the Request Queue, typically start URLs @@ -151,26 +147,26 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)') scrapy_request = Request( - url=apify_request['url'], - method=apify_request['method'], + url=apify_request.url, + method=apify_request.method, meta={ - 'apify_request_id': apify_request['id'], - 'apify_request_unique_key': apify_request['uniqueKey'], + 'apify_request_id': apify_request.id, + 'apify_request_unique_key': apify_request.unique_key, }, ) # Add optional 'headers' field - if 'headers' in apify_request: - if isinstance(apify_request['headers'], dict): - scrapy_request.headers = Headers(apify_request['headers']) + if apify_request.headers: + if isinstance(cast(Any, apify_request.headers), dict): + scrapy_request.headers = Headers(apify_request.headers) else: Actor.log.warning( - f'apify_request[headers] is not an instance of the dict class, apify_request[headers] = {apify_request["headers"]}', + f'apify_request[headers] is not an instance of the dict class, apify_request[headers] = {apify_request.headers}', ) # Add optional 'userData' field - if 'userData' in apify_request: - scrapy_request.meta['userData'] = apify_request['userData'] + if apify_request.user_data: + scrapy_request.meta['userData'] = apify_request.user_data Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}') return scrapy_request diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 03e8b78c..1e3c8323 100644 --- a/src/apify/scrapy/scheduler.py 
+++ b/src/apify/scrapy/scheduler.py @@ -96,18 +96,13 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool: raise TypeError('self._rq must be an instance of the RequestQueue class') try: - result = nested_event_loop.run_until_complete( - self._rq.add_request( - apify_request, - use_extended_unique_key=True, - ) - ) + result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request)) except BaseException: traceback.print_exc() raise Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...') - return bool(result['wasAlreadyPresent']) + return bool(result.was_already_present) def next_request(self: ApifyScheduler) -> Request | None: """Fetch the next request from the scheduler. diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index 75545dde..b1658b65 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -17,10 +17,11 @@ from crawlee.storage_client_manager import StorageClientManager -from apify.actor import Actor +from apify import Actor, Configuration +from apify.apify_storage_client.apify_storage_client import ApifyStorageClient if TYPE_CHECKING: - from apify.storages import RequestQueue + from crawlee.storages import RequestQueue nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop() @@ -86,10 +87,10 @@ async def open_queue_with_custom_client() -> RequestQueue: we don't have to do this hacky workaround """ # Create a new Apify Client with its httpx client in the custom event loop - custom_loop_apify_client = Actor.new_client() + custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration()) # Set the new Apify Client as the default client, back up the old client - old_client = Actor.apify_client + old_client = StorageClientManager._cloud_client StorageClientManager.set_cloud_client(custom_loop_apify_client) # Create a new Request Queue in the custom event loop, @@ -97,11 +98,9 @@ async def open_queue_with_custom_client() -> RequestQueue: rq = await Actor.open_request_queue() if Actor.config.is_at_home: - rq._request_queue_client = custom_loop_apify_client.request_queue( - rq._id, - client_key=rq._client_key, - ) + rq._resource_client = custom_loop_apify_client.request_queue(rq._id) # Restore the old Apify Client as the default client - StorageClientManager.set_cloud_client(old_client) + if old_client: + StorageClientManager.set_cloud_client(old_client) return rq diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py index ac483e76..0116f5ec 100644 --- a/tests/unit/scrapy/requests/test_to_apify_request.py +++ b/tests/unit/scrapy/requests/test_to_apify_request.py @@ -22,9 +22,9 @@ def test__to_apify_request__simple(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request.get('url') == 'https://example.com' + assert apify_request.url == 'https://example.com' - user_data = apify_request.get('userData', {}) + user_data = apify_request.user_data assert isinstance(user_data, dict) assert 'scrapy_request' in user_data assert isinstance(user_data.get('scrapy_request'), str) @@ -37,7 +37,7 @@ def test__to_apify_request__headers(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request['headers'] == dict(scrapy_request_headers.to_unicode_dict()) + assert apify_request.headers == dict(scrapy_request_headers.to_unicode_dict()) def 
test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None: @@ -50,10 +50,10 @@ def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request.get('url') == 'https://example.com' - assert apify_request.get('method') == 'GET' + assert apify_request.url == 'https://example.com' + assert apify_request.method == 'GET' - user_data = apify_request.get('userData', {}) + user_data = apify_request.user_data assert isinstance(user_data, dict) assert user_data['some_user_data'] == 'test' @@ -75,12 +75,12 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request.get('url') == 'https://example.com' - assert apify_request.get('method') == 'GET' - assert apify_request.get('id') == 'abc123' - assert apify_request.get('uniqueKey') == 'https://example.com' + assert apify_request.url == 'https://example.com' + assert apify_request.method == 'GET' + assert apify_request.id == 'abc123' + assert apify_request.unique_key == 'https://example.com' - user_data = apify_request.get('userData', {}) + user_data = apify_request.user_data assert isinstance(user_data, dict) assert user_data['some_user_data'] == 'hello' diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index ebd294e4..3624eefe 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -3,6 +3,7 @@ import binascii import pytest +from crawlee.models import Request as CrawleeRequest from scrapy import Request, Spider from scrapy.http.headers import Headers @@ -21,134 +22,96 @@ def spider() -> DummySpider: def test__to_scrapy_request__without_reconstruction(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'uniqueKey': 'https://example.com', - 'id': 'fvwscO2UJLdr10B', - } + apify_request = CrawleeRequest(url='https://example.com', method='GET', unique_key='https://example.com', id='fvwscO2UJLdr10B', user_data={}) scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://crawlee.dev', - 'method': 'GET', - 'uniqueKey': 'https://crawlee.dev', - 'id': 'fvwscO2UJLdr10B', - 'headers': {'Authorization': 'Bearer access_token'}, - 'userData': {'some_user_data': 'test'}, - } + apify_request = CrawleeRequest( + url='https://crawlee.dev', + method='GET', + unique_key='https://crawlee.dev', + id='fvwscO2UJLdr10B', + headers={'Authorization': 'Bearer access_token'}, + user_data={'some_user_data': 'test'}, + ) 
scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') - assert Headers(apify_request['headers']) == scrapy_request.headers - assert apify_request['userData'] == scrapy_request.meta.get('userData') + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') + assert Headers(apify_request.headers) == scrapy_request.headers + assert apify_request.user_data == scrapy_request.meta.get('userData') def test__to_scrapy_request__with_reconstruction(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://apify.com', - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - 'uniqueKey': 'https://apify.com', - 'userData': { + apify_request = CrawleeRequest( + url='https://apify.com', + method='GET', + id='fvwscO2UJLdr10B', + unique_key='https://apify.com', + user_data={ 'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501 }, - } + ) scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') - assert apify_request['userData'] == scrapy_request.meta.get('userData') + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') + assert apify_request.user_data == scrapy_request.meta.get('userData') def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://apify.com', - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - 'uniqueKey': 'https://apify.com', - 'headers': {'Authorization': 'Bearer access_token'}, - 'userData': { + apify_request = CrawleeRequest( + url='https://apify.com', + method='GET', + id='fvwscO2UJLdr10B', + unique_key='https://apify.com', + headers={'Authorization': 'Bearer access_token'}, + user_data={ 'some_user_data': 'hello', 'scrapy_request': 
'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501 }, - } + ) scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') - assert Headers(apify_request['headers']) == scrapy_request.headers - assert apify_request['userData'] == scrapy_request.meta.get('userData') - - -def test__to_scrapy_request__invalid_missing_url(spider: Spider) -> None: - apify_request = { - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - 'uniqueKey': 'https://example.com', - } - - with pytest.raises(ValueError): - to_scrapy_request(apify_request, spider) - - -def test__to_scrapy_request__invalid_missing_id(spider: Spider) -> None: - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'uniqueKey': 'https://example.com', - } - - with pytest.raises(ValueError): - to_scrapy_request(apify_request, spider) - - -def test__to_scrapy_request__invalid_missing_unique_key(spider: Spider) -> None: - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - } - - with pytest.raises(ValueError): - to_scrapy_request(apify_request, spider) + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') + assert Headers(apify_request.headers) == scrapy_request.headers + assert apify_request.user_data == scrapy_request.meta.get('userData') def test__to_scrapy_request__invalid_request_for_reconstruction(spider: Spider) -> None: - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'id': 'invalid123', - 'uniqueKey': 'https://example.com', - 'userData': { + apify_request = CrawleeRequest( + url='https://example.com', + method='GET', + id='invalid123', + unique_key='https://example.com', + user_data={ 'scrapy_request': 'this is not a correctly encoded Scrapy request', }, - } + ) with pytest.raises(binascii.Error): to_scrapy_request(apify_request, spider) From e228a44ecc1b79351016f4e407fb45888f944d83 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 7 Aug 2024 21:38:00 +0200 Subject: [PATCH 30/68] Fix type errors in integration tests --- .../apify_storage_client/apify_storage_client.py | 3 ++- .../apify_storage_client/key_value_store_client.py | 13 ++++++++++++- tests/integration/actor_source_base/src/main.py | 2 +- tests/integration/test_actor_events.py | 11 ++++++----- tests/integration/test_actor_key_value_store.py | 5 +++-- 
tests/integration/test_actor_lifecycle.py | 7 ++++--- tests/integration/test_actor_request_queue.py | 5 +++-- 7 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/apify_storage_client.py index 885588ed..5aba8057 100644 --- a/src/apify/apify_storage_client/apify_storage_client.py +++ b/src/apify/apify_storage_client/apify_storage_client.py @@ -22,6 +22,7 @@ def __init__(self, *, configuration: Configuration) -> None: min_delay_between_retries_millis=500, timeout_secs=360, ) + self._configuration = configuration @override def dataset(self, id: str) -> DatasetClient: @@ -33,7 +34,7 @@ def datasets(self) -> DatasetCollectionClient: @override def key_value_store(self, id: str) -> KeyValueStoreClient: - return KeyValueStoreClient(self._apify_client.key_value_store(id)) + return KeyValueStoreClient(self._apify_client.key_value_store(id), self._configuration.api_public_base_url) @override def key_value_stores(self) -> KeyValueStoreCollectionClient: diff --git a/src/apify/apify_storage_client/key_value_store_client.py b/src/apify/apify_storage_client/key_value_store_client.py index 6d94d661..6e4f4721 100644 --- a/src/apify/apify_storage_client/key_value_store_client.py +++ b/src/apify/apify_storage_client/key_value_store_client.py @@ -18,8 +18,9 @@ class KeyValueStoreClient(BaseKeyValueStoreClient): """Key-value store resource client implementation based on the Apify platform storage.""" - def __init__(self, apify_key_value_store_client: KeyValueStoreClientAsync) -> None: + def __init__(self, apify_key_value_store_client: KeyValueStoreClientAsync, api_public_base_url: str) -> None: self._client = apify_key_value_store_client + self._api_public_base_url = api_public_base_url @override async def get(self) -> KeyValueStoreMetadata | None: @@ -79,3 +80,13 @@ async def delete_record(self, key: str) -> None: await self._client.delete_record( key=key, ) + + async def get_public_url(self, key: str) -> str: + """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. + + Args: + key (str): The key for which the URL should be generated. 
+ """ + public_api_url = self._api_public_base_url + + return f'{public_api_url}/v2/key-value-stores/{self._client.resource_id}/records/{key}' diff --git a/tests/integration/actor_source_base/src/main.py b/tests/integration/actor_source_base/src/main.py index 78c03a48..678334bd 100644 --- a/tests/integration/actor_source_base/src/main.py +++ b/tests/integration/actor_source_base/src/main.py @@ -4,5 +4,5 @@ async def main() -> None: - async with Actor(): + async with Actor: raise RuntimeError('You need to override the `main.py` file in the integration test!') diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index b4436b7e..eec543f0 100644 --- a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING from apify_shared.consts import ActorEventTypes +from crawlee.events.types import Event from apify import Actor @@ -38,8 +39,8 @@ async def log_event(data: Any) -> None: return log_event async with Actor: - Actor.on(ActorEventTypes.SYSTEM_INFO, on_event(ActorEventTypes.SYSTEM_INFO)) - Actor.on(ActorEventTypes.PERSIST_STATE, on_event(ActorEventTypes.PERSIST_STATE)) + Actor.on(Event.SYSTEM_INFO, on_event(ActorEventTypes.SYSTEM_INFO)) + Actor.on(Event.PERSIST_STATE, on_event(ActorEventTypes.PERSIST_STATE)) await asyncio.sleep(3) # The SYSTEM_INFO event sometimes takes a while to appear, let's wait for it for a while longer @@ -68,7 +69,7 @@ async def test_off_event(self: TestActorEvents, make_actor: ActorFactory) -> Non async def main() -> None: import os - from apify_shared.consts import ActorEventTypes, ApifyEnvVars + from apify_shared.consts import ApifyEnvVars os.environ[ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS] = '100' @@ -80,11 +81,11 @@ def count_event(data): # type: ignore # noqa: ANN202, ANN001 counter += 1 async with Actor: - Actor.on(ActorEventTypes.PERSIST_STATE, count_event) + Actor.on(Event.PERSIST_STATE, count_event) await asyncio.sleep(0.5) assert counter > 1 last_count = counter - Actor.off(ActorEventTypes.PERSIST_STATE, count_event) + Actor.off(Event.PERSIST_STATE, count_event) await asyncio.sleep(0.5) assert counter == last_count diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 3bd6df62..4d306d55 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -1,11 +1,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast from apify_shared.consts import ApifyEnvVars from ._utils import generate_unique_resource_name from apify import Actor +from apify.apify_storage_client.key_value_store_client import KeyValueStoreClient if TYPE_CHECKING: import pytest @@ -191,7 +192,7 @@ async def main() -> None: default_store_id = Actor.config.default_key_value_store_id store = await Actor.open_key_value_store() - record_url = await store.get_public_url('dummy') + record_url = await cast(KeyValueStoreClient, store._resource_client).get_public_url('dummy') print(record_url) assert record_url == f'{public_api_url}/v2/key-value-stores/{default_store_id}/records/dummy' diff --git a/tests/integration/test_actor_lifecycle.py b/tests/integration/test_actor_lifecycle.py index ae517f90..8af6a92a 100644 --- a/tests/integration/test_actor_lifecycle.py +++ b/tests/integration/test_actor_lifecycle.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +import apify.actor from apify import Actor if TYPE_CHECKING: 
@@ -11,7 +12,7 @@ class TestActorInit: async def test_actor_init(self: TestActorInit, make_actor: ActorFactory) -> None: async def main() -> None: - my_actor = Actor() + my_actor = Actor await my_actor.init() assert my_actor._is_initialized is True double_init = False @@ -43,8 +44,8 @@ async def main() -> None: async def test_async_with_actor_properly_initialize(self: TestActorInit, make_actor: ActorFactory) -> None: async def main() -> None: async with Actor: - assert Actor._get_default_instance()._is_initialized - assert Actor._get_default_instance()._is_initialized is False + assert apify.actor._get_default_instance()._is_initialized + assert apify.actor._get_default_instance()._is_initialized is False actor = await make_actor('with-actor-init', main_func=main) diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 076115e3..6abd611b 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from apify_shared.consts import ApifyEnvVars +from crawlee.models import Request from ._utils import generate_unique_resource_name from apify import Actor @@ -66,7 +67,7 @@ async def test_force_cloud( request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) request_queue_id = request_queue._id - request_info = await request_queue.add_request({'url': 'http://example.com'}) + request_info = await request_queue.add_request(Request.from_url('http://example.com')) request_queue_client = apify_client_async.request_queue(request_queue_id) @@ -75,7 +76,7 @@ async def test_force_cloud( assert request_queue_details is not None assert request_queue_details.get('name') == request_queue_name - request_queue_request = await request_queue_client.get_request(request_info['requestId']) + request_queue_request = await request_queue_client.get_request(request_info.id) assert request_queue_request is not None assert request_queue_request['url'] == 'http://example.com' finally: From 087fb121efcc7c90af37ca8ca3a44a1116f7ee53 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 7 Aug 2024 21:42:13 +0200 Subject: [PATCH 31/68] Increase crawlee dependency version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 49d6600e..e97faf33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", - "crawlee >= 0.1.0", + "crawlee >= 0.2.0", "cryptography >= 39.0.0", "httpx >= 0.24.0", "psutil >= 5.9.0", From dbacef778a853926bd4c52a62c2dfad1ccab8467 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 7 Aug 2024 22:11:00 +0200 Subject: [PATCH 32/68] Fix more type errors --- src/apify/proxy_configuration.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/apify/proxy_configuration.py b/src/apify/proxy_configuration.py index 3118a1da..244f15b4 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/proxy_configuration.py @@ -154,7 +154,6 @@ def __init__( proxy_urls=[f'http://{_actor_config.proxy_hostname}:{_actor_config.proxy_port}'] if self._uses_apify_proxy else proxy_urls, new_url_function=new_url_function, tiered_proxy_urls=tiered_proxy_urls, - configuration=_actor_config, ) self._configuration = _actor_config @@ -218,6 +217,7 @@ async def new_proxy_info( return ProxyInfo( url=f'http://{username}:{self._password or ""}@{parsed_url.host}:{parsed_url.port}', + 
scheme='http', hostname=proxy_info.hostname, port=proxy_info.port, username=username, @@ -230,6 +230,7 @@ async def new_proxy_info( return ProxyInfo( url=proxy_info.url, + scheme=proxy_info.scheme, hostname=proxy_info.hostname, port=proxy_info.port, username=proxy_info.username, From 5a57a14d813b863d4a189dbdd0b76fe68f937d1b Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 7 Aug 2024 22:17:09 +0200 Subject: [PATCH 33/68] Update test --- tests/unit/test_proxy_configuration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index b13b3a55..93be7f72 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -265,6 +265,7 @@ async def test_new_proxy_info_basic(self: TestProxyConfigurationNewProxyInfo) -> 'password': password, 'proxy_tier': None, 'session_id': None, + 'scheme': 'http', } async def test_new_proxy_info_rotates_urls(self: TestProxyConfigurationNewProxyInfo) -> None: From 2714e8d48281a383f6be0a1889e33af1a33a060b Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 8 Aug 2024 13:23:34 +0200 Subject: [PATCH 34/68] Remove obsolete test case --- .../actor/test_actor_memory_storage_e2e.py | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 tests/unit/actor/test_actor_memory_storage_e2e.py diff --git a/tests/unit/actor/test_actor_memory_storage_e2e.py b/tests/unit/actor/test_actor_memory_storage_e2e.py deleted file mode 100644 index 330236b2..00000000 --- a/tests/unit/actor/test_actor_memory_storage_e2e.py +++ /dev/null @@ -1,123 +0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import Callable - -import pytest -from apify_shared.consts import ApifyEnvVars -from crawlee.models import Request -from crawlee.storage_client_manager import StorageClientManager - -from apify import Actor - - -@pytest.mark.parametrize('purge_on_start', [True, False]) -async def test_actor_memory_storage_client_key_value_store_e2e( - monkeypatch: pytest.MonkeyPatch, - purge_on_start: bool, # noqa: FBT001 - reset_default_instances: Callable[[], None], -) -> None: - """This test simulates two clean runs using memory storage. - The second run attempts to access data created by the first one. - We run 2 configurations with different `purge_on_start`.""" - # Configure purging env var - monkeypatch.setenv(ApifyEnvVars.PURGE_ON_START, f'{int(purge_on_start)}') - # Store old storage client so we have the object reference for comparison - old_client = StorageClientManager.get_storage_client() - async with Actor: - old_default_kvs = await Actor.open_key_value_store() - old_non_default_kvs = await Actor.open_key_value_store(name='non-default') - # Create data in default and non-default key-value store - await old_default_kvs.set_value('test', 'default value') - await old_non_default_kvs.set_value('test', 'non-default value') - - # We simulate another clean run, we expect the memory storage to read from the local data directory - # Default storages are purged based on purge_on_start parameter. 
- reset_default_instances() - - async with Actor: - # Check if we're using a different memory storage instance - assert old_client is not StorageClientManager.get_storage_client() - default_kvs = await Actor.open_key_value_store() - assert default_kvs is not old_default_kvs - non_default_kvs = await Actor.open_key_value_store(name='non-default') - assert non_default_kvs is not old_non_default_kvs - default_value = await default_kvs.get_value('test') - - if purge_on_start: - assert default_value is None - else: - assert default_value == 'default value' - - assert await non_default_kvs.get_value('test') == 'non-default value' - - -@pytest.mark.parametrize('purge_on_start', [True, False]) -async def test_actor_memory_storage_client_request_queue_e2e( - monkeypatch: pytest.MonkeyPatch, - purge_on_start: bool, # noqa: FBT001 - reset_default_instances: Callable[[], None], -) -> None: - """This test simulates two clean runs using memory storage. - The second run attempts to access data created by the first one. - We run 2 configurations with different `purge_on_start`.""" - # Configure purging env var - monkeypatch.setenv(ApifyEnvVars.PURGE_ON_START, f'{int(purge_on_start)}') - async with Actor: - # Add some requests to the default queue - default_queue = await Actor.open_request_queue() - for i in range(6): - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await default_queue.add_request( - Request.from_url( - unique_key=str(i), - url=request_url, - handled_at=datetime.now(timezone.utc) if was_handled else None, - ), - forefront=forefront, - ) - - # We simulate another clean run, we expect the memory storage to read from the local data directory - # Default storages are purged based on purge_on_start parameter. 
- reset_default_instances() - - async with Actor: - # Add some more requests to the default queue - default_queue = await Actor.open_request_queue() - for i in range(6, 12): - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await default_queue.add_request( - Request.from_url( - unique_key=str(i), - url=request_url, - handled_at=datetime.now(timezone.utc) if was_handled else None, - ), - forefront=forefront, - ) - - queue_info = await default_queue.get_info() - assert queue_info is not None - - # If the queue was purged between the runs, only the requests from the second run should be present, in the right order - if purge_on_start: - assert queue_info.total_request_count == 6 - assert queue_info.handled_request_count == 2 - - expected_pending_request_order = [10, 7, 6, 9] - # If the queue was NOT purged between the runs, all the requests should be in the queue in the right order - else: - assert queue_info.total_request_count == 12 - assert queue_info.handled_request_count == 4 - - expected_pending_request_order = [10, 7, 4, 1, 0, 3, 6, 9] - - actual_requests = list[Request]() - while req := await default_queue.fetch_next_request(): - actual_requests.append(req) - - assert [int(req.unique_key) for req in actual_requests] == expected_pending_request_order - assert [req.url for req in actual_requests] == [f'http://example.com/{req.unique_key}' for req in actual_requests] From 34a9e625365d9ef517c257f2756dde33421fba82 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 12:24:09 +0200 Subject: [PATCH 35/68] Fix handling of configuration on platform --- src/apify/config.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/apify/config.py b/src/apify/config.py index da03e766..115764c0 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -6,7 +6,7 @@ from crawlee._utils.models import timedelta_ms from crawlee.configuration import Configuration as CrawleeConfiguration -from pydantic import AliasChoices, Field +from pydantic import AliasChoices, BeforeValidator, Field from typing_extensions import Self @@ -85,7 +85,11 @@ class Configuration(CrawleeConfiguration): dedicated_cpus: Annotated[float | None, Field(alias='apify_dedicated_cpus')] = None - disable_outdated_warning: Annotated[bool, Field(alias='apify_disable_outdated_warning')] = False + disable_outdated_warning: Annotated[ + bool, + Field(alias='apify_disable_outdated_warning'), + BeforeValidator(lambda val: val or False), + ] = False fact: Annotated[str | None, Field(alias='apify_fact')] = None @@ -110,7 +114,11 @@ class Configuration(CrawleeConfiguration): log_format: Annotated[str | None, Field(alias='apify_log_format', deprecated=True)] = None - max_paid_dataset_items: Annotated[int | None, Field(alias='actor_max_paid_dataset_items')] = None + max_paid_dataset_items: Annotated[ + int | None, + Field(alias='actor_max_paid_dataset_items'), + BeforeValidator(lambda val: val or None), + ] = None meta_origin: Annotated[str | None, Field(alias='apify_meta_origin')] = None From 0547bb3a224af06b8b5717d1a8926e1e5604b97c Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 16:25:26 +0200 Subject: [PATCH 36/68] Hackily fix cloud storage usage --- src/apify/actor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index a11f69f1..97ac502a 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -344,8 +344,8 @@ async def open_dataset( 
self._raise_if_not_initialized() configuration_updates = {} - if force_cloud: - configuration_updates['is_at_home'] = True + if force_cloud or self._configuration.is_at_home: + configuration_updates['in_cloud'] = True return await Dataset.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) @@ -376,8 +376,8 @@ async def open_key_value_store( self._raise_if_not_initialized() configuration_updates = {} - if force_cloud: - configuration_updates['is_at_home'] = True + if force_cloud or self._configuration.is_at_home: + configuration_updates['in_cloud'] = True return await KeyValueStore.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) @@ -409,8 +409,8 @@ async def open_request_queue( self._raise_if_not_initialized() configuration_updates = {} - if force_cloud: - configuration_updates['is_at_home'] = True + if force_cloud or self._configuration.is_at_home: + configuration_updates['in_cloud'] = True return await RequestQueue.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) From 57057d22617afecaf4fd2020eb828a897ef0f1d8 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 17:45:54 +0200 Subject: [PATCH 37/68] Fix force_cloud tests --- .../apify_storage_client/request_queue_client.py | 12 ++++++++++-- .../request_queue_collection_client.py | 13 +++++++++++-- tests/integration/conftest.py | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py index 5ca4729c..64e60d4b 100644 --- a/src/apify/apify_storage_client/request_queue_client.py +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -73,8 +73,16 @@ async def add_request( forefront: bool = False, ) -> ProcessedRequest: return ProcessedRequest.model_validate( - await self._client.add_request( - request=request.model_dump(by_alias=True), + {'id': request.id, 'uniqueKey': request.unique_key} + | await self._client.add_request( + request=request.model_dump( + by_alias=True, + exclude={ + 'id', + 'json_', + 'order_no', + }, + ), forefront=forefront, ) ) diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/request_queue_collection_client.py index 7b6adda4..ca27930a 100644 --- a/src/apify/apify_storage_client/request_queue_collection_client.py +++ b/src/apify/apify_storage_client/request_queue_collection_client.py @@ -1,15 +1,24 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Annotated from crawlee.base_storage_client.base_request_queue_collection_client import BaseRequestQueueCollectionClient from crawlee.models import RequestQueueListPage, RequestQueueMetadata +from pydantic import Field # noqa: TCH002 from typing_extensions import override if TYPE_CHECKING: from apify_client.clients import RequestQueueCollectionClientAsync +__all__ = ['RequestQueueCollectionClient'] + + +class ExtendedRequestQueueMetadata(RequestQueueMetadata): + id: str + resource_directory: Annotated[str, Field(alias='resourceDirectory')] = '' + + class RequestQueueCollectionClient(BaseRequestQueueCollectionClient): """Request queue collection resource client implementation based on the Apify platform storage.""" @@ -24,7 +33,7 @@ async def get_or_create( name: str | None = None, schema: dict | None = None, ) -> RequestQueueMetadata: - return RequestQueueMetadata.model_validate( 
+ return ExtendedRequestQueueMetadata.model_validate( await self._client.get_or_create( name=id if id is not None else name, ) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 93433db5..6d754fd7 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -12,12 +12,12 @@ import pytest from apify_client import ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType +from crawlee.configuration import Configuration from crawlee.storage_client_manager import StorageClientManager from filelock import FileLock import apify.actor from ._utils import generate_unique_resource_name -from apify.config import Configuration if TYPE_CHECKING: from collections.abc import AsyncIterator, Awaitable, Mapping From 325fdd0242dfa62d24a7557ada4d0ac18fb739fa Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 18:31:00 +0200 Subject: [PATCH 38/68] Fix interval events --- src/apify/event_manager.py | 36 ++++++++++++++++++++++++-- tests/integration/conftest.py | 13 +++++++++- tests/integration/test_actor_events.py | 5 ++-- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/apify/event_manager.py b/src/apify/event_manager.py index d203d280..29465e4d 100644 --- a/src/apify/event_manager.py +++ b/src/apify/event_manager.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +from datetime import datetime # noqa: TCH003 from typing import TYPE_CHECKING, Annotated, Any, Literal, Union import websockets.client @@ -27,9 +28,35 @@ class PersistStateEvent(BaseModel): data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))] +class SystemInfoEventData(BaseModel): + mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')] + mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')] + mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')] + cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')] + cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')] + cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')] + is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')] + created_at: Annotated[datetime, Field(alias='createdAt')] + + def to_crawlee_format(self) -> EventSystemInfoData: + return EventSystemInfoData.model_validate( + { + 'cpu_info': { + 'used_ratio': self.cpu_current_usage, + 'created_at': self.created_at, + }, + 'memory_info': { + 'total_size': self.mem_max_bytes, + 'current_size': self.mem_current_bytes, + 'created_at': self.created_at, + }, + } + ) + + class SystemInfoEvent(BaseModel): name: Literal[Event.SYSTEM_INFO] - data: EventSystemInfoData + data: SystemInfoEventData class MigratingEvent(BaseModel): @@ -156,7 +183,12 @@ async def _process_platform_messages(self, ws_url: str) -> None: logger.info(f'Unknown message received: event_name={parsed_message.name}, event_data={parsed_message.data}') continue - self.emit(event=parsed_message.name, event_data=parsed_message.data) + self.emit( + event=parsed_message.name, + event_data=parsed_message.data + if not isinstance(parsed_message.data, SystemInfoEventData) + else parsed_message.data.to_crawlee_format(), + ) if parsed_message.name == Event.MIGRATING: await self._emit_persist_state_event_rec_task.stop() diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 6d754fd7..2364e848 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -172,7 +172,18 @@ async def _make_actor( if main_func: 
func_source = textwrap.dedent(inspect.getsource(main_func)) func_source = func_source.replace(f'def {main_func.__name__}(', 'def main(') - main_py = f'import asyncio\n\nfrom apify import Actor\n\n\n{func_source}' + main_py = '\n'.join( # noqa: FLY002 + [ + 'import asyncio', + '', + 'from apify import Actor', + 'from crawlee.events.types import Event', + '', + '', + '', + func_source, + ] + ) if main_py: source_files = {'src/main.py': main_py} diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index eec543f0..eeddaa80 100644 --- a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -20,11 +20,12 @@ async def main() -> None: from typing import Any, Callable from apify_shared.consts import ActorEventTypes, ApifyEnvVars + from crawlee.events.types import EventSystemInfoData os.environ[ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS] = '900' was_system_info_emitted = False - system_infos = [] + system_infos = list[EventSystemInfoData]() def on_event(event_type: ActorEventTypes) -> Callable: async def log_event(data: Any) -> None: @@ -51,7 +52,7 @@ async def log_event(data: Any) -> None: # Check that parsing datetimes works correctly # Check `createdAt` is a datetime (so it's the same locally and on platform) - assert isinstance(system_infos[0]['createdAt'], datetime) + assert isinstance(system_infos[0].cpu_info.created_at, datetime) actor = await make_actor('actor-interval-events', main_func=main) From a388cc11f81ef73d5ff15efda22e04a981911298 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 18:41:32 +0200 Subject: [PATCH 39/68] Fix unit test --- tests/unit/test_event_manager.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 102e0e91..7111666e 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -14,7 +14,7 @@ from crawlee.events.types import Event, EventSystemInfoData from apify.config import Configuration -from apify.event_manager import EventManager, PlatformEventManager +from apify.event_manager import EventManager, PlatformEventManager, SystemInfoEventData class TestEventManagerLocal: @@ -172,14 +172,16 @@ async def send_platform_event(event_name: Event, data: Any = None) -> None: monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, f'ws://localhost:{port}') dummy_system_info = { - 'cpuInfo': {'usedRatio': 0.66, 'createdAt': '2024-04-04T12:44:00Z'}, - 'memoryInfo': { - 'currentSize': 11, - 'totalSize': 42, - 'createdAt': '2024-04-04T12:44:00Z', - }, + 'memAvgBytes': 19328860.328293584, + 'memCurrentBytes': 65171456, + 'memMaxBytes': 65171456, + 'cpuAvgUsage': 2.0761105633130397, + 'cpuMaxUsage': 53.941134593993326, + 'cpuCurrentUsage': 8.45549815498155, + 'isCpuOverloaded': False, + 'createdAt': '2024-08-09T16:04:16.161Z', } - EventSystemInfoData.model_validate(dummy_system_info) + SystemInfoEventData.model_validate(dummy_system_info) async with PlatformEventManager(Configuration.get_global_configuration()) as event_manager: event_calls = [] @@ -192,5 +194,6 @@ def listener(data: Any) -> None: # Test sending event with data await send_platform_event(Event.SYSTEM_INFO, dummy_system_info) await asyncio.sleep(0.1) - assert event_calls == [dummy_system_info] + assert len(event_calls) == 1 + assert event_calls[0]['cpuInfo']['usedRatio'] == 8.45549815498155 event_calls.clear() From 60f54689fd5dcf19beb9e9f348b0912c857bb31c Mon Sep 17 00:00:00 2001 From: Jan 
Buchar Date: Fri, 9 Aug 2024 18:45:26 +0200 Subject: [PATCH 40/68] Lint --- tests/unit/test_event_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 7111666e..4cd2db87 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -11,7 +11,7 @@ import websockets import websockets.server from apify_shared.consts import ActorEnvVars -from crawlee.events.types import Event, EventSystemInfoData +from crawlee.events.types import Event from apify.config import Configuration from apify.event_manager import EventManager, PlatformEventManager, SystemInfoEventData From 7a829d30ffffd09ba7ae30a0d64a472083ee2692 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 18:56:09 +0200 Subject: [PATCH 41/68] Fix actor log test --- tests/integration/test_actor_log.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index f07598e6..216707b1 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -80,7 +80,7 @@ async def main() -> None: assert run_log_lines.pop(0) == 'ERROR Error message' assert run_log_lines.pop(0) == 'ERROR Exception message' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' - assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 34, in main' + assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 36, in main' assert run_log_lines.pop(0) == " raise ValueError('Dummy ValueError')" assert run_log_lines.pop(0) == ' ValueError: Dummy ValueError' assert run_log_lines.pop(0) == 'INFO Multi' @@ -89,7 +89,7 @@ async def main() -> None: assert run_log_lines.pop(0) == ' message' assert run_log_lines.pop(0) == 'ERROR Actor failed with an exception' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' - assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 42, in main' + assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 44, in main' assert run_log_lines.pop(0) == " raise RuntimeError('Dummy RuntimeError')" assert run_log_lines.pop(0) == ' RuntimeError: Dummy RuntimeError' assert run_log_lines.pop(0) == 'INFO Exiting actor ({"exit_code": 91})' From 76a5ea587d341c9ee3b85835299b28e8ab7fa244 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 21:07:18 +0200 Subject: [PATCH 42/68] Fix get_public_url test --- tests/integration/test_actor_key_value_store.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 4d306d55..73e888b7 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -1,12 +1,11 @@ from __future__ import annotations -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING from apify_shared.consts import ApifyEnvVars from ._utils import generate_unique_resource_name from apify import Actor -from apify.apify_storage_client.key_value_store_client import KeyValueStoreClient if TYPE_CHECKING: import pytest @@ -187,6 +186,10 @@ async def main(): class TestGetPublicUrl: async def test_get_public_url(self: TestGetPublicUrl, make_actor: ActorFactory) -> None: async def main() -> None: + from typing import cast + + from apify.apify_storage_client.key_value_store_client import KeyValueStoreClient + async with Actor: 
public_api_url = Actor.config.api_public_base_url default_store_id = Actor.config.default_key_value_store_id From 7488a1136a9786cdcbb2ddf05b07f9e62a519acf Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 21:07:59 +0200 Subject: [PATCH 43/68] Fix actor_lifecycle test --- tests/integration/test_actor_lifecycle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_actor_lifecycle.py b/tests/integration/test_actor_lifecycle.py index 8af6a92a..4b2545dd 100644 --- a/tests/integration/test_actor_lifecycle.py +++ b/tests/integration/test_actor_lifecycle.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING -import apify.actor from apify import Actor if TYPE_CHECKING: @@ -43,6 +42,8 @@ async def main() -> None: async def test_async_with_actor_properly_initialize(self: TestActorInit, make_actor: ActorFactory) -> None: async def main() -> None: + import apify.actor + async with Actor: assert apify.actor._get_default_instance()._is_initialized assert apify.actor._get_default_instance()._is_initialized is False From d11cb4ff32b628170bda8aad17821dba00e6ed5c Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 9 Aug 2024 22:07:29 +0200 Subject: [PATCH 44/68] Fix request queue stuff --- .../request_queue_client.py | 41 +++++++++++++++---- .../request_queue_collection_client.py | 16 ++------ 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py index 64e60d4b..5d2d83cc 100644 --- a/src/apify/apify_storage_client/request_queue_client.py +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -30,7 +30,7 @@ def __init__(self, apify_request_queue_client: RequestQueueClientAsync) -> None: @override async def get(self) -> RequestQueueMetadata | None: result = await self._client.get() - return RequestQueueMetadata.model_validate(result) if result else None + return RequestQueueMetadata.model_validate({'resourceDirectory': ''} | result) if result else None @override async def update( @@ -39,7 +39,8 @@ async def update( name: str | None = None, ) -> RequestQueueMetadata: return RequestQueueMetadata.model_validate( - await self._client.update( + {'resourceDirectory': ''} + | await self._client.update( name=name, ) ) @@ -51,7 +52,7 @@ async def delete(self) -> None: @override async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: return RequestQueueHead.model_validate( - self._client.list_head( + await self._client.list_head( limit=limit, ), ) @@ -100,8 +101,15 @@ async def update_request( forefront: bool = False, ) -> ProcessedRequest: return ProcessedRequest.model_validate( - await self._client.update_request( - request=request.model_dump(by_alias=True), + {'id': request.id, 'uniqueKey': request.unique_key} + | await self._client.update_request( + request=request.model_dump( + by_alias=True, + exclude={ + 'json_', + 'order_no', + }, + ), forefront=forefront, ) ) @@ -147,7 +155,17 @@ async def batch_add_requests( ) -> BatchRequestsOperationResponse: return BatchRequestsOperationResponse.model_validate( await self._client.batch_add_requests( - requests=[r.model_dump(by_alias=True) for r in requests], + requests=[ + r.model_dump( + by_alias=True, + exclude={ + 'id', + 'json_', + 'order_no', + }, + ) + for r in requests + ], forefront=forefront, ) ) @@ -156,7 +174,16 @@ async def batch_add_requests( async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsOperationResponse: return 
BatchRequestsOperationResponse.model_validate( await self._client.batch_delete_requests( - requests=[r.model_dump(by_alias=True) for r in requests], + requests=[ + r.model_dump( + by_alias=True, + exclude={ + 'json_', + 'order_no', + }, + ) + for r in requests + ], ) ) diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/request_queue_collection_client.py index ca27930a..a513846b 100644 --- a/src/apify/apify_storage_client/request_queue_collection_client.py +++ b/src/apify/apify_storage_client/request_queue_collection_client.py @@ -1,24 +1,15 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Annotated +from typing import TYPE_CHECKING from crawlee.base_storage_client.base_request_queue_collection_client import BaseRequestQueueCollectionClient from crawlee.models import RequestQueueListPage, RequestQueueMetadata -from pydantic import Field # noqa: TCH002 from typing_extensions import override if TYPE_CHECKING: from apify_client.clients import RequestQueueCollectionClientAsync -__all__ = ['RequestQueueCollectionClient'] - - -class ExtendedRequestQueueMetadata(RequestQueueMetadata): - id: str - resource_directory: Annotated[str, Field(alias='resourceDirectory')] = '' - - class RequestQueueCollectionClient(BaseRequestQueueCollectionClient): """Request queue collection resource client implementation based on the Apify platform storage.""" @@ -33,8 +24,9 @@ async def get_or_create( name: str | None = None, schema: dict | None = None, ) -> RequestQueueMetadata: - return ExtendedRequestQueueMetadata.model_validate( - await self._client.get_or_create( + return RequestQueueMetadata.model_validate( + {'resourceDirectory': ''} + | await self._client.get_or_create( name=id if id is not None else name, ) ) From b7e7d8c0dde007ab596e4ee983dad495db30c75c Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 12 Aug 2024 13:10:19 +0200 Subject: [PATCH 45/68] Remove old consts --- src/apify/actor.py | 5 ++++- src/apify/consts.py | 49 --------------------------------------------- 2 files changed, 4 insertions(+), 50 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 97ac502a..28340151 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -169,7 +169,7 @@ async def exit( self, *, exit_code: int = 0, - event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, # noqa: ARG002 + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, status_message: str | None = None, cleanup_timeout: timedelta = timedelta(seconds=30), ) -> None: @@ -202,6 +202,9 @@ async def finalize() -> None: # Sleep for a bit so that the listeners have a chance to trigger await asyncio.sleep(0.1) + if event_listeners_timeout: + await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout) + await self._event_manager.__aexit__(None, None, None) await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) diff --git a/src/apify/consts.py b/src/apify/consts.py index 4ed8bba7..71f373a0 100644 --- a/src/apify/consts.py +++ b/src/apify/consts.py @@ -1,59 +1,10 @@ from __future__ import annotations import re -import warnings from datetime import timedelta -from enum import Enum -from typing import Any - -DEPRECATED_NAMES = [ - 'BOOL_ENV_VARS', - 'DATETIME_ENV_VARS', - 'FLOAT_ENV_VARS', - 'INTEGER_ENV_VARS', - 'STRING_ENV_VARS', - 'ActorEventTypes', - 'ActorExitCodes', - 'ApifyEnvVars', -] - - -# The following piece of code is highly inspired by the example in 
https://peps.python.org/pep-0562. -# The else branch is missing intentionally! Check the following discussion for details: -# https://github.com/apify/apify-client-python/pull/132#discussion_r1277294315. -def __getattr__(name: str) -> Any: - if name in DEPRECATED_NAMES: - warnings.warn( - ( - f'Importing "{name}" from "apify_client.consts" is deprecated and will be removed in the future. ' - 'Please use "apify_shared" library instead.' - ), - category=DeprecationWarning, - stacklevel=2, - ) - return globals()[f'_{name}'] - raise AttributeError(f'module {__name__!r} has no attribute {name!r}') - - -class StorageTypes(str, Enum): - """Possible Apify storage types.""" - - DATASET = 'Dataset' - KEY_VALUE_STORE = 'Key-value store' - REQUEST_QUEUE = 'Request queue' - - -DEFAULT_API_PARAM_LIMIT = 1000 - -REQUEST_ID_LENGTH = 15 - -REQUEST_QUEUE_HEAD_MAX_LIMIT = 1000 EVENT_LISTENERS_TIMEOUT = timedelta(seconds=5) BASE64_REGEXP = '[-A-Za-z0-9+/]*={0,3}' ENCRYPTED_INPUT_VALUE_PREFIX = 'ENCRYPTED_VALUE' ENCRYPTED_INPUT_VALUE_REGEXP = re.compile(f'^{ENCRYPTED_INPUT_VALUE_PREFIX}:({BASE64_REGEXP}):({BASE64_REGEXP})$') - -# 9MB -MAX_PAYLOAD_SIZE_BYTES = 9437184 From 855972b13d3c4979adc6a81efbf2335b0e58bd41 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 12 Aug 2024 13:29:43 +0200 Subject: [PATCH 46/68] Use CrawleeLogFormatter --- src/apify/log.py | 115 ++--------------------------------------------- 1 file changed, 3 insertions(+), 112 deletions(-) diff --git a/src/apify/log.py b/src/apify/log.py index 180ab826..7ee60ca3 100644 --- a/src/apify/log.py +++ b/src/apify/log.py @@ -1,16 +1,8 @@ from __future__ import annotations -import json import logging -import textwrap -import traceback -from typing import Any - -from apify_shared.utils import ignore_docs -from colorama import Fore, Style, just_fix_windows_console - -just_fix_windows_console() +from crawlee.log_config import CrawleeLogFormatter # Name of the logger used throughout the library (resolves to 'apify') logger_name = __name__.split('.')[0] @@ -18,107 +10,6 @@ # Logger used throughout the library logger = logging.getLogger(logger_name) -_LOG_NAME_COLOR = Fore.LIGHTBLACK_EX - -_LOG_LEVEL_COLOR = { - logging.DEBUG: Fore.BLUE, - logging.INFO: Fore.GREEN, - logging.WARNING: Fore.YELLOW, - logging.ERROR: Fore.RED, - logging.CRITICAL: Fore.RED, -} - -_LOG_LEVEL_SHORT_ALIAS = { - logging.DEBUG: 'DEBUG', - logging.INFO: 'INFO ', - logging.WARNING: 'WARN ', - logging.ERROR: 'ERROR', -} - -# So that all the log messages have the same alignment -_LOG_MESSAGE_INDENT = ' ' * 6 - - -class ActorLogFormatter(logging.Formatter): - """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields. - - It formats the log records so that they: - - start with the level (colorized, and padded to 5 chars so that it is nicely aligned) - - then have the actual log message, if it's multiline then it's nicely indented - - then have the stringified extra log fields - - then, if an exception is a part of the log record, prints the formatted exception. - """ - - # The fields that are added to the log record with `logger.log(..., extra={...})` - # are just merged in the log record with the other log record properties, and you can't get them in some nice, isolated way. 
- # So, to get the extra fields, we just compare all the properties present in the log record - # with properties present in an empty log record, - # and extract all the extra ones not present in the empty log record - empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None) - - def __init__( - self: ActorLogFormatter, - include_logger_name: bool = False, # noqa: FBT001, FBT002 - *args: Any, - **kwargs: Any, - ) -> None: - """Create an instance of the ActorLogFormatter. - - Args: - include_logger_name: Include logger name at the beginning of the log line. Defaults to False. - args: Arguments passed to the parent class. - kwargs: Keyword arguments passed to the parent class. - """ - super().__init__(*args, **kwargs) - self.include_logger_name = include_logger_name - - def _get_extra_fields(self: ActorLogFormatter, record: logging.LogRecord) -> dict[str, Any]: - extra_fields: dict[str, Any] = {} - for key, value in record.__dict__.items(): - if key not in self.empty_record.__dict__: - extra_fields[key] = value # noqa: PERF403 - - return extra_fields - - @ignore_docs - def format(self: ActorLogFormatter, record: logging.LogRecord) -> str: - """Format the log record nicely. - - This formats the log record so that it: - - starts with the level (colorized, and padded to 5 chars so that it is nicely aligned) - - then has the actual log message, if it's multiline then it's nicely indented - - then has the stringified extra log fields - - then, if an exception is a part of the log record, prints the formatted exception. - """ - logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} ' - - # Colorize the log level, and shorten it to 6 chars tops - level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '') - level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname) - level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} ' - - # Format the exception, if there is some - # Basically just print the traceback and indent it a bit - exception_string = '' - if record.exc_info: - exc_info = record.exc_info - record.exc_info = None - exception_string = ''.join(traceback.format_exception(*exc_info)).rstrip() - exception_string = '\n' + textwrap.indent(exception_string, _LOG_MESSAGE_INDENT) - - # Format the extra log record fields, if there were some - # Just stringify them to JSON and color them gray - extra_string = '' - extra = self._get_extra_fields(record) - if extra: - extra_string = f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}' - - # Format the actual log message, and indent everything but the first line - log_string = super().format(record) - log_string = textwrap.indent(log_string, _LOG_MESSAGE_INDENT).lstrip() - - if self.include_logger_name: - # Include logger name at the beginning of the log line - return f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}' - return f'{level_string}{log_string}{extra_string}{exception_string}' +class ActorLogFormatter(CrawleeLogFormatter): # noqa: D101 Inherited from parent class + pass From 9aa4ea37d86a06f92bfd478ec10f73fc5b28979a Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 12 Aug 2024 13:38:07 +0200 Subject: [PATCH 47/68] Remove asserts --- src/apify/actor.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 28340151..4cca7cd6 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -776,7 +776,8 @@ async def metamorph( 
custom_after_sleep = self._configuration.metamorph_after_sleep # If is_at_home() is True, config.actor_run_id is always set - assert self._configuration.actor_run_id is not None # noqa: S101 + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') await self._apify_client.run(self._configuration.actor_run_id).metamorph( target_actor_id=target_actor_id, @@ -815,7 +816,9 @@ async def reboot( await self._event_manager.__aexit__(None, None, None) - assert self._configuration.actor_run_id is not None # noqa: S101 + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') + await self._apify_client.run(self._configuration.actor_run_id).reboot() if custom_after_sleep: @@ -860,7 +863,8 @@ async def add_webhook( return None # If is_at_home() is True, config.actor_run_id is always set - assert self._configuration.actor_run_id is not None # noqa: S101 + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') return await self._apify_client.webhooks().create( actor_run_id=self._configuration.actor_run_id, @@ -895,7 +899,8 @@ async def set_status_message( return None # If is_at_home() is True, config.actor_run_id is always set - assert self._configuration.actor_run_id is not None # noqa: S101 + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') return await self._apify_client.run(self._configuration.actor_run_id).update( status_message=status_message, is_status_message_terminal=is_terminal From 01c1dc725d00bd4f46252ae8d97205452b4c3103 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 12 Aug 2024 13:46:39 +0200 Subject: [PATCH 48/68] Update test --- tests/integration/test_actor_log.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index 216707b1..fee29ac7 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -72,24 +72,24 @@ async def main() -> None: assert run_log_lines.pop(0).startswith('ACTOR: Pulling Docker image') assert run_log_lines.pop(0) == 'ACTOR: Creating Docker container.' assert run_log_lines.pop(0) == 'ACTOR: Starting Docker container.' - assert run_log_lines.pop(0) == 'INFO Initializing actor...' - assert run_log_lines.pop(0).startswith(f'INFO System info ({{"apify_sdk_version": "{__version__}", "apify_client_version": "') - assert run_log_lines.pop(0) == 'DEBUG Debug message' - assert run_log_lines.pop(0) == 'INFO Info message' - assert run_log_lines.pop(0) == 'WARN Warning message' - assert run_log_lines.pop(0) == 'ERROR Error message' - assert run_log_lines.pop(0) == 'ERROR Exception message' + assert run_log_lines.pop(0) == '[apify] INFO Initializing actor...' 
+ assert run_log_lines.pop(0).startswith(f'[apify] INFO System info ({{"apify_sdk_version": "{__version__}", "apify_client_version": "') + assert run_log_lines.pop(0) == '[apify] DEBUG Debug message' + assert run_log_lines.pop(0) == '[apify] INFO Info message' + assert run_log_lines.pop(0) == '[apify] WARN Warning message' + assert run_log_lines.pop(0) == '[apify] ERROR Error message' + assert run_log_lines.pop(0) == '[apify] ERROR Exception message' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 36, in main' assert run_log_lines.pop(0) == " raise ValueError('Dummy ValueError')" assert run_log_lines.pop(0) == ' ValueError: Dummy ValueError' - assert run_log_lines.pop(0) == 'INFO Multi' + assert run_log_lines.pop(0) == '[apify] INFO Multi' assert run_log_lines.pop(0) == ' line' assert run_log_lines.pop(0) == ' log' assert run_log_lines.pop(0) == ' message' - assert run_log_lines.pop(0) == 'ERROR Actor failed with an exception' + assert run_log_lines.pop(0) == '[apify] ERROR Actor failed with an exception' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 44, in main' assert run_log_lines.pop(0) == " raise RuntimeError('Dummy RuntimeError')" assert run_log_lines.pop(0) == ' RuntimeError: Dummy RuntimeError' - assert run_log_lines.pop(0) == 'INFO Exiting actor ({"exit_code": 91})' + assert run_log_lines.pop(0) == '[apify] INFO Exiting actor ({"exit_code": 91})' From 845071f2f5b08c923fc16cdf3248fd3de771ae36 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 12 Aug 2024 16:06:02 +0200 Subject: [PATCH 49/68] actor -> Actor --- CONTRIBUTING.md | 2 +- docs/02-guides/02-beautiful-soup.mdx | 2 +- docs/02-guides/03-playwright.mdx | 4 +- docs/02-guides/04-selenium.mdx | 2 +- docs/02-guides/05-scrapy.mdx | 2 +- docs/03-concepts/04-actor-events.mdx | 2 +- src/apify/actor.py | 213 ++++++++++---------- src/apify/config.py | 4 +- src/apify/event_manager.py | 6 +- tests/integration/README.md | 26 +-- tests/integration/conftest.py | 22 +- tests/integration/test_actor_api_helpers.py | 4 +- tests/integration/test_actor_lifecycle.py | 4 +- tests/integration/test_actor_log.py | 4 +- tests/unit/actor/test_actor_lifecycle.py | 2 +- tests/unit/actor/test_actor_log.py | 6 +- website/src/pages/index.js | 10 +- 17 files changed, 156 insertions(+), 159 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a7cb99ad..bb92f3e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,7 +50,7 @@ tests with HTML coverage report execute `make unit-tests-cov`. ## Integration tests -We have integration tests which build and run actors using the Python SDK on the Apify Platform. To run these tests, +We have integration tests which build and run Actors using the Python SDK on the Apify Platform. To run these tests, you need to set the `APIFY_TEST_USER_API_TOKEN` environment variable to the API token of the Apify user you want to use for the tests, and then start them with `make integration-tests`. 
diff --git a/docs/02-guides/02-beautiful-soup.mdx b/docs/02-guides/02-beautiful-soup.mdx index a625741f..a7ebdc84 100644 --- a/docs/02-guides/02-beautiful-soup.mdx +++ b/docs/02-guides/02-beautiful-soup.mdx @@ -36,7 +36,7 @@ async def main(): max_depth = actor_input.get('max_depth', 1) if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Enqueue the starting URLs in the default request queue diff --git a/docs/02-guides/03-playwright.mdx b/docs/02-guides/03-playwright.mdx index 8094e621..a46f578f 100644 --- a/docs/02-guides/03-playwright.mdx +++ b/docs/02-guides/03-playwright.mdx @@ -29,7 +29,7 @@ To create Actors which use Playwright, start from the [Playwright & Python](http On the Apify platform, the Actor will already have Playwright and the necessary browsers preinstalled in its Docker image, including the tools and setup necessary to run browsers in headful mode. -When running the Actor locally, you'll need to finish the Playwright setup yourself before you can run the actor. +When running the Actor locally, you'll need to finish the Playwright setup yourself before you can run the Actor. @@ -69,7 +69,7 @@ async def main(): max_depth = actor_input.get('max_depth', 1) if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Enqueue the starting URLs in the default request queue diff --git a/docs/02-guides/04-selenium.mdx b/docs/02-guides/04-selenium.mdx index 3efa5149..3fb77d7c 100644 --- a/docs/02-guides/04-selenium.mdx +++ b/docs/02-guides/04-selenium.mdx @@ -53,7 +53,7 @@ async def main(): max_depth = actor_input.get('max_depth', 1) if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Enqueue the starting URLs in the default request queue diff --git a/docs/02-guides/05-scrapy.mdx b/docs/02-guides/05-scrapy.mdx index f73c4a3c..ea9825d0 100644 --- a/docs/02-guides/05-scrapy.mdx +++ b/docs/02-guides/05-scrapy.mdx @@ -87,7 +87,7 @@ class TitleSpider(scrapy.Spider): if link_url.startswith(('http://', 'https://')): yield scrapy.Request(link_url) -# Pushes the scraped items into the actor's default dataset +# Pushes the scraped items into the Actor's default dataset class ActorDatasetPushPipeline: async def process_item(self, item, spider): item_dict = ItemAdapter(item).asdict() diff --git a/docs/03-concepts/04-actor-events.mdx b/docs/03-concepts/04-actor-events.mdx index 8d795cab..c041035b 100644 --- a/docs/03-concepts/04-actor-events.mdx +++ b/docs/03-concepts/04-actor-events.mdx @@ -91,7 +91,7 @@ async def main(): # Save the state when the `PERSIST_STATE` event happens async def save_state(event_data): nonlocal processed_items - Actor.log.info('Saving actor state', extra=event_data) + Actor.log.info('Saving Actor state', extra=event_data) await Actor.set_value('STATE', processed_items) Actor.on(ActorEventTypes.PERSIST_STATE, save_state) diff --git a/src/apify/actor.py b/src/apify/actor.py index 4cca7cd6..fa11973c 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -50,7 +50,7 @@ def __init__(self, config: Configuration | None = None) -> None: and that is their preferred usage. Args: - config (Configuration, optional): The actor configuration to be used. 
If not passed, a new Configuration instance will be created. + config (Configuration, optional): The Actor configuration to be used. If not passed, a new Configuration instance will be created. """ self._configuration = config or Configuration.get_global_configuration() self._apify_client = self.new_client() @@ -132,27 +132,27 @@ def log(self) -> logging.Logger: def _raise_if_not_initialized(self) -> None: if not self._is_initialized: - raise RuntimeError('The actor was not initialized!') + raise RuntimeError('The Actor was not initialized!') async def init(self) -> None: - """Initialize the actor instance. + """Initialize the Actor instance. This initializes the Actor instance. - It configures the right storage client based on whether the actor is running locally or on the Apify platform, - it initializes the event manager for processing actor events, + It configures the right storage client based on whether the Actor is running locally or on the Apify platform, + it initializes the event manager for processing Actor events, and starts an interval for regularly sending `PERSIST_STATE` events, - so that the actor can regularly persist its state in response to these events. + so that the Actor can regularly persist its state in response to these events. - This method should be called immediately before performing any additional actor actions, + This method should be called immediately before performing any additional Actor actions, and it should be called only once. """ if self._is_initialized: - raise RuntimeError('The actor was already initialized!') + raise RuntimeError('The Actor was already initialized!') self._is_exiting = False self._was_final_persist_state_emitted = False - self.log.info('Initializing actor...') + self.log.info('Initializing Actor...') self.log.info('System info', extra=get_system_info()) # TODO: Print outdated SDK version warning (we need a new env var for this) @@ -173,7 +173,7 @@ async def exit( status_message: str | None = None, cleanup_timeout: timedelta = timedelta(seconds=30), ) -> None: - """Exit the actor instance. + """Exit the Actor instance. This stops the Actor instance. It cancels all the intervals for regularly sending `PERSIST_STATE` events, @@ -182,9 +182,9 @@ async def exit( and stops the event manager. Args: - exit_code (int, optional): The exit code with which the actor should fail (defaults to `0`). - event_listeners_timeout (timedelta, optional): How long should the actor wait for actor event listeners to finish before exiting. - status_message (str, optional): The final status message that the actor should display. + exit_code (int, optional): The exit code with which the Actor should fail (defaults to `0`). + event_listeners_timeout (timedelta, optional): How long should the Actor wait for Actor event listeners to finish before exiting. + status_message (str, optional): The final status message that the Actor should display. cleanup_timeout (timedelta, optional): How long we should wait for event listeners. 
""" self._raise_if_not_initialized() @@ -193,7 +193,7 @@ async def exit( exit_code = maybe_extract_enum_member_value(exit_code) - self.log.info('Exiting actor', extra={'exit_code': exit_code}) + self.log.info('Exiting Actor', extra={'exit_code': exit_code}) async def finalize() -> None: if status_message is not None: @@ -211,11 +211,11 @@ async def finalize() -> None: self._is_initialized = False if is_running_in_ipython(): - self.log.debug(f'Not calling sys.exit({exit_code}) because actor is running in IPython') + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython') elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508 - self.log.debug(f'Not calling sys.exit({exit_code}) because actor is running in an unit test') + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test') elif hasattr(asyncio, '_nest_patched'): - self.log.debug(f'Not calling sys.exit({exit_code}) because actor is running in a nested event loop') + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in a nested event loop') else: sys.exit(exit_code) @@ -226,19 +226,19 @@ async def fail( exception: BaseException | None = None, status_message: str | None = None, ) -> None: - """Fail the actor instance. + """Fail the Actor instance. This performs all the same steps as Actor.exit(), but it additionally sets the exit code to `1` (by default). Args: - exit_code (int, optional): The exit code with which the actor should fail (defaults to `1`). - exception (BaseException, optional): The exception with which the actor failed. - status_message (str, optional): The final status message that the actor should display. + exit_code (int, optional): The exit code with which the Actor should fail (defaults to `1`). + exception (BaseException, optional): The exception with which the Actor failed. + status_message (str, optional): The final status message that the Actor should display. """ self._raise_if_not_initialized() - # In IPython, we don't run `sys.exit()` during actor exits, + # In IPython, we don't run `sys.exit()` during Actor exits, # so the exception traceback will be printed on its own if exception and not is_running_in_ipython(): self.log.exception('Actor failed with an exception', exc_info=exception) @@ -246,10 +246,10 @@ async def fail( await self.exit(exit_code=exit_code, status_message=status_message) async def main(self, main_actor_function: Callable[[], MainReturnType]) -> MainReturnType | None: - """Initialize the actor, run the passed function and finish the actor cleanly. + """Initialize the Actor, run the passed function and finish the Actor cleanly. **The `Actor.main()` function is optional** and is provided merely for your convenience. - It is mainly useful when you're running your code as an actor on the [Apify platform](https://apify.com/actors). + It is mainly useful when you're running your code as an Actor on the [Apify platform](https://apify.com/actors). The `Actor.main()` function performs the following actions: @@ -260,11 +260,11 @@ async def main(self, main_actor_function: Callable[[], MainReturnType]) -> MainR - If the user function was an async function, it awaits it. - If the user function throws an exception or some other error is encountered, it prints error details to console so that they are stored to the log, - and finishes the actor cleanly. + and finishes the Actor cleanly. - Finally, it exits the Python process, with zero exit code on success and non-zero on errors. 
Args: - main_actor_function (Callable): The user function which should be run in the actor + main_actor_function (Callable): The user function which should be run in the Actor """ if not inspect.isfunction(main_actor_function): raise TypeError(f'First argument passed to Actor.main() must be a function, but instead it was {type(main_actor_function)}') @@ -334,9 +334,9 @@ async def open_dataset( Args: id (str, optional): ID of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. + If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. name (str, optional): Name of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. + If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. This way it is possible to combine local and cloud storage. @@ -367,9 +367,9 @@ async def open_key_value_store( Args: id (str, optional): ID of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. + If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. name (str, optional): Name of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. + If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. This way it is possible to combine local and cloud storage. @@ -400,9 +400,9 @@ async def open_request_queue( Args: id (str, optional): ID of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. + If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. name (str, optional): Name of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. + If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. This way it is possible to combine local and cloud storage. @@ -418,7 +418,7 @@ async def open_request_queue( return await RequestQueue.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) async def push_data(self, data: Any) -> None: - """Store an object or a list of objects to the default dataset of the current actor run. + """Store an object or a list of objects to the default dataset of the current Actor run. Args: data (object or list of objects, optional): The data to push to the default dataset. 
@@ -432,7 +432,7 @@ async def push_data(self, data: Any) -> None: await dataset.push_data(data) async def get_input(self) -> Any: - """Get the actor input value from the default key-value store associated with the current actor run.""" + """Get the Actor input value from the default key-value store associated with the current Actor run.""" self._raise_if_not_initialized() input_value = await self.get_value(self._configuration.input_key) @@ -448,7 +448,7 @@ async def get_input(self) -> Any: return input_value async def get_value(self, key: str, default_value: Any = None) -> Any: - """Get a value from the default key-value store associated with the current actor run. + """Get a value from the default key-value store associated with the current Actor run. Args: key (str): The key of the record which to retrieve. @@ -466,7 +466,7 @@ async def set_value( *, content_type: str | None = None, ) -> None: - """Set or delete a value in the default key-value store associated with the current actor run. + """Set or delete a value in the default key-value store associated with the current Actor run. Args: key (str): The key of the record which to set. @@ -479,29 +479,29 @@ async def set_value( return await key_value_store.set_value(key, value, content_type=content_type) def on(self, event_name: Event, listener: Callable) -> Callable: - """Add an event listener to the actor's event manager. + """Add an event listener to the Actor's event manager. The following events can be emitted: - `ActorEventTypes.SYSTEM_INFO`: - Emitted every minute, the event data contains info about the resource usage of the actor. + Emitted every minute, the event data contains info about the resource usage of the Actor. - `ActorEventTypes.MIGRATING`: - Emitted when the actor running on the Apify platform is going to be migrated to another worker server soon. - You can use it to persist the state of the actor and gracefully stop your in-progress tasks, + Emitted when the Actor running on the Apify platform is going to be migrated to another worker server soon. + You can use it to persist the state of the Actor and gracefully stop your in-progress tasks, so that they are not interrupted by the migration.. - `ActorEventTypes.PERSIST_STATE`: - Emitted in regular intervals (by default 60 seconds) to notify the actor that it should persist its state, - in order to avoid repeating all work when the actor restarts. + Emitted in regular intervals (by default 60 seconds) to notify the Actor that it should persist its state, + in order to avoid repeating all work when the Actor restarts. This event is automatically emitted together with the migrating event, in which case the `isMigrating` flag in the event data is set to True, otherwise the flag is False. Note that this event is provided merely for your convenience, you can achieve the same effect using an interval and listening for the migrating event. - `ActorEventTypes.ABORTING`: - When a user aborts an actor run on the Apify platform, - they can choose to abort it gracefully, to allow the actor some time before getting terminated. - This graceful abort emits the aborting event, which you can use to clean up the actor state. + When a user aborts an Actor run on the Apify platform, + they can choose to abort it gracefully, to allow the Actor some time before getting terminated. + This graceful abort emits the aborting event, which you can use to clean up the Actor state. Args: - event_name (ActorEventTypes): The actor event for which to listen to. 
+ event_name (ActorEventTypes): The Actor event for which to listen to. listener (Callable): The function which is to be called when the event is emitted (can be async). """ self._raise_if_not_initialized() @@ -510,10 +510,10 @@ def on(self, event_name: Event, listener: Callable) -> Callable: return listener def off(self, event_name: Event, listener: Callable | None = None) -> None: - """Remove a listener, or all listeners, from an actor event. + """Remove a listener, or all listeners, from an Actor event. Args: - event_name (ActorEventTypes): The actor event for which to remove listeners. + event_name (ActorEventTypes): The Actor event for which to remove listeners. listener (Callable, optional): The listener which is supposed to be removed. If not passed, all listeners of this event are removed. """ self._raise_if_not_initialized() @@ -521,7 +521,7 @@ def off(self, event_name: Event, listener: Callable | None = None) -> None: self._event_manager.off(event=event_name, listener=listener) def is_at_home(self) -> bool: - """Return `True` when the actor is running on the Apify platform, and `False` otherwise (for example when running locally).""" + """Return `True` when the Actor is running on the Apify platform, and `False` otherwise (for example when running locally).""" return self._configuration.is_at_home def get_env(self) -> dict: @@ -566,34 +566,34 @@ async def start( wait_for_finish: int | None = None, webhooks: list[dict] | None = None, ) -> dict: - """Run an actor on the Apify platform. + """Run an Actor on the Apify platform. Unlike `Actor.call`, this method just starts the run without waiting for finish. Args: - actor_id (str): The ID of the actor to be run. - run_input (Any, optional): The input to pass to the actor run. + actor_id (str): The ID of the Actor to be run. + run_input (Any, optional): The input to pass to the Actor run. token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). content_type (str, optional): The content type of the input. - build (str, optional): Specifies the actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the actor (typically latest). + build (str, optional): Specifies the Actor build to run. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the Actor (typically latest). memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the actor. + By default, the run uses a memory limit specified in the default run configuration for the Actor. timeout (timedelta, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the actor. + By default, the run uses timeout specified in the default run configuration for the Actor. wait_for_finish (int, optional): The maximum number of seconds the server waits for the run to finish. By default, it is 0, the maximum value is 300. webhooks (list of dict, optional): Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) - associated with the actor run which can be used to receive a notification, - e.g. when the actor finished or failed. - If you already have a webhook set up for the actor or task, you do not have to add it again here. 
+ associated with the Actor run which can be used to receive a notification, + e.g. when the Actor finished or failed. + If you already have a webhook set up for the Actor or task, you do not have to add it again here. Each webhook is represented by a dictionary containing these items: * ``event_types``: list of ``WebhookEventType`` values which trigger the webhook * ``request_url``: URL to which to send the webhook HTTP request * ``payload_template`` (optional): Optional template for the request payload Returns: - dict: Info about the started actor run + dict: Info about the started Actor run """ self._raise_if_not_initialized() @@ -617,18 +617,18 @@ async def abort( status_message: str | None = None, gracefully: bool | None = None, ) -> dict: - """Abort given actor run on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable). + """Abort given Actor run on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable). Args: - run_id (str): The ID of the actor run to be aborted. + run_id (str): The ID of the Actor run to be aborted. token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - status_message (str, optional): Status message of the actor to be set on the platform. - gracefully (bool, optional): If True, the actor run will abort gracefully. + status_message (str, optional): Status message of the Actor to be set on the platform. + gracefully (bool, optional): If True, the Actor run will abort gracefully. It will send ``aborting`` and ``persistStates`` events into the run and force-stop the run after 30 seconds. It is helpful in cases where you plan to resurrect the run later. Returns: - dict: Info about the aborted actor run + dict: Info about the aborted Actor run """ self._raise_if_not_initialized() @@ -652,28 +652,28 @@ async def call( webhooks: list[dict] | None = None, wait: timedelta | None = None, ) -> dict | None: - """Start an actor on the Apify Platform and wait for it to finish before returning. + """Start an Actor on the Apify Platform and wait for it to finish before returning. It waits indefinitely, unless the wait argument is provided. Args: - actor_id (str): The ID of the actor to be run. - run_input (Any, optional): The input to pass to the actor run. + actor_id (str): The ID of the Actor to be run. + run_input (Any, optional): The input to pass to the Actor run. token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). content_type (str, optional): The content type of the input. - build (str, optional): Specifies the actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the actor (typically latest). + build (str, optional): Specifies the Actor build to run. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the Actor (typically latest). memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the actor. + By default, the run uses a memory limit specified in the default run configuration for the Actor. timeout (timedelta, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the actor. 
- webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the actor run, - which can be used to receive a notification, e.g. when the actor finished or failed. - If you already have a webhook set up for the actor, you do not have to add it again here. + By default, the run uses timeout specified in the default run configuration for the Actor. + webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, + which can be used to receive a notification, e.g. when the Actor finished or failed. + If you already have a webhook set up for the Actor, you do not have to add it again here. wait(timedelta, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. Returns: - dict: Info about the started actor run + dict: Info about the started Actor run """ self._raise_if_not_initialized() @@ -701,31 +701,31 @@ async def call_task( wait: timedelta | None = None, token: str | None = None, ) -> dict | None: - """Start an actor task on the Apify Platform and wait for it to finish before returning. + """Start an Actor task on the Apify Platform and wait for it to finish before returning. It waits indefinitely, unless the wait argument is provided. - Note that an actor task is a saved input configuration and options for an actor. - If you want to run an actor directly rather than an actor task, please use the `Actor.call` + Note that an Actor task is a saved input configuration and options for an Actor. + If you want to run an Actor directly rather than an Actor task, please use the `Actor.call` Args: - task_id (str): The ID of the actor to be run. - task_input (Any, optional): Overrides the input to pass to the actor run. + task_id (str): The ID of the Actor to be run. + task_input (Any, optional): Overrides the input to pass to the Actor run. token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). content_type (str, optional): The content type of the input. - build (str, optional): Specifies the actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the actor (typically latest). + build (str, optional): Specifies the Actor build to run. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the Actor (typically latest). memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the actor. + By default, the run uses a memory limit specified in the default run configuration for the Actor. timeout (timedelta, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the actor. - webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the actor run, - which can be used to receive a notification, e.g. when the actor finished or failed. - If you already have a webhook set up for the actor, you do not have to add it again here. + By default, the run uses timeout specified in the default run configuration for the Actor. + webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, + which can be used to receive a notification, e.g. when the Actor finished or failed. 
+ If you already have a webhook set up for the Actor, you do not have to add it again here. wait (timedelta, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. Returns: - dict: Info about the started actor run + dict: Info about the started Actor run """ self._raise_if_not_initialized() @@ -749,22 +749,19 @@ async def metamorph( content_type: str | None = None, custom_after_sleep: timedelta | None = None, ) -> None: - """Transform this actor run to an actor run of a different actor. + """Transform this Actor run to an Actor run of a different Actor. - The platform stops the current actor container and starts a new container with the new actor instead. + The platform stops the current Actor container and starts a new container with the new Actor instead. All the default storages are preserved, and the new input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. Args: - target_actor_id (str): ID of the target actor that the run should be transformed into + target_actor_id (str): ID of the target Actor that the run should be transformed into run_input (Any, optional): The input to pass to the new run. - target_actor_build (str, optional): The build of the target actor. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the target actor (typically the latest build). + target_actor_build (str, optional): The build of the target Actor. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the target Actor (typically the latest build). content_type (str, optional): The content type of the input. custom_after_sleep (timedelta, optional): How long to sleep for after the metamorph, to wait for the container to be stopped. - - Returns: - dict: The actor run data. """ self._raise_if_not_initialized() @@ -795,12 +792,12 @@ async def reboot( event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, # noqa: ARG002 custom_after_sleep: timedelta | None = None, ) -> None: - """Internally reboot this actor. + """Internally reboot this Actor. The system stops the current container and starts a new one, with the same run ID and default storages. Args: - event_listeners_timeout (timedelta, optional): How long should the actor wait for actor event listeners to finish before exiting + event_listeners_timeout (timedelta, optional): How long should the Actor wait for Actor event listeners to finish before exiting custom_after_sleep (timedelta, optional): How long to sleep for after the reboot, to wait for the container to be stopped. """ self._raise_if_not_initialized() @@ -834,14 +831,14 @@ async def add_webhook( do_not_retry: bool | None = None, idempotency_key: str | None = None, ) -> dict | None: - """Create an ad-hoc webhook for the current actor run. + """Create an ad-hoc webhook for the current Actor run. - This webhook lets you receive a notification when the actor run finished or failed. + This webhook lets you receive a notification when the Actor run finished or failed. - Note that webhooks are only supported for actors running on the Apify platform. - When running the actor locally, the function will print a warning and have no effect. + Note that webhooks are only supported for Actors running on the Apify platform. + When running the Actor locally, the function will print a warning and have no effect. 
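Pulling a few of these helpers together, a minimal sketch of registering an ad-hoc webhook for the current run and then calling another Actor (the Actor ID, input and webhook URL are placeholders):

```python
from apify import Actor
from apify_shared.consts import WebhookEventType


async def main() -> None:
    async with Actor:
        # Get notified when the current run finishes or fails.
        await Actor.add_webhook(
            event_types=[WebhookEventType.ACTOR_RUN_SUCCEEDED, WebhookEventType.ACTOR_RUN_FAILED],
            request_url='https://example.com/run-finished',
        )

        # Start another Actor and wait for it to finish before continuing.
        run = await Actor.call('apify/hello-world', run_input={'greeting': 'hello'})
        if run is not None:
            Actor.log.info(f'Child run finished with status {run["status"]}')
```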
- For more information about Apify actor webhooks, please see the [documentation](https://docs.apify.com/webhooks). + For more information about Apify Actor webhooks, please see the [documentation](https://docs.apify.com/webhooks). Args: event_types (list of WebhookEventType): List of event types that should trigger the webhook. At least one is required. @@ -882,14 +879,14 @@ async def set_status_message( *, is_terminal: bool | None = None, ) -> dict | None: - """Set the status message for the current actor run. + """Set the status message for the current Actor run. Args: status_message (str): The status message to set to the run. is_terminal (bool, optional): Set this flag to True if this is the final status message of the Actor run. Returns: - dict: The updated actor run object + dict: The updated Actor run object """ self._raise_if_not_initialized() @@ -924,7 +921,7 @@ async def create_proxy_configuration( For more details and code examples, see the `ProxyConfiguration` class. Args: - actor_proxy_input (dict, optional): Proxy configuration field from the actor input, if input has such input field. + actor_proxy_input (dict, optional): Proxy configuration field from the Actor input, if input has such input field. If you pass this argument, all the other arguments will be inferred from it. password (str, optional): Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. groups (list of str, optional): Proxy groups which the Apify Proxy should use, if provided. @@ -975,4 +972,4 @@ def _get_default_instance() -> _ActorType: Actor = cast(_ActorType, LocalProxy(_get_default_instance)) -"""The entry point of the SDK, through which all the actor operations should be done.""" +"""The entry point of the SDK, through which all the Actor operations should be done.""" diff --git a/src/apify/config.py b/src/apify/config.py index 115764c0..610f84e8 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -11,7 +11,7 @@ class Configuration(CrawleeConfiguration): - """A class for specifying the configuration of an actor. + """A class for specifying the configuration of an Actor. Can be used either globally via `Configuration.get_global_configuration()`, or it can be specific to each `Actor` instance on the `actor.config` property. @@ -184,7 +184,7 @@ class Configuration(CrawleeConfiguration): def get_global_configuration(cls) -> Self: """Retrive the global configuration. - The global configuration applies when you call actor methods via their static versions, e.g. `Actor.init()`. + The global configuration applies when you call Actor methods via their static versions, e.g. `Actor.init()`. Also accessible via `Actor.config`. """ if CrawleeConfiguration._default_instance is None: diff --git a/src/apify/event_manager.py b/src/apify/event_manager.py index 29465e4d..e13a8413 100644 --- a/src/apify/event_manager.py +++ b/src/apify/event_manager.py @@ -114,7 +114,7 @@ class UnknownEvent(BaseModel): @ignore_docs class PlatformEventManager(EventManager): - """A class for managing actor events. + """A class for managing Actor events. You shouldn't use this class directly, but instead use it via the `Actor.on()` and `Actor.off()` methods. @@ -129,7 +129,7 @@ def __init__(self, config: Configuration, **kwargs: Unpack[EventManagerOptions]) """Create an instance of the EventManager. Args: - config (Configuration): The actor configuration to be used in this event manager. + config (Configuration): The Actor configuration to be used in this event manager. 
kwargs (EventManagerOptions): Event manager options - forwarded to the base class """ super().__init__(**kwargs) @@ -194,7 +194,7 @@ async def _process_platform_messages(self, ws_url: str) -> None: await self._emit_persist_state_event_rec_task.stop() self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True)) except Exception: - logger.exception('Cannot parse actor event', extra={'message': message}) + logger.exception('Cannot parse Actor event', extra={'message': message}) except Exception: logger.exception('Error in websocket connection') self._connected_to_platform_websocket.set_result(False) diff --git a/tests/integration/README.md b/tests/integration/README.md index fb86f84b..331acad1 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -1,7 +1,7 @@ Integration tests ================= -We have integration tests which build and run actors using the Python SDK on the Apify Platform. +We have integration tests which build and run Actors using the Python SDK on the Apify Platform. To run these tests, you need to set the `APIFY_TEST_USER_API_TOKEN` environment variable to the API token of the Apify user you want to use for the tests, and then start them with `make integration-tests`. @@ -25,20 +25,20 @@ async def test_something(apify_client_async: ApifyClientAsync) -> None: ### `make_actor` -This fixture returns a factory function for creating actors on the Apify Platform. +This fixture returns a factory function for creating Actors on the Apify Platform. -For the actor source, the fixture takes the files from `tests/integration/actor_source_base`, +For the Actor source, the fixture takes the files from `tests/integration/actor_source_base`, builds the Apify SDK wheel from the current codebase, -and adds the actor source you passed to the fixture as an argument. +and adds the Actor source you passed to the fixture as an argument. You have to pass exactly one of the `main_func`, `main_py` and `source_files` arguments. -The created actor will be uploaded to the platform, built there, and after the test finishes, it will be automatically deleted. -If the actor build fails, it will not be deleted, so that you can check why the build failed. +The created Actor will be uploaded to the platform, built there, and after the test finishes, it will be automatically deleted. +If the Actor build fails, it will not be deleted, so that you can check why the build failed. -### Creating test actor straight from a Python function +### Creating test Actor straight from a Python function -You can create actors straight from a Python function. -This is great because you can have the test actor source code checked with the linter. +You can create Actors straight from a Python function. +This is great because you can have the test Actor source code checked with the linter. ```python async def test_something(self, make_actor: ActorFactory) -> None: @@ -54,7 +54,7 @@ async def test_something(self, make_actor: ActorFactory) -> None: assert run_result['status'] == 'SUCCEEDED' ``` -These actors will have the `src/main.py` file set to the `main` function definition, +These Actors will have the `src/main.py` file set to the `main` function definition, prepended with `import asyncio` and `from apify import Actor`, for your convenience. 
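For illustration, a test Actor created from a `main` function ends up with a generated `src/main.py` roughly along these lines (a sketch only; the exact file is assembled by the fixture, and the function body is whatever you pass in):

```python
import asyncio

from apify import Actor


async def main():
    async with Actor:
        print('Hello!')  # body of the `main` function passed to `make_actor`
```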
You can also pass extra imports directly to the main function: @@ -65,7 +65,7 @@ async def test_something(self, make_actor: ActorFactory) -> None: import os from apify_shared.consts import ActorEventTypes, ActorEnvVars async with Actor: - print('The actor is running with ' + os.getenv(ActorEnvVars.MEMORY_MBYTES) + 'MB of memory') + print('The Actor is running with ' + os.getenv(ActorEnvVars.MEMORY_MBYTES) + 'MB of memory') await Actor.on(ActorEventTypes.SYSTEM_INFO, lambda event_data: print(event_data)) actor = await make_actor('something', main_func=main) @@ -76,10 +76,10 @@ async def test_something(self, make_actor: ActorFactory) -> None: assert run_result['status'] == 'SUCCEEDED' ``` -### Creating actor from source files +### Creating Actor from source files You can also pass the source files directly if you need something more complex -(e.g. pass some fixed value to the actor source code or use multiple source files). +(e.g. pass some fixed value to the Actor source code or use multiple source files). To pass the source code of the `src/main.py` file directly, use the `main_py` argument to `make_actor`: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 2364e848..4dd6021a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -88,7 +88,7 @@ def sdk_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) - @pytest.fixture(scope='session') def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]: - """Create a dictionary of the base source files for a testing actor. + """Create a dictionary of the base source files for a testing Actor. It takes the files from `tests/integration/actor_source_base`, builds the Apify SDK wheel from the current codebase, @@ -135,7 +135,7 @@ def __call__( @pytest.fixture() async def make_actor(actor_base_source_files: dict[str, str | bytes], apify_client_async: ApifyClientAsync) -> AsyncIterator[ActorFactory]: - """A fixture for returning a temporary actor factory.""" + """A fixture for returning a temporary Actor factory.""" actor_clients_for_cleanup: list[ActorClientAsync] = [] async def _make_actor( @@ -145,20 +145,20 @@ async def _make_actor( main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, ) -> ActorClientAsync: - """Create a temporary actor from the given main function or source file(s). + """Create a temporary Actor from the given main function or source file(s). - The actor will be uploaded to the Apify Platform, built there, and after the test finishes, it will be automatically deleted. + The Actor will be uploaded to the Apify Platform, built there, and after the test finishes, it will be automatically deleted. You have to pass exactly one of the `main_func`, `main_py` and `source_files` arguments. Args: - actor_label (str): The label which will be a part of the generated actor name - main_func (Callable, optional): The main function of the actor. - main_py (str, optional): The `src/main.py` file of the actor. - source_files (dict, optional): A dictionary of the source files of the actor. + actor_label (str): The label which will be a part of the generated Actor name + main_func (Callable, optional): The main function of the Actor. + main_py (str, optional): The `src/main.py` file of the Actor. + source_files (dict, optional): A dictionary of the source files of the Actor. Returns: - ActorClientAsync: A resource client for the created actor. + ActorClientAsync: A resource client for the created Actor. 
""" if not (main_func or main_py or source_files): raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified') @@ -213,7 +213,7 @@ async def _make_actor( } ) - print(f'Creating actor {actor_name}...') + print(f'Creating Actor {actor_name}...') created_actor = await apify_client_async.actors().create( name=actor_name, default_run_build='latest', @@ -231,7 +231,7 @@ async def _make_actor( actor_client = apify_client_async.actor(created_actor['id']) - print(f'Building actor {actor_name}...') + print(f'Building Actor {actor_name}...') build = await actor_client.build(version_number='0.0', wait_for_finish=300) assert build['status'] == ActorJobStatus.SUCCEEDED diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index 3f05b9ba..589528fa 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -308,7 +308,7 @@ async def main_outer() -> None: # This should not be called await Actor.set_value('RECORD_AFTER_METAMORPH_CALL', 'dummy') - raise AssertionError('The actor should have been metamorphed by now') + raise AssertionError('The Actor should have been metamorphed by now') inner_actor = await make_actor('metamorph-inner', main_func=main_inner) outer_actor = await make_actor('metamorph-outer', main_func=main_outer) @@ -329,7 +329,7 @@ async def main_outer() -> None: assert await outer_run_key_value_store.get_record('RECORD_AFTER_METAMORPH_CALL') is None - # After metamorph, the run still belongs to the original actor, so the inner one should have no runs + # After metamorph, the run still belongs to the original Actor, so the inner one should have no runs assert await inner_actor.last_run().get() is None diff --git a/tests/integration/test_actor_lifecycle.py b/tests/integration/test_actor_lifecycle.py index 4b2545dd..e497116c 100644 --- a/tests/integration/test_actor_lifecycle.py +++ b/tests/integration/test_actor_lifecycle.py @@ -19,14 +19,14 @@ async def main() -> None: await my_actor.init() double_init = True except RuntimeError as err: - assert str(err) == 'The actor was already initialized!' # noqa: PT017 + assert str(err) == 'The Actor was already initialized!' # noqa: PT017 except Exception: raise try: await Actor.init() double_init = True except RuntimeError as err: - assert str(err) == 'The actor was already initialized!' # noqa: PT017 + assert str(err) == 'The Actor was already initialized!' # noqa: PT017 except Exception: raise await my_actor.exit() diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index fee29ac7..715d00c8 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -72,7 +72,7 @@ async def main() -> None: assert run_log_lines.pop(0).startswith('ACTOR: Pulling Docker image') assert run_log_lines.pop(0) == 'ACTOR: Creating Docker container.' assert run_log_lines.pop(0) == 'ACTOR: Starting Docker container.' - assert run_log_lines.pop(0) == '[apify] INFO Initializing actor...' + assert run_log_lines.pop(0) == '[apify] INFO Initializing Actor...' 
assert run_log_lines.pop(0).startswith(f'[apify] INFO System info ({{"apify_sdk_version": "{__version__}", "apify_client_version": "') assert run_log_lines.pop(0) == '[apify] DEBUG Debug message' assert run_log_lines.pop(0) == '[apify] INFO Info message' @@ -92,4 +92,4 @@ async def main() -> None: assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 44, in main' assert run_log_lines.pop(0) == " raise RuntimeError('Dummy RuntimeError')" assert run_log_lines.pop(0) == ' RuntimeError: Dummy RuntimeError' - assert run_log_lines.pop(0) == '[apify] INFO Exiting actor ({"exit_code": 91})' + assert run_log_lines.pop(0) == '[apify] INFO Exiting Actor ({"exit_code": 91})' diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index 9983bd45..d9496b20 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -155,7 +155,7 @@ async def actor_function() -> str: class TestMigratingEvent: async def test_migrating_event(self: TestMigratingEvent, monkeypatch: pytest.MonkeyPatch) -> None: # This should test whether when you get a MIGRATING event, - # the actor automatically emits the PERSIST_STATE event with data `{'isMigrating': True}` + # the Actor automatically emits the PERSIST_STATE event with data `{'isMigrating': True}` monkeypatch.setenv(ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS, '500') monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, '1') diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index dc4dd940..599e8b3d 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -44,7 +44,7 @@ async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture, m assert len(caplog.records) == 12 assert caplog.records[0].levelno == logging.INFO - assert caplog.records[0].message == 'Initializing actor...' + assert caplog.records[0].message == 'Initializing Actor...' assert caplog.records[1].levelno == logging.INFO assert caplog.records[1].message == 'System info' @@ -86,7 +86,7 @@ async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture, m assert str(caplog.records[9].exc_info[1]) == 'Dummy RuntimeError' assert caplog.records[10].levelno == logging.INFO - assert caplog.records[10].message == 'Exiting actor' + assert caplog.records[10].message == 'Exiting Actor' assert caplog.records[11].levelno == logging.DEBUG - assert caplog.records[11].message == 'Not calling sys.exit(91) because actor is running in an unit test' + assert caplog.records[11].message == 'Not calling sys.exit(91) because Actor is running in an unit test' diff --git a/website/src/pages/index.js b/website/src/pages/index.js index f98dca94..8877d47a 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -15,10 +15,10 @@ function Hero() {

-                        Apify SDK for Python is a toolkit for building actors
+                        Apify SDK for Python is a toolkit for building Actors

-                        Apify SDK for Python is a toolkit for building actors
+                        Apify SDK for Python is a toolkit for building Actors

@@ -27,7 +27,7 @@ function Hero() {

The Apify SDK for Python is the official library for creating Apify Actors in Python. - It provides useful features like actor lifecycle management, local storage emulation, and actor event handling. + It provides useful features like Actor lifecycle management, local storage emulation, and Actor event handling.
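A minimal sketch of that lifecycle management and event handling in practice (the event types come from Crawlee, using the import path seen elsewhere in this series; the listener is a placeholder):

```python
from apify import Actor
from crawlee.events.types import Event, EventPersistStateData


async def main() -> None:
    async with Actor:  # Actor.init() on enter, Actor.exit() / Actor.fail() on exit
        def save_state(event_data: EventPersistStateData) -> None:
            Actor.log.info(f'Persisting state (is_migrating={event_data.is_migrating})')

        # Called periodically, and also right before the run migrates to another server.
        Actor.on(Event.PERSIST_STATE, save_state)
```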

@@ -66,8 +66,8 @@ export default function Home() {

- For example, the Apify SDK makes it easy to read the actor input with the Actor.get_input() method, - and to save scraped data from your actors to a dataset + For example, the Apify SDK makes it easy to read the Actor input with the Actor.get_input() method, + and to save scraped data from your Actors to a dataset {' '}by simply using the Actor.push_data() method.
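In code, that input-to-dataset flow is just a couple of calls (the field names below are placeholders):

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the Actor input from the default key-value store of the run.
        actor_input = await Actor.get_input() or {}
        url = actor_input.get('url', 'https://example.com')

        # ... scrape the page here ...

        # Save the scraped record to the default dataset.
        await Actor.push_data({'url': url, 'title': 'Example Domain'})
```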

From 10bb6d5d9394a2e58c5822d21e4c1e6d8554530a Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 15 Aug 2024 13:07:04 +0200 Subject: [PATCH 50/68] Update to work with a future Crawlee --- pyproject.toml | 2 +- src/apify/actor.py | 45 +++++++++++-------- .../request_queue_client.py | 15 ------- src/apify/config.py | 15 +------ src/apify/scrapy/__init__.py | 2 +- src/apify/scrapy/scheduler.py | 11 ++++- src/apify/scrapy/utils.py | 35 --------------- tests/integration/conftest.py | 14 +++--- tests/unit/conftest.py | 13 +++--- 9 files changed, 51 insertions(+), 101 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e97faf33..c747a02a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", - "crawlee >= 0.2.0", + "crawlee >= 0.3.0", "cryptography >= 39.0.0", "httpx >= 0.24.0", "psutil >= 5.9.0", diff --git a/src/apify/actor.py b/src/apify/actor.py index fa11973c..0f2f4050 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -10,8 +10,8 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value +from crawlee import service_container from crawlee.events.types import Event, EventPersistStateData -from crawlee.storage_client_manager import StorageClientManager from pydantic import AliasChoices from typing_extensions import Self from werkzeug.local import LocalProxy @@ -55,18 +55,25 @@ def __init__(self, config: Configuration | None = None) -> None: self._configuration = config or Configuration.get_global_configuration() self._apify_client = self.new_client() + if self._configuration.token: + service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration)) + self._event_manager: EventManager if self._configuration.is_at_home: + service_container.set_default_storage_client_type('cloud') self._event_manager = PlatformEventManager( config=self._configuration, persist_state_interval=self._configuration.persist_state_interval, ) else: + service_container.set_default_storage_client_type('local') self._event_manager = LocalEventManager( system_info_interval=self._configuration.system_info_interval, persist_state_interval=self._configuration.persist_state_interval, ) + service_container.set_event_manager(self._event_manager) + self._is_initialized = False @ignore_docs @@ -158,9 +165,6 @@ async def init(self) -> None: # TODO: Print outdated SDK version warning (we need a new env var for this) # https://github.com/apify/apify-sdk-python/issues/146 - if self._configuration.token: - StorageClientManager.set_cloud_client(ApifyStorageClient(configuration=self._configuration)) - await self._event_manager.__aenter__() self._is_initialized = True @@ -346,11 +350,12 @@ async def open_dataset( """ self._raise_if_not_initialized() - configuration_updates = {} - if force_cloud or self._configuration.is_at_home: - configuration_updates['in_cloud'] = True - - return await Dataset.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) + return await Dataset.open( + id=id, + name=name, + configuration=self._configuration, + storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + ) async def open_key_value_store( self, @@ -378,11 +383,12 @@ async def open_key_value_store( """ self._raise_if_not_initialized() - configuration_updates = {} - if 
force_cloud or self._configuration.is_at_home: - configuration_updates['in_cloud'] = True - - return await KeyValueStore.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) + return await KeyValueStore.open( + id=id, + name=name, + configuration=self._configuration, + storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + ) async def open_request_queue( self, @@ -411,11 +417,12 @@ async def open_request_queue( """ self._raise_if_not_initialized() - configuration_updates = {} - if force_cloud or self._configuration.is_at_home: - configuration_updates['in_cloud'] = True - - return await RequestQueue.open(id=id, name=name, configuration=self._configuration.model_copy(update=configuration_updates)) + return await RequestQueue.open( + id=id, + name=name, + configuration=self._configuration, + storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + ) async def push_data(self, data: Any) -> None: """Store an object or a list of objects to the default dataset of the current Actor run. diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py index 5d2d83cc..b78412d9 100644 --- a/src/apify/apify_storage_client/request_queue_client.py +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -8,7 +8,6 @@ ProcessedRequest, ProlongRequestLockResponse, Request, - RequestListResponse, RequestQueueHead, RequestQueueHeadWithLocks, RequestQueueMetadata, @@ -186,17 +185,3 @@ async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsO ], ) ) - - @override - async def list_requests( - self, - *, - limit: int | None = None, - exclusive_start_id: str | None = None, - ) -> RequestListResponse: - return RequestListResponse.model_validate( - await self._client.list_requests( - limit=limit, - exclusive_start_id=exclusive_start_id, - ) - ) diff --git a/src/apify/config.py b/src/apify/config.py index 610f84e8..f695007b 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -2,12 +2,11 @@ from __future__ import annotations from datetime import datetime, timedelta -from typing import Annotated, cast +from typing import Annotated from crawlee._utils.models import timedelta_ms from crawlee.configuration import Configuration as CrawleeConfiguration from pydantic import AliasChoices, BeforeValidator, Field -from typing_extensions import Self class Configuration(CrawleeConfiguration): @@ -180,18 +179,6 @@ class Configuration(CrawleeConfiguration): workflow_key: Annotated[str | None, Field(alias='apify_workflow_key')] = None - @classmethod - def get_global_configuration(cls) -> Self: - """Retrive the global configuration. - - The global configuration applies when you call Actor methods via their static versions, e.g. `Actor.init()`. - Also accessible via `Actor.config`. 
- """ - if CrawleeConfiguration._default_instance is None: - CrawleeConfiguration._default_instance = cls() - - return cast(Self, CrawleeConfiguration._default_instance) - # Monkey-patch the base class so that it works with the extended configuration CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore diff --git a/src/apify/scrapy/__init__.py b/src/apify/scrapy/__init__.py index 70ee1cfb..717873ce 100644 --- a/src/apify/scrapy/__init__.py +++ b/src/apify/scrapy/__init__.py @@ -1,3 +1,3 @@ from .requests import to_apify_request, to_scrapy_request from .scheduler import ApifyScheduler -from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client +from .utils import get_basic_auth_header, get_running_event_loop_id diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 1e3c8323..67f09305 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -2,6 +2,9 @@ import traceback +from apify.apify_storage_client.apify_storage_client import ApifyStorageClient +from apify.config import Configuration + try: from scrapy import Spider from scrapy.core.scheduler import BaseScheduler @@ -16,7 +19,7 @@ from apify.actor import Actor from apify.scrapy.requests import to_apify_request, to_scrapy_request -from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client +from apify.scrapy.utils import nested_event_loop from apify.storages import RequestQueue @@ -45,8 +48,12 @@ def open(self: ApifyScheduler, spider: Spider) -> None: # this has to be named """ self.spider = spider + async def open_queue() -> RequestQueue: + custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration()) + return await RequestQueue.open(storage_client=custom_loop_apify_client) + try: - self._rq = nested_event_loop.run_until_complete(open_queue_with_custom_client()) + self._rq = nested_event_loop.run_until_complete(open_queue()) except BaseException: traceback.print_exc() raise diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index b1658b65..dbd43a2b 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -13,15 +13,6 @@ 'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run "pip install apify[scrapy]".', ) from exc -from typing import TYPE_CHECKING - -from crawlee.storage_client_manager import StorageClientManager - -from apify import Actor, Configuration -from apify.apify_storage_client.apify_storage_client import ApifyStorageClient - -if TYPE_CHECKING: - from crawlee.storages import RequestQueue nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop() @@ -78,29 +69,3 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict settings['APIFY_PROXY_SETTINGS'] = proxy_config return settings - - -async def open_queue_with_custom_client() -> RequestQueue: - """Open a Request Queue with custom Apify Client. 
- - TODO: add support for custom client to Actor.open_request_queue(), so that - we don't have to do this hacky workaround - """ - # Create a new Apify Client with its httpx client in the custom event loop - custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration()) - - # Set the new Apify Client as the default client, back up the old client - old_client = StorageClientManager._cloud_client - StorageClientManager.set_cloud_client(custom_loop_apify_client) - - # Create a new Request Queue in the custom event loop, - # replace its Apify client with the custom loop's Apify client - rq = await Actor.open_request_queue() - - if Actor.config.is_at_home: - rq._resource_client = custom_loop_apify_client.request_queue(rq._id) - - # Restore the old Apify Client as the default client - if old_client: - StorageClientManager.set_cloud_client(old_client) - return rq diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4dd6021a..1a0e52f6 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -7,13 +7,11 @@ import sys import textwrap from pathlib import Path -from typing import TYPE_CHECKING, Callable, Protocol +from typing import TYPE_CHECKING, Callable, Protocol, cast import pytest from apify_client import ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType -from crawlee.configuration import Configuration -from crawlee.storage_client_manager import StorageClientManager from filelock import FileLock import apify.actor @@ -32,11 +30,13 @@ # To isolate the tests, we need to reset the used singletons before each test case # We also patch the default storage client with a tmp_path @pytest.fixture(autouse=True) -def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(Configuration, '_default_instance', None) - monkeypatch.setattr(StorageClientManager, '_cloud_client', None) +def _reset_and_patch_default_instances() -> None: + from crawlee import service_container + + cast(dict, service_container._services).clear() apify.actor._default_instance = None - # TODO: StorageClientManager local client purge # noqa: TD003 + + # TODO: StorageClientManager local storage client purge # noqa: TD003 # This fixture can't be session-scoped, diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2b6b9d8c..32fe9331 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -4,14 +4,13 @@ import inspect from collections import defaultdict from copy import deepcopy -from typing import TYPE_CHECKING, Any, Callable, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, cast, get_type_hints import pytest from apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient -from crawlee.storage_client_manager import StorageClientManager import apify.actor @@ -20,7 +19,7 @@ @pytest.fixture() -def reset_default_instances(monkeypatch: pytest.MonkeyPatch) -> Callable[[], None]: +def reset_default_instances() -> Callable[[], None]: def reset() -> None: from crawlee.storages._creation_management import ( _cache_dataset_by_id, @@ -38,12 +37,12 @@ def reset() -> None: _cache_rq_by_id.clear() _cache_rq_by_name.clear() - monkeypatch.setattr(CrawleeConfiguration, '_default_instance', None) - monkeypatch.setattr(StorageClientManager, '_cloud_client', None) - 
monkeypatch.setattr(StorageClientManager, '_local_client', MemoryStorageClient()) + from crawlee import service_container + + cast(dict, service_container._services).clear() apify.actor._default_instance = None - # TODO: StorageClientManager local client purge # noqa: TD003 + # TODO: local storage client purge # noqa: TD003 return reset From 391c8ded7e653bc771d70ca7fbbc9c431ba727b7 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 15 Aug 2024 15:16:45 +0200 Subject: [PATCH 51/68] Clear services on actor exit --- src/apify/actor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/apify/actor.py b/src/apify/actor.py index 0f2f4050..882d3d3f 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -210,6 +210,7 @@ async def finalize() -> None: await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout) await self._event_manager.__aexit__(None, None, None) + cast(dict, service_container._services).clear() await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) self._is_initialized = False From 1650b7a460275ed33242186adaee41e794266418 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 22 Aug 2024 16:03:52 +0200 Subject: [PATCH 52/68] Remove Actor.main --- src/apify/actor.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 882d3d3f..9b0c6fec 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -250,45 +250,6 @@ async def fail( await self.exit(exit_code=exit_code, status_message=status_message) - async def main(self, main_actor_function: Callable[[], MainReturnType]) -> MainReturnType | None: - """Initialize the Actor, run the passed function and finish the Actor cleanly. - - **The `Actor.main()` function is optional** and is provided merely for your convenience. - It is mainly useful when you're running your code as an Actor on the [Apify platform](https://apify.com/actors). - - The `Actor.main()` function performs the following actions: - - - When running on the Apify platform (i.e. `APIFY_IS_AT_HOME` environment variable is set), - it sets up a connection to listen for platform events. - For example, to get a notification about an imminent migration to another server. - - It invokes the user function passed as the `main_actor_function` parameter. - - If the user function was an async function, it awaits it. - - If the user function throws an exception or some other error is encountered, - it prints error details to console so that they are stored to the log, - and finishes the Actor cleanly. - - Finally, it exits the Python process, with zero exit code on success and non-zero on errors. 
- - Args: - main_actor_function (Callable): The user function which should be run in the Actor - """ - if not inspect.isfunction(main_actor_function): - raise TypeError(f'First argument passed to Actor.main() must be a function, but instead it was {type(main_actor_function)}') - - await self.init() - try: - if inspect.iscoroutinefunction(main_actor_function): - res = await main_actor_function() - else: - res = main_actor_function() - await self.exit() - return cast(MainReturnType, res) - except Exception as exc: - await self.fail( - exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value, - exception=exc, - ) - return None - def new_client( self, *, From 0690a7fd08ab99c7dce1aff5f0142c14a756d6ce Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 23 Aug 2024 14:08:20 +0200 Subject: [PATCH 53/68] Update log format in test --- tests/integration/test_actor_log.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index 715d00c8..7077eaf4 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -84,9 +84,9 @@ async def main() -> None: assert run_log_lines.pop(0) == " raise ValueError('Dummy ValueError')" assert run_log_lines.pop(0) == ' ValueError: Dummy ValueError' assert run_log_lines.pop(0) == '[apify] INFO Multi' - assert run_log_lines.pop(0) == ' line' - assert run_log_lines.pop(0) == ' log' - assert run_log_lines.pop(0) == ' message' + assert run_log_lines.pop(0) == 'line' + assert run_log_lines.pop(0) == 'log' + assert run_log_lines.pop(0) == 'message' assert run_log_lines.pop(0) == '[apify] ERROR Actor failed with an exception' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 44, in main' From 16cdd9332bc9968cb8e60147363e9d1703089865 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 23 Aug 2024 14:08:51 +0200 Subject: [PATCH 54/68] Exclude new Request fields from API request bodies --- src/apify/apify_storage_client/request_queue_client.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py index b78412d9..ce8737a4 100644 --- a/src/apify/apify_storage_client/request_queue_client.py +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -81,6 +81,8 @@ async def add_request( 'id', 'json_', 'order_no', + 'query_params', + 'data', }, ), forefront=forefront, @@ -107,6 +109,8 @@ async def update_request( exclude={ 'json_', 'order_no', + 'query_params', + 'data', }, ), forefront=forefront, @@ -161,6 +165,8 @@ async def batch_add_requests( 'id', 'json_', 'order_no', + 'query_params', + 'data', }, ) for r in requests @@ -179,6 +185,8 @@ async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsO exclude={ 'json_', 'order_no', + 'query_params', + 'data', }, ) for r in requests From 1edbbfdd8b6814d652726b9f28baa22fe142e31a Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 23 Aug 2024 14:22:02 +0200 Subject: [PATCH 55/68] Keep a client_key --- src/apify/apify_storage_client/apify_storage_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/apify_storage_client.py index 5aba8057..657eb8c4 100644 --- a/src/apify/apify_storage_client/apify_storage_client.py +++ b/src/apify/apify_storage_client/apify_storage_client.py 
@@ -1,4 +1,5 @@ from apify_client import ApifyClientAsync +from crawlee._utils.crypto import crypto_random_object_id from crawlee.base_storage_client.base_storage_client import BaseStorageClient from typing_extensions import override @@ -15,6 +16,7 @@ class ApifyStorageClient(BaseStorageClient): """A storage client implementation based on the Apify platform storage.""" def __init__(self, *, configuration: Configuration) -> None: + self._client_key = crypto_random_object_id() self._apify_client = ApifyClientAsync( token=configuration.token, api_url=configuration.api_base_url, @@ -42,7 +44,7 @@ def key_value_stores(self) -> KeyValueStoreCollectionClient: @override def request_queue(self, id: str) -> RequestQueueClient: - return RequestQueueClient(self._apify_client.request_queue(id)) + return RequestQueueClient(self._apify_client.request_queue(id, client_key=self._client_key)) @override def request_queues(self) -> RequestQueueCollectionClient: From 1effda867787fa2b308afafaad925fb979652c59 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 14:33:25 +0200 Subject: [PATCH 56/68] Update for compatibility with Crawlee 0.3 --- src/apify/actor.py | 7 ++- .../apify_storage_client.py | 2 +- .../apify_storage_client/dataset_client.py | 7 ++- .../dataset_collection_client.py | 3 +- .../key_value_store_client.py | 3 +- .../key_value_store_collection_client.py | 3 +- .../request_queue_client.py | 6 +-- .../request_queue_collection_client.py | 3 +- src/apify/event_manager.py | 6 +-- src/apify/log.py | 2 +- src/apify/proxy_configuration.py | 6 +-- src/apify/scrapy/requests.py | 4 +- src/apify/storages/__init__.py | 6 +-- tests/integration/test_actor_events.py | 4 +- tests/integration/test_actor_lifecycle.py | 44 ------------------ tests/integration/test_actor_request_queue.py | 2 +- tests/unit/actor/test_actor_dataset.py | 2 +- .../unit/actor/test_actor_key_value_store.py | 2 +- tests/unit/actor/test_actor_lifecycle.py | 45 +------------------ tests/unit/conftest.py | 2 +- .../scrapy/requests/test_to_scrapy_request.py | 2 +- tests/unit/test_event_manager.py | 2 +- 22 files changed, 35 insertions(+), 128 deletions(-) diff --git a/src/apify/actor.py b/src/apify/actor.py index 9b0c6fec..836e8839 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import inspect import os import sys from datetime import timedelta @@ -11,7 +10,7 @@ from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value from crawlee import service_container -from crawlee.events.types import Event, EventPersistStateData +from crawlee.events._types import Event, EventPersistStateData from pydantic import AliasChoices from typing_extensions import Self from werkzeug.local import LocalProxy @@ -30,7 +29,7 @@ import logging from types import TracebackType - from crawlee.proxy_configuration import NewUrlFunction + from crawlee.proxy_configuration import _NewUrlFunction MainReturnType = TypeVar('MainReturnType') @@ -880,7 +879,7 @@ async def create_proxy_configuration( groups: list[str] | None = None, country_code: str | None = None, proxy_urls: list[str] | None = None, - new_url_function: NewUrlFunction | None = None, + new_url_function: _NewUrlFunction | None = None, ) -> ProxyConfiguration | None: """Create a ProxyConfiguration object with the passed proxy configuration. 
diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/apify_storage_client.py index 657eb8c4..b6198a6c 100644 --- a/src/apify/apify_storage_client/apify_storage_client.py +++ b/src/apify/apify_storage_client/apify_storage_client.py @@ -1,6 +1,6 @@ from apify_client import ApifyClientAsync from crawlee._utils.crypto import crypto_random_object_id -from crawlee.base_storage_client.base_storage_client import BaseStorageClient +from crawlee.base_storage_client import BaseStorageClient from typing_extensions import override from .dataset_client import DatasetClient diff --git a/src/apify/apify_storage_client/dataset_client.py b/src/apify/apify_storage_client/dataset_client.py index 6d817997..abcdb769 100644 --- a/src/apify/apify_storage_client/dataset_client.py +++ b/src/apify/apify_storage_client/dataset_client.py @@ -2,8 +2,7 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client.base_dataset_client import BaseDatasetClient -from crawlee.models import DatasetItemsListPage, DatasetMetadata +from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata from typing_extensions import override if TYPE_CHECKING: @@ -11,7 +10,7 @@ from contextlib import AbstractAsyncContextManager from apify_client.clients import DatasetClientAsync - from crawlee.types import JSONSerializable + from crawlee._types import JsonSerializable from httpx import Response @@ -179,7 +178,7 @@ async def stream_items( ) @override - async def push_items(self, items: JSONSerializable) -> None: + async def push_items(self, items: JsonSerializable) -> None: await self._client.push_items( items=items, ) diff --git a/src/apify/apify_storage_client/dataset_collection_client.py b/src/apify/apify_storage_client/dataset_collection_client.py index 7c0da6f5..1cfc2de3 100644 --- a/src/apify/apify_storage_client/dataset_collection_client.py +++ b/src/apify/apify_storage_client/dataset_collection_client.py @@ -2,8 +2,7 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client.base_dataset_collection_client import BaseDatasetCollectionClient -from crawlee.models import DatasetListPage, DatasetMetadata +from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata from typing_extensions import override if TYPE_CHECKING: diff --git a/src/apify/apify_storage_client/key_value_store_client.py b/src/apify/apify_storage_client/key_value_store_client.py index 6e4f4721..bf74eb9b 100644 --- a/src/apify/apify_storage_client/key_value_store_client.py +++ b/src/apify/apify_storage_client/key_value_store_client.py @@ -3,8 +3,7 @@ from contextlib import asynccontextmanager from typing import TYPE_CHECKING, Any -from crawlee.base_storage_client.base_key_value_store_client import BaseKeyValueStoreClient -from crawlee.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord +from crawlee.base_storage_client import BaseKeyValueStoreClient, KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord from typing_extensions import override if TYPE_CHECKING: diff --git a/src/apify/apify_storage_client/key_value_store_collection_client.py b/src/apify/apify_storage_client/key_value_store_collection_client.py index d0316d9d..12c0c499 100644 --- a/src/apify/apify_storage_client/key_value_store_collection_client.py +++ b/src/apify/apify_storage_client/key_value_store_collection_client.py @@ -2,8 +2,7 @@ from typing import TYPE_CHECKING -from 
crawlee.base_storage_client.base_key_value_store_collection_client import BaseKeyValueStoreCollectionClient -from crawlee.models import KeyValueStoreListPage, KeyValueStoreMetadata +from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata from typing_extensions import override if TYPE_CHECKING: diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py index ce8737a4..dfcc77a1 100644 --- a/src/apify/apify_storage_client/request_queue_client.py +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -2,12 +2,12 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client.base_request_queue_client import BaseRequestQueueClient -from crawlee.models import ( +from crawlee import Request +from crawlee.base_storage_client import ( + BaseRequestQueueClient, BatchRequestsOperationResponse, ProcessedRequest, ProlongRequestLockResponse, - Request, RequestQueueHead, RequestQueueHeadWithLocks, RequestQueueMetadata, diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/request_queue_collection_client.py index a513846b..1a376d36 100644 --- a/src/apify/apify_storage_client/request_queue_collection_client.py +++ b/src/apify/apify_storage_client/request_queue_collection_client.py @@ -2,8 +2,7 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client.base_request_queue_collection_client import BaseRequestQueueCollectionClient -from crawlee.models import RequestQueueListPage, RequestQueueMetadata +from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata from typing_extensions import override if TYPE_CHECKING: diff --git a/src/apify/event_manager.py b/src/apify/event_manager.py index e13a8413..1d7a8315 100644 --- a/src/apify/event_manager.py +++ b/src/apify/event_manager.py @@ -6,9 +6,9 @@ import websockets.client from apify_shared.utils import ignore_docs -from crawlee.events.event_manager import EventManager, EventManagerOptions -from crawlee.events.local_event_manager import LocalEventManager -from crawlee.events.types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData +from crawlee.events._event_manager import EventManager, EventManagerOptions +from crawlee.events._local_event_manager import LocalEventManager +from crawlee.events._types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData from pydantic import BaseModel, Discriminator, Field, TypeAdapter from typing_extensions import Self, Unpack, override diff --git a/src/apify/log.py b/src/apify/log.py index 7ee60ca3..c0466bdb 100644 --- a/src/apify/log.py +++ b/src/apify/log.py @@ -2,7 +2,7 @@ import logging -from crawlee.log_config import CrawleeLogFormatter +from crawlee._log_config import CrawleeLogFormatter # Name of the logger used throughout the library (resolves to 'apify') logger_name = __name__.split('.')[0] diff --git a/src/apify/proxy_configuration.py b/src/apify/proxy_configuration.py index 244f15b4..20108bfc 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/proxy_configuration.py @@ -10,16 +10,16 @@ import httpx from apify_shared.consts import ApifyEnvVars from apify_shared.utils import ignore_docs -from crawlee.proxy_configuration import NewUrlFunction from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration 
from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo +from crawlee.proxy_configuration import _NewUrlFunction from apify.config import Configuration from apify.log import logger if TYPE_CHECKING: from apify_client import ApifyClientAsync - from crawlee.models import Request + from crawlee import Request APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$') COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$') @@ -109,7 +109,7 @@ def __init__( groups: list[str] | None = None, country_code: str | None = None, proxy_urls: list[str] | None = None, - new_url_function: NewUrlFunction | None = None, + new_url_function: _NewUrlFunction | None = None, tiered_proxy_urls: list[list[str]] | None = None, _actor_config: Configuration | None = None, _apify_client: ApifyClientAsync | None = None, diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 1942d4a5..a9f59bd9 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -13,9 +13,9 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc +from crawlee import Request as CrawleeRequest from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id -from crawlee.models import Request as CrawleeRequest from apify.actor import Actor @@ -111,7 +111,7 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: The converted Scrapy request. """ if not isinstance(cast(Any, apify_request), CrawleeRequest): - raise TypeError('apify_request must be a crawlee.models.Request instance') + raise TypeError('apify_request must be a crawlee.Request instance') call_id = crypto_random_object_id(8) Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...') diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index 80205cf4..3e168046 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,4 +1,4 @@ # ruff: noqa: PLC0414 -from crawlee.storages.dataset import Dataset as Dataset -from crawlee.storages.key_value_store import KeyValueStore as KeyValueStore -from crawlee.storages.request_queue import RequestQueue as RequestQueue +from crawlee.storages import Dataset, KeyValueStore, RequestQueue + +__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue'] diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index eeddaa80..460f456b 100644 --- a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING from apify_shared.consts import ActorEventTypes -from crawlee.events.types import Event +from crawlee.events._types import Event from apify import Actor @@ -20,7 +20,7 @@ async def main() -> None: from typing import Any, Callable from apify_shared.consts import ActorEventTypes, ApifyEnvVars - from crawlee.events.types import EventSystemInfoData + from crawlee.events._types import EventSystemInfoData os.environ[ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS] = '900' diff --git a/tests/integration/test_actor_lifecycle.py b/tests/integration/test_actor_lifecycle.py index e497116c..9c4f9aee 100644 --- a/tests/integration/test_actor_lifecycle.py +++ b/tests/integration/test_actor_lifecycle.py @@ -108,47 +108,3 @@ async def main() -> None: assert run_result is not None assert run_result['exitCode'] == 91 assert run_result['status'] == 'FAILED' - - -class TestActorMain: - async def 
test_actor_main(self: TestActorMain, make_actor: ActorFactory) -> None: - async def main() -> None: - async def actor_function() -> None: - input = await Actor.get_input() # noqa: A001 - if input.get('raise_exception'): - raise Exception(input.get('raise_exception')) # noqa: TRY002 - if input.get('exit_code'): - await Actor.exit(exit_code=input.get('exit_code')) - elif input.get('fail'): - await Actor.fail() - elif input.get('set_output'): - await Actor.set_value('OUTPUT', input.get('set_output')) - print('Main function called') - - await Actor.main(actor_function) - - actor = await make_actor('actor-main', main_func=main) - - exception_run = await actor.call(run_input={'raise_exception': 'This is a test exception'}) - assert exception_run is not None - assert exception_run['status'] == 'FAILED' - assert exception_run['exitCode'] == 91 - - exit_code = 10 - exited_run = await actor.call(run_input={'exit_code': exit_code}) - assert exited_run is not None - assert exited_run['status'] == 'FAILED' - assert exited_run['exitCode'] == exit_code - - failed_run = await actor.call(run_input={'fail': True}) - assert failed_run is not None - assert failed_run['status'] == 'FAILED' - assert failed_run['exitCode'] == 1 - - test_output = {'test': 'output'} - run_with_output = await actor.call(run_input={'set_output': test_output}) - assert run_with_output is not None - assert run_with_output['status'] == 'SUCCEEDED' - output = await actor.last_run().key_value_store().get_record('OUTPUT') - assert output is not None - assert output['value'] == test_output diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 6abd611b..6f94b60f 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from apify_shared.consts import ApifyEnvVars -from crawlee.models import Request +from crawlee import Request from ._utils import generate_unique_resource_name from apify import Actor diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index d64d3e81..bf1ef7bc 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -8,7 +8,7 @@ from apify import Actor if TYPE_CHECKING: - from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient + from crawlee.memory_storage_client import MemoryStorageClient # NOTE: We only test the dataset methods available on Actor class/instance. # Actual tests for the implementations are in storages/. diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 17955dcc..c1c38554 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -12,7 +12,7 @@ from apify.consts import ENCRYPTED_INPUT_VALUE_PREFIX if TYPE_CHECKING: - from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient + from crawlee.memory_storage_client import MemoryStorageClient # NOTE: We only test the key-value store methods available on Actor class/instance. 
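For context on the crawlee.models removal seen throughout this patch: request objects now come straight from the crawlee package root, and the storage classes are re-exported through apify.storages. A short sketch of the resulting user-side code; Request.from_url(), add_request(), fetch_next_request() and mark_request_as_handled() are assumed to keep their Crawlee 0.3 signatures and do not appear in this patch.

import asyncio

from crawlee import Request  # previously: from crawlee.models import Request

from apify import Actor


async def main() -> None:
    async with Actor:
        # RequestQueue is re-exported from crawlee.storages via apify.storages.
        request_queue = await Actor.open_request_queue()
        await request_queue.add_request(Request.from_url('https://example.com'))

        request = await request_queue.fetch_next_request()
        if request is not None:
            Actor.log.info(f'Processing {request.url}')
            await request_queue.mark_request_as_handled(request)


asyncio.run(main())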
diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index d9496b20..fc789dcd 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -4,12 +4,11 @@ import contextlib import json from typing import Any, Callable -from unittest.mock import AsyncMock import pytest import websockets.server from apify_shared.consts import ApifyEnvVars -from crawlee.events.types import Event, EventPersistStateData +from crawlee.events._types import Event, EventPersistStateData import apify.actor from apify.actor import Actor, _ActorType @@ -110,48 +109,6 @@ async def test_actor_reboot_not_work_locally(self: TestActorFail) -> None: await Actor.reboot() -class TestActorMainMethod: - async def test_actor_main_method(self: TestActorMainMethod) -> None: - my_actor = _ActorType() - main_was_called = False - - async def actor_function() -> None: - nonlocal main_was_called - main_was_called = True - assert my_actor._is_initialized - - await my_actor.main(actor_function) - assert my_actor._is_initialized is False - assert main_was_called - - async def test_actor_main_method_throw_exception(self: TestActorMainMethod) -> None: - my_actor = _ActorType() - err = Exception('Failed') - my_actor.fail = AsyncMock() # type: ignore - - async def actor_function() -> None: - nonlocal err - raise err - - await my_actor.main(actor_function) - # NOTE: Actor didn't call sys.exit() during testing, check if fail was called. - my_actor.fail.assert_called_with(exit_code=91, exception=err) - - # This is necessary to stop the event emitting intervals - await my_actor.exit() - - async def test_actor_main_method_raise_return_value(self: TestActorMainMethod) -> None: - my_actor = _ActorType() - expected_string = 'Hello world' - - async def actor_function() -> str: - nonlocal expected_string - return expected_string - - returned_value = await my_actor.main(actor_function) - assert returned_value == expected_string - - class TestMigratingEvent: async def test_migrating_event(self: TestMigratingEvent, monkeypatch: pytest.MonkeyPatch) -> None: # This should test whether when you get a MIGRATING event, diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 32fe9331..b5f6af8b 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -10,7 +10,7 @@ from apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee.configuration import Configuration as CrawleeConfiguration -from crawlee.memory_storage_client.memory_storage_client import MemoryStorageClient +from crawlee.memory_storage_client import MemoryStorageClient import apify.actor diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index 3624eefe..68d88656 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -3,7 +3,7 @@ import binascii import pytest -from crawlee.models import Request as CrawleeRequest +from crawlee import Request as CrawleeRequest from scrapy import Request, Spider from scrapy.http.headers import Headers diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 4cd2db87..a9b5da0c 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -11,7 +11,7 @@ import websockets import websockets.server from apify_shared.consts import ActorEnvVars -from crawlee.events.types import Event +from crawlee.events._types import Event from 
apify.config import Configuration from apify.event_manager import EventManager, PlatformEventManager, SystemInfoEventData From 9d70986f0e7a6febcfaf88c27c57bc11098f9e57 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 14:47:19 +0200 Subject: [PATCH 57/68] Reorganize imports --- pyproject.toml | 2 +- src/apify/_crypto.py | 4 ++-- src/apify/actor.py | 10 +++++----- src/apify/apify_storage_client/apify_storage_client.py | 5 +++-- src/apify/apify_storage_client/dataset_client.py | 6 ++++-- .../apify_storage_client/dataset_collection_client.py | 3 ++- .../apify_storage_client/key_value_store_client.py | 6 ++++-- .../key_value_store_collection_client.py | 3 ++- src/apify/apify_storage_client/request_queue_client.py | 3 ++- .../request_queue_collection_client.py | 3 ++- src/apify/config.py | 5 +++-- src/apify/event_manager.py | 8 ++++---- src/apify/proxy_configuration.py | 6 +++--- src/apify/scrapy/requests.py | 3 +-- src/apify/scrapy/scheduler.py | 3 +-- tests/integration/actor_source_base/src/__main__.py | 3 ++- tests/integration/conftest.py | 5 +++-- tests/integration/test_actor_api_helpers.py | 2 +- tests/integration/test_actor_dataset.py | 3 ++- tests/integration/test_actor_events.py | 3 +-- tests/integration/test_actor_key_value_store.py | 3 ++- tests/integration/test_actor_request_queue.py | 3 ++- tests/integration/test_fixtures.py | 3 +-- .../actor/test_actor_create_proxy_configuration.py | 4 ++-- tests/unit/actor/test_actor_dataset.py | 2 +- tests/unit/actor/test_actor_env_helpers.py | 2 +- tests/unit/actor/test_actor_helpers.py | 3 +-- tests/unit/actor/test_actor_key_value_store.py | 7 ++++--- tests/unit/actor/test_actor_lifecycle.py | 4 ++-- tests/unit/actor/test_actor_log.py | 3 +-- tests/unit/conftest.py | 4 ++-- tests/unit/scrapy/requests/test_to_scrapy_request.py | 2 +- tests/unit/test_event_manager.py | 4 ++-- tests/unit/test_proxy_configuration.py | 4 ++-- 34 files changed, 72 insertions(+), 62 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c747a02a..55fa7d53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,7 +157,7 @@ docstring-quotes = "double" inline-quotes = "single" [tool.ruff.lint.isort] -known-local-folder = ["apify"] +known-first-party = ["apify", "apify_client", "apify_shared", "crawlee"] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index 830681b2..cfc97472 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -3,14 +3,14 @@ import base64 from typing import Any -from apify_shared.utils import ignore_docs -from crawlee._utils.crypto import crypto_random_object_id from cryptography.exceptions import InvalidTag as InvalidTagException from cryptography.hazmat.primitives import hashes, serialization from cryptography.hazmat.primitives.asymmetric import padding, rsa from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes from apify.consts import ENCRYPTED_INPUT_VALUE_REGEXP +from apify_shared.utils import ignore_docs +from crawlee._utils.crypto import crypto_random_object_id ENCRYPTION_KEY_LENGTH = 32 ENCRYPTION_IV_LENGTH = 16 diff --git a/src/apify/actor.py b/src/apify/actor.py index 836e8839..fe30da1f 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -6,11 +6,6 @@ from datetime import timedelta from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast -from apify_client import ApifyClientAsync -from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType -from apify_shared.utils import ignore_docs, 
maybe_extract_enum_member_value -from crawlee import service_container -from crawlee.events._types import Event, EventPersistStateData from pydantic import AliasChoices from typing_extensions import Self from werkzeug.local import LocalProxy @@ -24,6 +19,11 @@ from apify.log import logger from apify.proxy_configuration import ProxyConfiguration from apify.storages import Dataset, KeyValueStore, RequestQueue +from apify_client import ApifyClientAsync +from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType +from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value +from crawlee import service_container +from crawlee.events._types import Event, EventPersistStateData if TYPE_CHECKING: import logging diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/apify_storage_client.py index b6198a6c..e827d820 100644 --- a/src/apify/apify_storage_client/apify_storage_client.py +++ b/src/apify/apify_storage_client/apify_storage_client.py @@ -1,7 +1,9 @@ +from typing_extensions import override + +from apify.config import Configuration from apify_client import ApifyClientAsync from crawlee._utils.crypto import crypto_random_object_id from crawlee.base_storage_client import BaseStorageClient -from typing_extensions import override from .dataset_client import DatasetClient from .dataset_collection_client import DatasetCollectionClient @@ -9,7 +11,6 @@ from .key_value_store_collection_client import KeyValueStoreCollectionClient from .request_queue_client import RequestQueueClient from .request_queue_collection_client import RequestQueueCollectionClient -from apify.config import Configuration class ApifyStorageClient(BaseStorageClient): diff --git a/src/apify/apify_storage_client/dataset_client.py b/src/apify/apify_storage_client/dataset_client.py index abcdb769..dd10ced8 100644 --- a/src/apify/apify_storage_client/dataset_client.py +++ b/src/apify/apify_storage_client/dataset_client.py @@ -2,16 +2,18 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata from typing_extensions import override +from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata + if TYPE_CHECKING: from collections.abc import AsyncIterator from contextlib import AbstractAsyncContextManager + from httpx import Response + from apify_client.clients import DatasetClientAsync from crawlee._types import JsonSerializable - from httpx import Response class DatasetClient(BaseDatasetClient): diff --git a/src/apify/apify_storage_client/dataset_collection_client.py b/src/apify/apify_storage_client/dataset_collection_client.py index 1cfc2de3..1a6fb27a 100644 --- a/src/apify/apify_storage_client/dataset_collection_client.py +++ b/src/apify/apify_storage_client/dataset_collection_client.py @@ -2,9 +2,10 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata from typing_extensions import override +from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata + if TYPE_CHECKING: from apify_client.clients import DatasetCollectionClientAsync diff --git a/src/apify/apify_storage_client/key_value_store_client.py b/src/apify/apify_storage_client/key_value_store_client.py index bf74eb9b..769d157e 100644 --- a/src/apify/apify_storage_client/key_value_store_client.py +++ 
b/src/apify/apify_storage_client/key_value_store_client.py @@ -3,16 +3,18 @@ from contextlib import asynccontextmanager from typing import TYPE_CHECKING, Any -from crawlee.base_storage_client import BaseKeyValueStoreClient, KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord from typing_extensions import override +from crawlee.base_storage_client import BaseKeyValueStoreClient, KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord + if TYPE_CHECKING: from collections.abc import AsyncIterator from contextlib import AbstractAsyncContextManager - from apify_client.clients import KeyValueStoreClientAsync from httpx import Response + from apify_client.clients import KeyValueStoreClientAsync + class KeyValueStoreClient(BaseKeyValueStoreClient): """Key-value store resource client implementation based on the Apify platform storage.""" diff --git a/src/apify/apify_storage_client/key_value_store_collection_client.py b/src/apify/apify_storage_client/key_value_store_collection_client.py index 12c0c499..27f76f37 100644 --- a/src/apify/apify_storage_client/key_value_store_collection_client.py +++ b/src/apify/apify_storage_client/key_value_store_collection_client.py @@ -2,9 +2,10 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata from typing_extensions import override +from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata + if TYPE_CHECKING: from apify_client.clients import KeyValueStoreCollectionClientAsync diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/request_queue_client.py index dfcc77a1..2cdbe58d 100644 --- a/src/apify/apify_storage_client/request_queue_client.py +++ b/src/apify/apify_storage_client/request_queue_client.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING +from typing_extensions import override + from crawlee import Request from crawlee.base_storage_client import ( BaseRequestQueueClient, @@ -12,7 +14,6 @@ RequestQueueHeadWithLocks, RequestQueueMetadata, ) -from typing_extensions import override if TYPE_CHECKING: from collections.abc import Sequence diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/request_queue_collection_client.py index 1a376d36..50aad1aa 100644 --- a/src/apify/apify_storage_client/request_queue_collection_client.py +++ b/src/apify/apify_storage_client/request_queue_collection_client.py @@ -2,9 +2,10 @@ from typing import TYPE_CHECKING -from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata from typing_extensions import override +from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata + if TYPE_CHECKING: from apify_client.clients import RequestQueueCollectionClientAsync diff --git a/src/apify/config.py b/src/apify/config.py index f695007b..5b190992 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -1,12 +1,13 @@ -# ruff: noqa: TCH002 TCH003 +# ruff: noqa: TCH001 TCH002 TCH003 from __future__ import annotations from datetime import datetime, timedelta from typing import Annotated +from pydantic import AliasChoices, BeforeValidator, Field + from crawlee._utils.models import timedelta_ms from crawlee.configuration import Configuration as CrawleeConfiguration -from pydantic import AliasChoices, BeforeValidator, Field 
class Configuration(CrawleeConfiguration): diff --git a/src/apify/event_manager.py b/src/apify/event_manager.py index 1d7a8315..34412a19 100644 --- a/src/apify/event_manager.py +++ b/src/apify/event_manager.py @@ -5,14 +5,14 @@ from typing import TYPE_CHECKING, Annotated, Any, Literal, Union import websockets.client -from apify_shared.utils import ignore_docs -from crawlee.events._event_manager import EventManager, EventManagerOptions -from crawlee.events._local_event_manager import LocalEventManager -from crawlee.events._types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData from pydantic import BaseModel, Discriminator, Field, TypeAdapter from typing_extensions import Self, Unpack, override from apify.log import logger +from apify_shared.utils import ignore_docs +from crawlee.events._event_manager import EventManager, EventManagerOptions +from crawlee.events._local_event_manager import LocalEventManager +from crawlee.events._types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData if TYPE_CHECKING: from types import TracebackType diff --git a/src/apify/proxy_configuration.py b/src/apify/proxy_configuration.py index 20108bfc..228d70cd 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/proxy_configuration.py @@ -8,15 +8,15 @@ from urllib.parse import urljoin, urlparse import httpx + +from apify.config import Configuration +from apify.log import logger from apify_shared.consts import ApifyEnvVars from apify_shared.utils import ignore_docs from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo from crawlee.proxy_configuration import _NewUrlFunction -from apify.config import Configuration -from apify.log import logger - if TYPE_CHECKING: from apify_client import ApifyClientAsync from crawlee import Request diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index a9f59bd9..a04862b6 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -13,12 +13,11 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc +from apify.actor import Actor from crawlee import Request as CrawleeRequest from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id -from apify.actor import Actor - def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False. diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 67f09305..03635664 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -15,12 +15,11 @@ 'To use this module, you need to install the "scrapy" extra. 
Run "pip install apify[scrapy]".', ) from exc -from crawlee._utils.crypto import crypto_random_object_id - from apify.actor import Actor from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import nested_event_loop from apify.storages import RequestQueue +from crawlee._utils.crypto import crypto_random_object_id class ApifyScheduler(BaseScheduler): diff --git a/tests/integration/actor_source_base/src/__main__.py b/tests/integration/actor_source_base/src/__main__.py index 0d1d65af..643eb63c 100644 --- a/tests/integration/actor_source_base/src/__main__.py +++ b/tests/integration/actor_source_base/src/__main__.py @@ -3,9 +3,10 @@ import asyncio import logging -from .main import main from apify.log import ActorLogFormatter +from .main import main + handler = logging.StreamHandler() handler.setFormatter(ActorLogFormatter()) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 1a0e52f6..34c36aa0 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -10,11 +10,12 @@ from typing import TYPE_CHECKING, Callable, Protocol, cast import pytest -from apify_client import ApifyClientAsync -from apify_shared.consts import ActorJobStatus, ActorSourceType from filelock import FileLock import apify.actor +from apify_client import ApifyClientAsync +from apify_shared.consts import ActorJobStatus, ActorSourceType + from ._utils import generate_unique_resource_name if TYPE_CHECKING: diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index 589528fa..db9f970b 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -4,10 +4,10 @@ import json from typing import TYPE_CHECKING +from apify import Actor from crawlee._utils.crypto import crypto_random_object_id from ._utils import generate_unique_resource_name -from apify import Actor if TYPE_CHECKING: from apify_client import ApifyClientAsync diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index 81a4f938..1486dbca 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -2,13 +2,14 @@ from typing import TYPE_CHECKING +from apify import Actor from apify_shared.consts import ApifyEnvVars from ._utils import generate_unique_resource_name -from apify import Actor if TYPE_CHECKING: import pytest + from apify_client import ApifyClientAsync from .conftest import ActorFactory diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index 460f456b..19f3339f 100644 --- a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -3,11 +3,10 @@ import asyncio from typing import TYPE_CHECKING +from apify import Actor from apify_shared.consts import ActorEventTypes from crawlee.events._types import Event -from apify import Actor - if TYPE_CHECKING: from .conftest import ActorFactory diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 73e888b7..8521821d 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -2,13 +2,14 @@ from typing import TYPE_CHECKING +from apify import Actor from apify_shared.consts import ApifyEnvVars from ._utils import generate_unique_resource_name -from apify import Actor if TYPE_CHECKING: import pytest + from apify_client import ApifyClientAsync from .conftest import ActorFactory 
diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 6f94b60f..33d8082c 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -2,14 +2,15 @@ from typing import TYPE_CHECKING +from apify import Actor from apify_shared.consts import ApifyEnvVars from crawlee import Request from ._utils import generate_unique_resource_name -from apify import Actor if TYPE_CHECKING: import pytest + from apify_client import ApifyClientAsync from .conftest import ActorFactory diff --git a/tests/integration/test_fixtures.py b/tests/integration/test_fixtures.py index 93ff5588..a0f256ad 100644 --- a/tests/integration/test_fixtures.py +++ b/tests/integration/test_fixtures.py @@ -3,9 +3,8 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING -from crawlee._utils.crypto import crypto_random_object_id - from apify import Actor +from crawlee._utils.crypto import crypto_random_object_id if TYPE_CHECKING: from apify_client import ApifyClientAsync diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index 593ee080..29c6e928 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -4,10 +4,10 @@ import httpx import pytest -from apify_client import ApifyClientAsync -from apify_shared.consts import ApifyEnvVars from apify import Actor +from apify_client import ApifyClientAsync +from apify_shared.consts import ApifyEnvVars if TYPE_CHECKING: from respx import MockRouter diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index bf1ef7bc..7e0b384b 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -3,9 +3,9 @@ from typing import TYPE_CHECKING import pytest -from apify_shared.consts import ActorEnvVars from apify import Actor +from apify_shared.consts import ActorEnvVars if TYPE_CHECKING: from crawlee.memory_storage_client import MemoryStorageClient diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index d1f46d35..44483cd0 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -5,10 +5,10 @@ from datetime import datetime, timedelta from typing import TYPE_CHECKING, Any -from apify_shared.consts import BOOL_ENV_VARS, DATETIME_ENV_VARS, FLOAT_ENV_VARS, INTEGER_ENV_VARS, STRING_ENV_VARS, ActorEnvVars, ApifyEnvVars from pydantic_core import TzInfo from apify import Actor +from apify_shared.consts import BOOL_ENV_VARS, DATETIME_ENV_VARS, FLOAT_ENV_VARS, INTEGER_ENV_VARS, STRING_ENV_VARS, ActorEnvVars, ApifyEnvVars if TYPE_CHECKING: import pytest diff --git a/tests/unit/actor/test_actor_helpers.py b/tests/unit/actor/test_actor_helpers.py index 34613f4f..4839fa25 100644 --- a/tests/unit/actor/test_actor_helpers.py +++ b/tests/unit/actor/test_actor_helpers.py @@ -2,11 +2,10 @@ from typing import TYPE_CHECKING +from apify.actor import Actor, _ActorType from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars, WebhookEventType -from apify.actor import Actor, _ActorType - if TYPE_CHECKING: import pytest diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index c1c38554..576f26df 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ 
b/tests/unit/actor/test_actor_key_value_store.py @@ -3,13 +3,14 @@ from typing import TYPE_CHECKING import pytest -from apify_shared.consts import ApifyEnvVars -from apify_shared.utils import json_dumps -from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY from apify import Actor from apify._crypto import public_encrypt from apify.consts import ENCRYPTED_INPUT_VALUE_PREFIX +from apify_shared.consts import ApifyEnvVars +from apify_shared.utils import json_dumps + +from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY if TYPE_CHECKING: from crawlee.memory_storage_client import MemoryStorageClient diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index fc789dcd..ee9526b3 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -7,11 +7,11 @@ import pytest import websockets.server -from apify_shared.consts import ApifyEnvVars -from crawlee.events._types import Event, EventPersistStateData import apify.actor from apify.actor import Actor, _ActorType +from apify_shared.consts import ApifyEnvVars +from crawlee.events._types import Event, EventPersistStateData class TestActorInit: diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index 599e8b3d..d3a9fc3a 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -5,10 +5,9 @@ import sys from typing import TYPE_CHECKING -from apify_client import __version__ as apify_client_version - from apify import Actor, __version__ from apify.log import logger +from apify_client import __version__ as apify_client_version if TYPE_CHECKING: import pytest diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index b5f6af8b..101716db 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -7,13 +7,13 @@ from typing import TYPE_CHECKING, Any, Callable, cast, get_type_hints import pytest + +import apify.actor from apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.memory_storage_client import MemoryStorageClient -import apify.actor - if TYPE_CHECKING: from pathlib import Path diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index 68d88656..253f316b 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -3,11 +3,11 @@ import binascii import pytest -from crawlee import Request as CrawleeRequest from scrapy import Request, Spider from scrapy.http.headers import Headers from apify.scrapy.requests import to_scrapy_request +from crawlee import Request as CrawleeRequest class DummySpider(Spider): diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index a9b5da0c..4506b352 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -10,11 +10,11 @@ import pytest import websockets import websockets.server -from apify_shared.consts import ActorEnvVars -from crawlee.events._types import Event from apify.config import Configuration from apify.event_manager import EventManager, PlatformEventManager, SystemInfoEventData +from apify_shared.consts import ActorEnvVars +from crawlee.events._types import Event class TestEventManagerLocal: diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py 
index 93be7f72..3487c23b 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -8,10 +8,10 @@ import httpx import pytest -from apify_client import ApifyClientAsync -from apify_shared.consts import ApifyEnvVars from apify.proxy_configuration import ProxyConfiguration, is_url +from apify_client import ApifyClientAsync +from apify_shared.consts import ApifyEnvVars if TYPE_CHECKING: from respx import MockRouter From e3c39e896800768b5f06ed762e04fe5005956840 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 14:51:41 +0200 Subject: [PATCH 58/68] Comment --- src/apify/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apify/config.py b/src/apify/config.py index 5b190992..9612265a 100644 --- a/src/apify/config.py +++ b/src/apify/config.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH001 TCH002 TCH003 +# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work) from __future__ import annotations from datetime import datetime, timedelta From f900c581ed13af8e5c89ed95671887e0423eb096 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 16:30:23 +0200 Subject: [PATCH 59/68] Put Actor() back --- src/apify/actor.py | 24 ++++++++++++------- .../actor/test_actor_non_default_instance.py | 9 +++++++ 2 files changed, 24 insertions(+), 9 deletions(-) create mode 100644 tests/unit/actor/test_actor_non_default_instance.py diff --git a/src/apify/actor.py b/src/apify/actor.py index fe30da1f..b132cbf3 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -45,8 +45,7 @@ class _ActorType: def __init__(self, config: Configuration | None = None) -> None: """Create an Actor instance. - Note that you don't have to do this, all the methods on this class function as classmethods too, - and that is their preferred usage. + Note that you don't have to do this, all the functionality is accessible using the default instance (e.g. `Actor.open_dataset()`). Args: config (Configuration, optional): The Actor configuration to be used. If not passed, a new Configuration instance will be created. 
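The reworded docstring above steers users toward the module-level default instance rather than constructing _ActorType directly. A minimal sketch of that default-instance usage; Actor.open_dataset() is named in the docstring itself, while push_data() is a standard SDK call assumed unchanged by this patch.

import asyncio

from apify import Actor


async def main() -> None:
    # No explicit _ActorType() construction; the default instance drives
    # init and exit through the async context manager.
    async with Actor:
        dataset = await Actor.open_dataset()
        await dataset.push_data({'hello': 'world'})


asyncio.run(main())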
@@ -54,25 +53,18 @@ def __init__(self, config: Configuration | None = None) -> None: self._configuration = config or Configuration.get_global_configuration() self._apify_client = self.new_client() - if self._configuration.token: - service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration)) - self._event_manager: EventManager if self._configuration.is_at_home: - service_container.set_default_storage_client_type('cloud') self._event_manager = PlatformEventManager( config=self._configuration, persist_state_interval=self._configuration.persist_state_interval, ) else: - service_container.set_default_storage_client_type('local') self._event_manager = LocalEventManager( system_info_interval=self._configuration.system_info_interval, persist_state_interval=self._configuration.persist_state_interval, ) - service_container.set_event_manager(self._event_manager) - self._is_initialized = False @ignore_docs @@ -116,6 +108,10 @@ def __repr__(self) -> str: return super().__repr__() + def __call__(self, config: Configuration) -> Self: + """Make a new Actor instance with a non-default configuration.""" + return self.__class__(config=config) + @property def apify_client(self) -> ApifyClientAsync: """The ApifyClientAsync instance the Actor instance uses.""" @@ -155,6 +151,16 @@ async def init(self) -> None: if self._is_initialized: raise RuntimeError('The Actor was already initialized!') + if self._configuration.token: + service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration)) + + if self._configuration.is_at_home: + service_container.set_default_storage_client_type('cloud') + else: + service_container.set_default_storage_client_type('local') + + service_container.set_event_manager(self._event_manager) + self._is_exiting = False self._was_final_persist_state_emitted = False diff --git a/tests/unit/actor/test_actor_non_default_instance.py b/tests/unit/actor/test_actor_non_default_instance.py new file mode 100644 index 00000000..639e80d2 --- /dev/null +++ b/tests/unit/actor/test_actor_non_default_instance.py @@ -0,0 +1,9 @@ +from datetime import timedelta + +from apify import Actor +from apify.config import Configuration + + +async def test_actor_non_default_instance() -> None: + async with Actor(Configuration(internal_timeout=timedelta(minutes=111))) as actor: + assert actor.config.internal_timeout == timedelta(minutes=111) From cf53178ba4e912b6a4ee4695965351d9b0d0e081 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 16:57:42 +0200 Subject: [PATCH 60/68] Replace Werkzeug to save some disk space --- pyproject.toml | 4 ++-- src/apify/actor.py | 18 +++--------------- tests/integration/conftest.py | 2 +- tests/integration/test_actor_lifecycle.py | 4 ++-- tests/unit/actor/test_actor_lifecycle.py | 9 +++++---- tests/unit/conftest.py | 2 +- 6 files changed, 14 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 55fa7d53..97214ec0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,12 +34,12 @@ dependencies = [ "crawlee >= 0.3.0", "cryptography >= 39.0.0", "httpx >= 0.24.0", + "lazy-object-proxy >= 1.10.0", "psutil >= 5.9.0", "pyee >= 11.0.0", "sortedcollections >= 2.0.0", "typing-extensions >= 4.1.0", "websockets >= 10.1", - "werkzeug >= 3.0.0", ] [project.optional-dependencies] @@ -185,5 +185,5 @@ warn_unreachable = true warn_unused_ignores = true [[tool.mypy.overrides]] -module = ['scrapy', 'scrapy.*', 'sortedcollections'] +module = ['scrapy', 'scrapy.*', 'sortedcollections', 
'lazy_object_proxy'] ignore_missing_imports = true diff --git a/src/apify/actor.py b/src/apify/actor.py index b132cbf3..524d33c4 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -6,9 +6,9 @@ from datetime import timedelta from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast +from lazy_object_proxy import Proxy from pydantic import AliasChoices from typing_extensions import Self -from werkzeug.local import LocalProxy from apify._crypto import decrypt_input_secrets, load_private_key from apify._utils import get_system_info, is_running_in_ipython @@ -103,7 +103,7 @@ async def __aexit__( await self.exit() def __repr__(self) -> str: - if self is _default_instance: + if self is cast(Proxy, Actor).__wrapped__: return '' return super().__repr__() @@ -933,17 +933,5 @@ async def create_proxy_configuration( return proxy_configuration -_default_instance: _ActorType | None = None - - -def _get_default_instance() -> _ActorType: - global _default_instance # noqa: PLW0603 - - if not _default_instance: - _default_instance = _ActorType() - - return _default_instance - - -Actor = cast(_ActorType, LocalProxy(_get_default_instance)) +Actor = cast(_ActorType, Proxy(_ActorType)) """The entry point of the SDK, through which all the Actor operations should be done.""" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 34c36aa0..2190d446 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -35,7 +35,7 @@ def _reset_and_patch_default_instances() -> None: from crawlee import service_container cast(dict, service_container._services).clear() - apify.actor._default_instance = None + delattr(apify.actor.Actor, '__wrapped__') # TODO: StorageClientManager local storage client purge # noqa: TD003 diff --git a/tests/integration/test_actor_lifecycle.py b/tests/integration/test_actor_lifecycle.py index 9c4f9aee..eae85700 100644 --- a/tests/integration/test_actor_lifecycle.py +++ b/tests/integration/test_actor_lifecycle.py @@ -45,8 +45,8 @@ async def main() -> None: import apify.actor async with Actor: - assert apify.actor._get_default_instance()._is_initialized - assert apify.actor._get_default_instance()._is_initialized is False + assert apify.actor.Actor._is_initialized + assert apify.actor.Actor._is_initialized is False actor = await make_actor('with-actor-init', main_func=main) diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index ee9526b3..a4f3d33f 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -3,10 +3,11 @@ import asyncio import contextlib import json -from typing import Any, Callable +from typing import Any, Callable, cast import pytest import websockets.server +from lazy_object_proxy import Proxy import apify.actor from apify.actor import Actor, _ActorType @@ -17,9 +18,9 @@ class TestActorInit: async def test_async_with_actor_properly_initialize(self: TestActorInit) -> None: async with Actor: - assert apify.actor._default_instance is not None - assert apify.actor._default_instance._is_initialized - assert not apify.actor._default_instance._is_initialized + assert cast(Proxy, apify.actor.Actor).__wrapped__ is not None + assert cast(Proxy, apify.actor.Actor).__wrapped__._is_initialized + assert not cast(Proxy, apify.actor.Actor).__wrapped__._is_initialized async def test_actor_init(self: TestActorInit) -> None: my_actor = _ActorType() diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 101716db..0a4d694e 100644 --- 
a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -41,7 +41,7 @@ def reset() -> None: cast(dict, service_container._services).clear() - apify.actor._default_instance = None + delattr(apify.actor.Actor, '__wrapped__') # TODO: local storage client purge # noqa: TD003 return reset From 85b560386bb6e603ad6a14a1b18c112b84b58313 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 17:11:00 +0200 Subject: [PATCH 61/68] event_manager -> _platform_event_manager --- src/apify/__init__.py | 4 +++- src/apify/{event_manager.py => _platform_event_manager.py} | 0 src/apify/actor.py | 2 +- tests/unit/test_event_manager.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) rename src/apify/{event_manager.py => _platform_event_manager.py} (100%) diff --git a/src/apify/__init__.py b/src/apify/__init__.py index d9a2e5b2..e7f8b68d 100644 --- a/src/apify/__init__.py +++ b/src/apify/__init__.py @@ -1,9 +1,11 @@ from importlib import metadata +from crawlee.events._types import Event + from .actor import Actor from .config import Configuration from .proxy_configuration import ProxyConfiguration, ProxyInfo __version__ = metadata.version('apify') -__all__ = ['Actor', 'Configuration', 'ProxyConfiguration', 'ProxyInfo', '__version__'] +__all__ = ['Actor', 'Event', 'Configuration', 'ProxyConfiguration', 'ProxyInfo', '__version__'] diff --git a/src/apify/event_manager.py b/src/apify/_platform_event_manager.py similarity index 100% rename from src/apify/event_manager.py rename to src/apify/_platform_event_manager.py diff --git a/src/apify/actor.py b/src/apify/actor.py index 524d33c4..b7ddb558 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -11,11 +11,11 @@ from typing_extensions import Self from apify._crypto import decrypt_input_secrets, load_private_key +from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager from apify._utils import get_system_info, is_running_in_ipython from apify.apify_storage_client.apify_storage_client import ApifyStorageClient from apify.config import Configuration from apify.consts import EVENT_LISTENERS_TIMEOUT -from apify.event_manager import EventManager, LocalEventManager, PlatformEventManager from apify.log import logger from apify.proxy_configuration import ProxyConfiguration from apify.storages import Dataset, KeyValueStore, RequestQueue diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 4506b352..5278f25c 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -11,8 +11,8 @@ import websockets import websockets.server +from apify._platform_event_manager import EventManager, PlatformEventManager, SystemInfoEventData from apify.config import Configuration -from apify.event_manager import EventManager, PlatformEventManager, SystemInfoEventData from apify_shared.consts import ActorEnvVars from crawlee.events._types import Event From 68903cbe5558f99238d4e5d8d7dbac5b4bb399e9 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 17:37:30 +0200 Subject: [PATCH 62/68] Hide non-public members --- src/apify/__init__.py | 7 +++---- src/apify/{actor.py => _actor.py} | 8 ++++---- src/apify/{config.py => _configuration.py} | 0 src/apify/{consts.py => _consts.py} | 0 src/apify/_crypto.py | 2 +- src/apify/{log.py => _log.py} | 2 +- src/apify/_platform_event_manager.py | 4 ++-- ...y_configuration.py => _proxy_configuration.py} | 4 ++-- .../apify_storage_client/apify_storage_client.py | 15 +++++++-------- src/apify/scrapy/__init__.py | 14 +++++++++++--- 
src/apify/scrapy/middlewares/__init__.py | 4 +++- src/apify/scrapy/middlewares/apify_proxy.py | 3 +-- src/apify/scrapy/pipelines/__init__.py | 4 +++- src/apify/scrapy/pipelines/actor_dataset_push.py | 2 +- src/apify/scrapy/requests.py | 2 +- src/apify/scrapy/scheduler.py | 4 ++-- .../integration/actor_source_base/src/__main__.py | 2 +- tests/integration/conftest.py | 4 ++-- tests/integration/test_actor_lifecycle.py | 6 +++--- tests/integration/test_actor_log.py | 2 +- tests/unit/actor/test_actor_helpers.py | 3 ++- tests/unit/actor/test_actor_key_value_store.py | 2 +- tests/unit/actor/test_actor_lifecycle.py | 11 ++++++----- tests/unit/actor/test_actor_log.py | 2 +- .../unit/actor/test_actor_non_default_instance.py | 3 +-- tests/unit/conftest.py | 4 ++-- tests/unit/test_event_manager.py | 2 +- tests/unit/test_proxy_configuration.py | 2 +- 28 files changed, 64 insertions(+), 54 deletions(-) rename src/apify/{actor.py => _actor.py} (99%) rename src/apify/{config.py => _configuration.py} (100%) rename src/apify/{consts.py => _consts.py} (100%) rename src/apify/{log.py => _log.py} (77%) rename src/apify/{proxy_configuration.py => _proxy_configuration.py} (99%) diff --git a/src/apify/__init__.py b/src/apify/__init__.py index e7f8b68d..57cd6994 100644 --- a/src/apify/__init__.py +++ b/src/apify/__init__.py @@ -1,11 +1,10 @@ from importlib import metadata +from apify._actor import Actor +from apify._configuration import Configuration +from apify._proxy_configuration import ProxyConfiguration, ProxyInfo from crawlee.events._types import Event -from .actor import Actor -from .config import Configuration -from .proxy_configuration import ProxyConfiguration, ProxyInfo - __version__ = metadata.version('apify') __all__ = ['Actor', 'Event', 'Configuration', 'ProxyConfiguration', 'ProxyInfo', '__version__'] diff --git a/src/apify/actor.py b/src/apify/_actor.py similarity index 99% rename from src/apify/actor.py rename to src/apify/_actor.py index b7ddb558..aae51f0f 100644 --- a/src/apify/actor.py +++ b/src/apify/_actor.py @@ -10,14 +10,14 @@ from pydantic import AliasChoices from typing_extensions import Self +from apify._configuration import Configuration +from apify._consts import EVENT_LISTENERS_TIMEOUT from apify._crypto import decrypt_input_secrets, load_private_key +from apify._log import logger from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager +from apify._proxy_configuration import ProxyConfiguration from apify._utils import get_system_info, is_running_in_ipython from apify.apify_storage_client.apify_storage_client import ApifyStorageClient -from apify.config import Configuration -from apify.consts import EVENT_LISTENERS_TIMEOUT -from apify.log import logger -from apify.proxy_configuration import ProxyConfiguration from apify.storages import Dataset, KeyValueStore, RequestQueue from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType diff --git a/src/apify/config.py b/src/apify/_configuration.py similarity index 100% rename from src/apify/config.py rename to src/apify/_configuration.py diff --git a/src/apify/consts.py b/src/apify/_consts.py similarity index 100% rename from src/apify/consts.py rename to src/apify/_consts.py diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index cfc97472..7d1edb13 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -8,7 +8,7 @@ from cryptography.hazmat.primitives.asymmetric import padding, rsa from 
cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes -from apify.consts import ENCRYPTED_INPUT_VALUE_REGEXP +from apify._consts import ENCRYPTED_INPUT_VALUE_REGEXP from apify_shared.utils import ignore_docs from crawlee._utils.crypto import crypto_random_object_id diff --git a/src/apify/log.py b/src/apify/_log.py similarity index 77% rename from src/apify/log.py rename to src/apify/_log.py index c0466bdb..c799420a 100644 --- a/src/apify/log.py +++ b/src/apify/_log.py @@ -11,5 +11,5 @@ logger = logging.getLogger(logger_name) -class ActorLogFormatter(CrawleeLogFormatter): # noqa: D101 Inherited from parent class +class ActorLogFormatter(CrawleeLogFormatter): # Inherited from parent class pass diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py index 34412a19..dc352c8c 100644 --- a/src/apify/_platform_event_manager.py +++ b/src/apify/_platform_event_manager.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, Discriminator, Field, TypeAdapter from typing_extensions import Self, Unpack, override -from apify.log import logger +from apify._log import logger from apify_shared.utils import ignore_docs from crawlee.events._event_manager import EventManager, EventManagerOptions from crawlee.events._local_event_manager import LocalEventManager @@ -17,7 +17,7 @@ if TYPE_CHECKING: from types import TracebackType - from apify.config import Configuration + from apify._configuration import Configuration __all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager'] diff --git a/src/apify/proxy_configuration.py b/src/apify/_proxy_configuration.py similarity index 99% rename from src/apify/proxy_configuration.py rename to src/apify/_proxy_configuration.py index 228d70cd..223f47e2 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -9,8 +9,8 @@ import httpx -from apify.config import Configuration -from apify.log import logger +from apify._configuration import Configuration +from apify._log import logger from apify_shared.consts import ApifyEnvVars from apify_shared.utils import ignore_docs from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/apify_storage_client.py index e827d820..b3c216a0 100644 --- a/src/apify/apify_storage_client/apify_storage_client.py +++ b/src/apify/apify_storage_client/apify_storage_client.py @@ -1,17 +1,16 @@ from typing_extensions import override -from apify.config import Configuration +from apify._configuration import Configuration +from apify.apify_storage_client.dataset_client import DatasetClient +from apify.apify_storage_client.dataset_collection_client import DatasetCollectionClient +from apify.apify_storage_client.key_value_store_client import KeyValueStoreClient +from apify.apify_storage_client.key_value_store_collection_client import KeyValueStoreCollectionClient +from apify.apify_storage_client.request_queue_client import RequestQueueClient +from apify.apify_storage_client.request_queue_collection_client import RequestQueueCollectionClient from apify_client import ApifyClientAsync from crawlee._utils.crypto import crypto_random_object_id from crawlee.base_storage_client import BaseStorageClient -from .dataset_client import DatasetClient -from .dataset_collection_client import DatasetCollectionClient -from .key_value_store_client import KeyValueStoreClient -from .key_value_store_collection_client import KeyValueStoreCollectionClient -from 
.request_queue_client import RequestQueueClient -from .request_queue_collection_client import RequestQueueCollectionClient - class ApifyStorageClient(BaseStorageClient): """A storage client implementation based on the Apify platform storage.""" diff --git a/src/apify/scrapy/__init__.py b/src/apify/scrapy/__init__.py index 717873ce..a1d065c2 100644 --- a/src/apify/scrapy/__init__.py +++ b/src/apify/scrapy/__init__.py @@ -1,3 +1,11 @@ -from .requests import to_apify_request, to_scrapy_request -from .scheduler import ApifyScheduler -from .utils import get_basic_auth_header, get_running_event_loop_id +from apify.scrapy.requests import to_apify_request, to_scrapy_request +from apify.scrapy.scheduler import ApifyScheduler +from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id + +__all__ = [ + 'to_apify_request', + 'to_scrapy_request', + 'ApifyScheduler', + 'get_basic_auth_header', + 'get_running_event_loop_id', +] diff --git a/src/apify/scrapy/middlewares/__init__.py b/src/apify/scrapy/middlewares/__init__.py index 257252d5..c1d82a7e 100644 --- a/src/apify/scrapy/middlewares/__init__.py +++ b/src/apify/scrapy/middlewares/__init__.py @@ -1 +1,3 @@ -from .apify_proxy import ApifyHttpProxyMiddleware +from apify.scrapy.middlewares.apify_proxy import ApifyHttpProxyMiddleware + +__all__ = ['ApifyHttpProxyMiddleware'] diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index 4ab27166..3c0b9efc 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -12,8 +12,7 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify.actor import Actor -from apify.proxy_configuration import ProxyConfiguration +from apify import Actor, ProxyConfiguration from apify.scrapy.utils import get_basic_auth_header diff --git a/src/apify/scrapy/pipelines/__init__.py b/src/apify/scrapy/pipelines/__init__.py index fa2c95eb..7a94b771 100644 --- a/src/apify/scrapy/pipelines/__init__.py +++ b/src/apify/scrapy/pipelines/__init__.py @@ -1 +1,3 @@ -from .actor_dataset_push import ActorDatasetPushPipeline +from apify.scrapy.pipelines.actor_dataset_push import ActorDatasetPushPipeline + +__all__ = ['ActorDatasetPushPipeline'] diff --git a/src/apify/scrapy/pipelines/actor_dataset_push.py b/src/apify/scrapy/pipelines/actor_dataset_push.py index e75262da..8f371788 100644 --- a/src/apify/scrapy/pipelines/actor_dataset_push.py +++ b/src/apify/scrapy/pipelines/actor_dataset_push.py @@ -9,7 +9,7 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify.actor import Actor +from apify import Actor class ActorDatasetPushPipeline: diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index a04862b6..4ed40079 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -13,7 +13,7 @@ 'To use this module, you need to install the "scrapy" extra. 
Run "pip install apify[scrapy]".', ) from exc -from apify.actor import Actor +from apify import Actor from crawlee import Request as CrawleeRequest from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 03635664..c98cdcbb 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -2,8 +2,8 @@ import traceback +from apify._configuration import Configuration from apify.apify_storage_client.apify_storage_client import ApifyStorageClient -from apify.config import Configuration try: from scrapy import Spider @@ -15,7 +15,7 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify.actor import Actor +from apify import Actor from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import nested_event_loop from apify.storages import RequestQueue diff --git a/tests/integration/actor_source_base/src/__main__.py b/tests/integration/actor_source_base/src/__main__.py index 643eb63c..48646f91 100644 --- a/tests/integration/actor_source_base/src/__main__.py +++ b/tests/integration/actor_source_base/src/__main__.py @@ -3,7 +3,7 @@ import asyncio import logging -from apify.log import ActorLogFormatter +from apify._log import ActorLogFormatter from .main import main diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 2190d446..72d992ba 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -12,7 +12,7 @@ import pytest from filelock import FileLock -import apify.actor +import apify._actor from apify_client import ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType @@ -35,7 +35,7 @@ def _reset_and_patch_default_instances() -> None: from crawlee import service_container cast(dict, service_container._services).clear() - delattr(apify.actor.Actor, '__wrapped__') + delattr(apify._actor.Actor, '__wrapped__') # TODO: StorageClientManager local storage client purge # noqa: TD003 diff --git a/tests/integration/test_actor_lifecycle.py b/tests/integration/test_actor_lifecycle.py index eae85700..84d1b22f 100644 --- a/tests/integration/test_actor_lifecycle.py +++ b/tests/integration/test_actor_lifecycle.py @@ -42,11 +42,11 @@ async def main() -> None: async def test_async_with_actor_properly_initialize(self: TestActorInit, make_actor: ActorFactory) -> None: async def main() -> None: - import apify.actor + import apify._actor async with Actor: - assert apify.actor.Actor._is_initialized - assert apify.actor.Actor._is_initialized is False + assert apify._actor.Actor._is_initialized + assert apify._actor.Actor._is_initialized is False actor = await make_actor('with-actor-init', main_func=main) diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index 7077eaf4..9c2c83be 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -13,7 +13,7 @@ async def test_actor_log(self: TestActorLog, make_actor: ActorFactory) -> None: async def main() -> None: import logging - from apify.log import ActorLogFormatter, logger + from apify._log import ActorLogFormatter, logger # Clear any other log handlers, so they don't mess with this test client_logger = logging.getLogger('apify_client') diff --git a/tests/unit/actor/test_actor_helpers.py b/tests/unit/actor/test_actor_helpers.py index 4839fa25..9f185c79 
100644 --- a/tests/unit/actor/test_actor_helpers.py +++ b/tests/unit/actor/test_actor_helpers.py @@ -2,7 +2,8 @@ from typing import TYPE_CHECKING -from apify.actor import Actor, _ActorType +from apify import Actor +from apify._actor import _ActorType from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars, WebhookEventType diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 576f26df..a9d7a37b 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -5,8 +5,8 @@ import pytest from apify import Actor +from apify._consts import ENCRYPTED_INPUT_VALUE_PREFIX from apify._crypto import public_encrypt -from apify.consts import ENCRYPTED_INPUT_VALUE_PREFIX from apify_shared.consts import ApifyEnvVars from apify_shared.utils import json_dumps diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index a4f3d33f..94a6c81c 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -9,8 +9,9 @@ import websockets.server from lazy_object_proxy import Proxy -import apify.actor -from apify.actor import Actor, _ActorType +import apify._actor +from apify import Actor +from apify._actor import _ActorType from apify_shared.consts import ApifyEnvVars from crawlee.events._types import Event, EventPersistStateData @@ -18,9 +19,9 @@ class TestActorInit: async def test_async_with_actor_properly_initialize(self: TestActorInit) -> None: async with Actor: - assert cast(Proxy, apify.actor.Actor).__wrapped__ is not None - assert cast(Proxy, apify.actor.Actor).__wrapped__._is_initialized - assert not cast(Proxy, apify.actor.Actor).__wrapped__._is_initialized + assert cast(Proxy, apify._actor.Actor).__wrapped__ is not None + assert cast(Proxy, apify._actor.Actor).__wrapped__._is_initialized + assert not cast(Proxy, apify._actor.Actor).__wrapped__._is_initialized async def test_actor_init(self: TestActorInit) -> None: my_actor = _ActorType() diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index d3a9fc3a..ee544bec 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING from apify import Actor, __version__ -from apify.log import logger +from apify._log import logger from apify_client import __version__ as apify_client_version if TYPE_CHECKING: diff --git a/tests/unit/actor/test_actor_non_default_instance.py b/tests/unit/actor/test_actor_non_default_instance.py index 639e80d2..e9d34a0b 100644 --- a/tests/unit/actor/test_actor_non_default_instance.py +++ b/tests/unit/actor/test_actor_non_default_instance.py @@ -1,7 +1,6 @@ from datetime import timedelta -from apify import Actor -from apify.config import Configuration +from apify import Actor, Configuration async def test_actor_non_default_instance() -> None: diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 0a4d694e..551b8656 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -8,7 +8,7 @@ import pytest -import apify.actor +import apify._actor from apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee.configuration import Configuration as CrawleeConfiguration @@ -41,7 +41,7 @@ def reset() -> None: cast(dict, service_container._services).clear() - delattr(apify.actor.Actor, '__wrapped__') + delattr(apify._actor.Actor, '__wrapped__') # TODO: local 
storage client purge # noqa: TD003 return reset diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 5278f25c..1f8c80bb 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -11,8 +11,8 @@ import websockets import websockets.server +from apify import Configuration from apify._platform_event_manager import EventManager, PlatformEventManager, SystemInfoEventData -from apify.config import Configuration from apify_shared.consts import ActorEnvVars from crawlee.events._types import Event diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index 3487c23b..b9ee787a 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -9,7 +9,7 @@ import httpx import pytest -from apify.proxy_configuration import ProxyConfiguration, is_url +from apify._proxy_configuration import ProxyConfiguration, is_url from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars From c944c15125aab5331eaf4e09f4e960ae82596d80 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 26 Aug 2024 17:44:13 +0200 Subject: [PATCH 63/68] Hide non-public parts of apify_storage_client --- src/apify/_actor.py | 2 +- src/apify/apify_storage_client/__init__.py | 3 +++ ...fy_storage_client.py => _apify_storage_client.py} | 12 ++++++------ .../{dataset_client.py => _dataset_client.py} | 0 ...ction_client.py => _dataset_collection_client.py} | 0 ...ue_store_client.py => _key_value_store_client.py} | 0 ...ient.py => _key_value_store_collection_client.py} | 0 ...uest_queue_client.py => _request_queue_client.py} | 0 ...client.py => _request_queue_collection_client.py} | 0 src/apify/scrapy/scheduler.py | 2 +- tests/integration/test_actor_key_value_store.py | 2 +- 11 files changed, 12 insertions(+), 9 deletions(-) rename src/apify/apify_storage_client/{apify_storage_client.py => _apify_storage_client.py} (77%) rename src/apify/apify_storage_client/{dataset_client.py => _dataset_client.py} (100%) rename src/apify/apify_storage_client/{dataset_collection_client.py => _dataset_collection_client.py} (100%) rename src/apify/apify_storage_client/{key_value_store_client.py => _key_value_store_client.py} (100%) rename src/apify/apify_storage_client/{key_value_store_collection_client.py => _key_value_store_collection_client.py} (100%) rename src/apify/apify_storage_client/{request_queue_client.py => _request_queue_client.py} (100%) rename src/apify/apify_storage_client/{request_queue_collection_client.py => _request_queue_collection_client.py} (100%) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index aae51f0f..efca7763 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -17,7 +17,7 @@ from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager from apify._proxy_configuration import ProxyConfiguration from apify._utils import get_system_info, is_running_in_ipython -from apify.apify_storage_client.apify_storage_client import ApifyStorageClient +from apify.apify_storage_client import ApifyStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType diff --git a/src/apify/apify_storage_client/__init__.py b/src/apify/apify_storage_client/__init__.py index e69de29b..8b6d517c 100644 --- a/src/apify/apify_storage_client/__init__.py +++ b/src/apify/apify_storage_client/__init__.py @@ -0,0 
+1,3 @@ +from apify.apify_storage_client._apify_storage_client import ApifyStorageClient + +__all__ = ['ApifyStorageClient'] diff --git a/src/apify/apify_storage_client/apify_storage_client.py b/src/apify/apify_storage_client/_apify_storage_client.py similarity index 77% rename from src/apify/apify_storage_client/apify_storage_client.py rename to src/apify/apify_storage_client/_apify_storage_client.py index b3c216a0..0b7a0c29 100644 --- a/src/apify/apify_storage_client/apify_storage_client.py +++ b/src/apify/apify_storage_client/_apify_storage_client.py @@ -1,12 +1,12 @@ from typing_extensions import override from apify._configuration import Configuration -from apify.apify_storage_client.dataset_client import DatasetClient -from apify.apify_storage_client.dataset_collection_client import DatasetCollectionClient -from apify.apify_storage_client.key_value_store_client import KeyValueStoreClient -from apify.apify_storage_client.key_value_store_collection_client import KeyValueStoreCollectionClient -from apify.apify_storage_client.request_queue_client import RequestQueueClient -from apify.apify_storage_client.request_queue_collection_client import RequestQueueCollectionClient +from apify.apify_storage_client._dataset_client import DatasetClient +from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient +from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient +from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient +from apify.apify_storage_client._request_queue_client import RequestQueueClient +from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient from apify_client import ApifyClientAsync from crawlee._utils.crypto import crypto_random_object_id from crawlee.base_storage_client import BaseStorageClient diff --git a/src/apify/apify_storage_client/dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py similarity index 100% rename from src/apify/apify_storage_client/dataset_client.py rename to src/apify/apify_storage_client/_dataset_client.py diff --git a/src/apify/apify_storage_client/dataset_collection_client.py b/src/apify/apify_storage_client/_dataset_collection_client.py similarity index 100% rename from src/apify/apify_storage_client/dataset_collection_client.py rename to src/apify/apify_storage_client/_dataset_collection_client.py diff --git a/src/apify/apify_storage_client/key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py similarity index 100% rename from src/apify/apify_storage_client/key_value_store_client.py rename to src/apify/apify_storage_client/_key_value_store_client.py diff --git a/src/apify/apify_storage_client/key_value_store_collection_client.py b/src/apify/apify_storage_client/_key_value_store_collection_client.py similarity index 100% rename from src/apify/apify_storage_client/key_value_store_collection_client.py rename to src/apify/apify_storage_client/_key_value_store_collection_client.py diff --git a/src/apify/apify_storage_client/request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py similarity index 100% rename from src/apify/apify_storage_client/request_queue_client.py rename to src/apify/apify_storage_client/_request_queue_client.py diff --git a/src/apify/apify_storage_client/request_queue_collection_client.py b/src/apify/apify_storage_client/_request_queue_collection_client.py similarity index 100% rename from 
src/apify/apify_storage_client/request_queue_collection_client.py rename to src/apify/apify_storage_client/_request_queue_collection_client.py diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index c98cdcbb..a25045bb 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -3,7 +3,7 @@ import traceback from apify._configuration import Configuration -from apify.apify_storage_client.apify_storage_client import ApifyStorageClient +from apify.apify_storage_client import ApifyStorageClient try: from scrapy import Spider diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 8521821d..4e5f525b 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -189,7 +189,7 @@ async def test_get_public_url(self: TestGetPublicUrl, make_actor: ActorFactory) async def main() -> None: from typing import cast - from apify.apify_storage_client.key_value_store_client import KeyValueStoreClient + from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient async with Actor: public_api_url = Actor.config.api_public_base_url From c129c6820450d4db7e52b080efc30e0e0ff0238d Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 27 Aug 2024 14:43:59 +0200 Subject: [PATCH 64/68] Remove useless type info from docstrings --- src/apify/_actor.py | 259 ++++++++---------- src/apify/_crypto.py | 16 +- src/apify/_platform_event_manager.py | 4 +- src/apify/_proxy_configuration.py | 21 +- .../_key_value_store_client.py | 2 +- src/apify/scrapy/middlewares/apify_proxy.py | 10 +- tests/integration/conftest.py | 11 +- tests/unit/conftest.py | 14 +- 8 files changed, 154 insertions(+), 183 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index efca7763..1536d0d5 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -48,7 +48,7 @@ def __init__(self, config: Configuration | None = None) -> None: Note that you don't have to do this, all the functionality is accessible using the default instance (e.g. `Actor.open_dataset()`). Args: - config (Configuration, optional): The Actor configuration to be used. If not passed, a new Configuration instance will be created. + config: The Actor configuration to be used. If not passed, a new Configuration instance will be created. """ self._configuration = config or Configuration.get_global_configuration() self._apify_client = self.new_client() @@ -191,10 +191,10 @@ async def exit( and stops the event manager. Args: - exit_code (int, optional): The exit code with which the Actor should fail (defaults to `0`). - event_listeners_timeout (timedelta, optional): How long should the Actor wait for Actor event listeners to finish before exiting. - status_message (str, optional): The final status message that the Actor should display. - cleanup_timeout (timedelta, optional): How long we should wait for event listeners. + exit_code: The exit code with which the Actor should fail (defaults to `0`). + event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting. + status_message: The final status message that the Actor should display. + cleanup_timeout: How long we should wait for event listeners. """ self._raise_if_not_initialized() @@ -242,9 +242,9 @@ async def fail( but it additionally sets the exit code to `1` (by default). Args: - exit_code (int, optional): The exit code with which the Actor should fail (defaults to `1`). 
- exception (BaseException, optional): The exception with which the Actor failed. - status_message (str, optional): The final status message that the Actor should display. + exit_code: The exit code with which the Actor should fail (defaults to `1`). + exception: The exception with which the Actor failed. + status_message: The final status message that the Actor should display. """ self._raise_if_not_initialized() @@ -273,12 +273,11 @@ def new_client( That's useful if you want to use the client as a different Apify user than the SDK internals are using. Args: - token (str, optional): The Apify API token - api_url (str, optional): The URL of the Apify API server to which to connect to. Defaults to https://api.apify.com - max_retries (int, optional): How many times to retry a failed request at most - min_delay_between_retries (timedelta, optional): How long will the client wait between retrying requests - (increases exponentially from this value) - timeout (timedelta, optional): The socket timeout of the HTTP requests sent to the Apify API + token: The Apify API token + api_url: The URL of the Apify API server to which to connect to. Defaults to https://api.apify.com + max_retries: How many times to retry a failed request at most + min_delay_between_retries: How long will the client wait between retrying requests (increases exponentially from this value) + timeout: The socket timeout of the HTTP requests sent to the Apify API """ token = token or self._configuration.token api_url = api_url or self._configuration.api_base_url @@ -304,15 +303,14 @@ async def open_dataset( The actual data is stored either on the local filesystem or in the Apify cloud. Args: - id (str, optional): ID of the dataset to be opened. + id: ID of the dataset to be opened. If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. - name (str, optional): Name of the dataset to be opened. + name: Name of the dataset to be opened. If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. - force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible to combine local and cloud storage. - Returns: - Dataset: An instance of the `Dataset` class for the given ID or name. + Returns: An instance of the `Dataset` class for the given ID or name. """ self._raise_if_not_initialized() @@ -338,15 +336,14 @@ async def open_key_value_store( The actual data is stored either on a local filesystem or in the Apify cloud. Args: - id (str, optional): ID of the key-value store to be opened. + id: ID of the key-value store to be opened. If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. - name (str, optional): Name of the key-value store to be opened. + name: Name of the key-value store to be opened. If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. - force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible to combine local and cloud storage. - Returns: - KeyValueStore: An instance of the `KeyValueStore` class for the given ID or name. + Returns: An instance of the `KeyValueStore` class for the given ID or name. 
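For orientation while reviewing the reworded storage docstrings above, a minimal illustrative sketch (not part of the diff) of the documented helpers follows; the record keys and values are placeholders:

import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        # Push a record to the default dataset of this Actor run.
        await Actor.push_data({'url': 'https://example.com', 'status': 'done'})

        # Read and write records in the default key-value store.
        greeting = await Actor.get_value('GREETING', default_value='hello')
        await Actor.set_value('OUTPUT', {'greeting': greeting}, content_type='application/json')


asyncio.run(main())
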
""" self._raise_if_not_initialized() @@ -372,15 +369,14 @@ async def open_request_queue( and depth-first crawling orders. Args: - id (str, optional): ID of the request queue to be opened. + id: ID of the request queue to be opened. If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. - name (str, optional): Name of the request queue to be opened. + name: Name of the request queue to be opened. If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. - force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible to combine local and cloud storage. - Returns: - RequestQueue: An instance of the `RequestQueue` class for the given ID or name. + Returns: An instance of the `RequestQueue` class for the given ID or name. """ self._raise_if_not_initialized() @@ -391,11 +387,11 @@ async def open_request_queue( storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), ) - async def push_data(self, data: Any) -> None: + async def push_data(self, data: dict | list[dict]) -> None: """Store an object or a list of objects to the default dataset of the current Actor run. Args: - data (object or list of objects, optional): The data to push to the default dataset. + data: The data to push to the default dataset. """ self._raise_if_not_initialized() @@ -425,8 +421,8 @@ async def get_value(self, key: str, default_value: Any = None) -> Any: """Get a value from the default key-value store associated with the current Actor run. Args: - key (str): The key of the record which to retrieve. - default_value (Any, optional): Default value returned in case the record does not exist. + key: The key of the record which to retrieve. + default_value: Default value returned in case the record does not exist. """ self._raise_if_not_initialized() @@ -443,9 +439,9 @@ async def set_value( """Set or delete a value in the default key-value store associated with the current Actor run. Args: - key (str): The key of the record which to set. - value (any): The value of the record which to set, or None, if the record should be deleted. - content_type (str, optional): The content type which should be set to the value. + key: The key of the record which to set. + value: The value of the record which to set, or None, if the record should be deleted. + content_type: The content type which should be set to the value. """ self._raise_if_not_initialized() @@ -456,27 +452,27 @@ def on(self, event_name: Event, listener: Callable) -> Callable: """Add an event listener to the Actor's event manager. The following events can be emitted: - - `ActorEventTypes.SYSTEM_INFO`: + - `Event.SYSTEM_INFO`: Emitted every minute, the event data contains info about the resource usage of the Actor. - - `ActorEventTypes.MIGRATING`: + - `Event.MIGRATING`: Emitted when the Actor running on the Apify platform is going to be migrated to another worker server soon. You can use it to persist the state of the Actor and gracefully stop your in-progress tasks, so that they are not interrupted by the migration.. - - `ActorEventTypes.PERSIST_STATE`: + - `Event.PERSIST_STATE`: Emitted in regular intervals (by default 60 seconds) to notify the Actor that it should persist its state, in order to avoid repeating all work when the Actor restarts. 
This event is automatically emitted together with the migrating event, in which case the `isMigrating` flag in the event data is set to True, otherwise the flag is False. Note that this event is provided merely for your convenience, you can achieve the same effect using an interval and listening for the migrating event. - - `ActorEventTypes.ABORTING`: + - `Event.ABORTING`: When a user aborts an Actor run on the Apify platform, they can choose to abort it gracefully, to allow the Actor some time before getting terminated. This graceful abort emits the aborting event, which you can use to clean up the Actor state. Args: - event_name (ActorEventTypes): The Actor event for which to listen to. - listener (Callable): The function which is to be called when the event is emitted (can be async). + event_name: The Actor event for which to listen to. + listener: The function which is to be called when the event is emitted (can be async). """ self._raise_if_not_initialized() @@ -487,8 +483,8 @@ def off(self, event_name: Event, listener: Callable | None = None) -> None: """Remove a listener, or all listeners, from an Actor event. Args: - event_name (ActorEventTypes): The Actor event for which to remove listeners. - listener (Callable, optional): The listener which is supposed to be removed. If not passed, all listeners of this event are removed. + event_name: The Actor event for which to remove listeners. + listener: The listener which is supposed to be removed. If not passed, all listeners of this event are removed. """ self._raise_if_not_initialized() @@ -545,29 +541,25 @@ async def start( Unlike `Actor.call`, this method just starts the run without waiting for finish. Args: - actor_id (str): The ID of the Actor to be run. - run_input (Any, optional): The input to pass to the Actor run. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - content_type (str, optional): The content type of the input. - build (str, optional): Specifies the Actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the Actor (typically latest). - memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the Actor. - timeout (timedelta, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the Actor. - wait_for_finish (int, optional): The maximum number of seconds the server waits for the run to finish. - By default, it is 0, the maximum value is 300. - webhooks (list of dict, optional): Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) - associated with the Actor run which can be used to receive a notification, - e.g. when the Actor finished or failed. - If you already have a webhook set up for the Actor or task, you do not have to add it again here. - Each webhook is represented by a dictionary containing these items: - * ``event_types``: list of ``WebhookEventType`` values which trigger the webhook - * ``request_url``: URL to which to send the webhook HTTP request - * ``payload_template`` (optional): Optional template for the request payload - - Returns: - dict: Info about the started Actor run + actor_id: The ID of the Actor to be run. + run_input: The input to pass to the Actor run. 
+ token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). + content_type: The content type of the input. + build: Specifies the Actor build to run. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. + By default, the run uses a memory limit specified in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. + By default, the run uses timeout specified in the default run configuration for the Actor. + wait_for_finish: The maximum number of seconds the server waits for the run to finish. By default, it is 0, the maximum value is 300. + webhooks: Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) associated with the Actor run which can be used to + receive a notification, e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor or task, + you do not have to add it again here. Each webhook is represented by a dictionary containing these items: + * ``event_types``: list of ``WebhookEventType`` values which trigger the webhook + * ``request_url``: URL to which to send the webhook HTTP request + * ``payload_template`` (optional): Optional template for the request payload + + Returns: Info about the started Actor run """ self._raise_if_not_initialized() @@ -594,15 +586,14 @@ async def abort( """Abort given Actor run on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable). Args: - run_id (str): The ID of the Actor run to be aborted. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - status_message (str, optional): Status message of the Actor to be set on the platform. - gracefully (bool, optional): If True, the Actor run will abort gracefully. + run_id: The ID of the Actor run to be aborted. + token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). + status_message: Status message of the Actor to be set on the platform. + gracefully: If True, the Actor run will abort gracefully. It will send ``aborting`` and ``persistStates`` events into the run and force-stop the run after 30 seconds. It is helpful in cases where you plan to resurrect the run later. - Returns: - dict: Info about the aborted Actor run + Returns: Info about the aborted Actor run """ self._raise_if_not_initialized() @@ -631,23 +622,21 @@ async def call( It waits indefinitely, unless the wait argument is provided. Args: - actor_id (str): The ID of the Actor to be run. - run_input (Any, optional): The input to pass to the Actor run. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - content_type (str, optional): The content type of the input. - build (str, optional): Specifies the Actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the Actor (typically latest). - memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the Actor. - timeout (timedelta, optional): Optional timeout for the run, in seconds. 
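The webhook shape described in the `Actor.start` docstring above is easier to see in code; a hedged sketch follows (the Actor ID and webhook URL are placeholders, and `WebhookEventType.ACTOR_RUN_SUCCEEDED` is assumed to be the relevant member of the enum already imported in `_actor.py`):

from apify import Actor
from apify_shared.consts import WebhookEventType


async def start_with_webhook() -> None:
    # Placeholder Actor ID and webhook URL; the dictionary keys follow the docstring above.
    run_info = await Actor.start(
        actor_id='someUser~some-actor',
        run_input={'startUrls': [{'url': 'https://example.com'}]},
        webhooks=[
            {
                'event_types': [WebhookEventType.ACTOR_RUN_SUCCEEDED],
                'request_url': 'https://example.com/my-webhook-endpoint',
            },
        ],
    )
    Actor.log.info(f'Started run: {run_info}')
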
- By default, the run uses timeout specified in the default run configuration for the Actor. - webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, - which can be used to receive a notification, e.g. when the Actor finished or failed. - If you already have a webhook set up for the Actor, you do not have to add it again here. - wait(timedelta, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. - - Returns: - dict: Info about the started Actor run + actor_id: The ID of the Actor to be run. + run_input: The input to pass to the Actor run. + token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). + content_type: The content type of the input. + build: Specifies the Actor build to run. It can be either a build tag or build number. By default, the run uses the build specified in + the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. + By default, the run uses a memory limit specified in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. + By default, the run uses timeout specified in the default run configuration for the Actor. + webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can be used to receive a notification, + e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor, you do not have to add it again here. + wait: The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. + + Returns: Info about the started Actor run """ self._raise_if_not_initialized() @@ -683,23 +672,21 @@ async def call_task( If you want to run an Actor directly rather than an Actor task, please use the `Actor.call` Args: - task_id (str): The ID of the Actor to be run. - task_input (Any, optional): Overrides the input to pass to the Actor run. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - content_type (str, optional): The content type of the input. - build (str, optional): Specifies the Actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the Actor (typically latest). - memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the Actor. - timeout (timedelta, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the Actor. - webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, - which can be used to receive a notification, e.g. when the Actor finished or failed. - If you already have a webhook set up for the Actor, you do not have to add it again here. - wait (timedelta, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. - - Returns: - dict: Info about the started Actor run + task_id: The ID of the Actor to be run. + task_input: Overrides the input to pass to the Actor run. + token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). + content_type: The content type of the input. 
+ build: Specifies the Actor build to run. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. + By default, the run uses a memory limit specified in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. + By default, the run uses timeout specified in the default run configuration for the Actor. + webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can be used to receive a notification, + e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor, you do not have to add it again here. + wait: The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. + + Returns: Info about the started Actor run """ self._raise_if_not_initialized() @@ -730,12 +717,12 @@ async def metamorph( and the new input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. Args: - target_actor_id (str): ID of the target Actor that the run should be transformed into - run_input (Any, optional): The input to pass to the new run. - target_actor_build (str, optional): The build of the target Actor. It can be either a build tag or build number. + target_actor_id: ID of the target Actor that the run should be transformed into + run_input: The input to pass to the new run. + target_actor_build: The build of the target Actor. It can be either a build tag or build number. By default, the run uses the build specified in the default run configuration for the target Actor (typically the latest build). - content_type (str, optional): The content type of the input. - custom_after_sleep (timedelta, optional): How long to sleep for after the metamorph, to wait for the container to be stopped. + content_type: The content type of the input. + custom_after_sleep: How long to sleep for after the metamorph, to wait for the container to be stopped. """ self._raise_if_not_initialized() @@ -771,8 +758,8 @@ async def reboot( The system stops the current container and starts a new one, with the same run ID and default storages. Args: - event_listeners_timeout (timedelta, optional): How long should the Actor wait for Actor event listeners to finish before exiting - custom_after_sleep (timedelta, optional): How long to sleep for after the reboot, to wait for the container to be stopped. + event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting + custom_after_sleep: How long to sleep for after the reboot, to wait for the container to be stopped. """ self._raise_if_not_initialized() @@ -815,17 +802,14 @@ async def add_webhook( For more information about Apify Actor webhooks, please see the [documentation](https://docs.apify.com/webhooks). Args: - event_types (list of WebhookEventType): List of event types that should trigger the webhook. At least one is required. - request_url (str): URL that will be invoked once the webhook is triggered. - payload_template (str, optional): Specification of the payload that will be sent to request_url - ignore_ssl_errors (bool, optional): Whether the webhook should ignore SSL errors returned by request_url - do_not_retry (bool, optional): Whether the webhook should retry sending the payload to request_url upon - failure. - idempotency_key (str, optional): A unique identifier of a webhook. 
You can use it to ensure that you won't - create the same webhook multiple times. - - Returns: - dict: The created webhook + event_types: List of event types that should trigger the webhook. At least one is required. + request_url: URL that will be invoked once the webhook is triggered. + payload_template: Specification of the payload that will be sent to request_url + ignore_ssl_errors: Whether the webhook should ignore SSL errors returned by request_url + do_not_retry: Whether the webhook should retry sending the payload to request_url upon failure. + idempotency_key: A unique identifier of a webhook. You can use it to ensure that you won't create the same webhook multiple times. + + Returns: The created webhook """ self._raise_if_not_initialized() @@ -856,11 +840,10 @@ async def set_status_message( """Set the status message for the current Actor run. Args: - status_message (str): The status message to set to the run. - is_terminal (bool, optional): Set this flag to True if this is the final status message of the Actor run. + status_message: The status message to set to the run. + is_terminal: Set this flag to True if this is the final status message of the Actor run. - Returns: - dict: The updated Actor run object + Returns: The updated Actor run object """ self._raise_if_not_initialized() @@ -895,17 +878,15 @@ async def create_proxy_configuration( For more details and code examples, see the `ProxyConfiguration` class. Args: - actor_proxy_input (dict, optional): Proxy configuration field from the Actor input, if input has such input field. + actor_proxy_input: Proxy configuration field from the Actor input, if input has such input field. If you pass this argument, all the other arguments will be inferred from it. - password (str, optional): Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. - groups (list of str, optional): Proxy groups which the Apify Proxy should use, if provided. - country_code (str, optional): Country which the Apify Proxy should use, if provided. - proxy_urls (list of str, optional): Custom proxy server URLs which should be rotated through. - new_url_function (Callable, optional): Function which returns a custom proxy URL to be used. - - Returns: - ProxyConfiguration, optional: ProxyConfiguration object with the passed configuration, - or None, if no proxy should be used based on the configuration. + password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. + groups: Proxy groups which the Apify Proxy should use, if provided. + country_code: Country which the Apify Proxy should use, if provided. + proxy_urls: Custom proxy server URLs which should be rotated through. + new_url_function: Function which returns a custom proxy URL to be used. + + Returns: ProxyConfiguration object with the passed configuration, or None, if no proxy should be used based on the configuration. """ self._raise_if_not_initialized() diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index 7d1edb13..f0c79593 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -25,11 +25,10 @@ def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict: It returns the encrypted password and encrypted value in BASE64 format. Args: - value (str): The value which should be encrypted. - public_key (RSAPublicKey): Public key to use for encryption. + value: The value which should be encrypted. + public_key: Public key to use for encryption. 
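Relating to the `create_proxy_configuration` docstring above, a short sketch of how the returned object is typically used (the group and country values are placeholders, and `new_url` is the URL-only helper mentioned in the proxy docstrings):

from apify import Actor


async def fetch_proxy_url() -> None:
    async with Actor:
        # Placeholder group/country values; the helper returns None if no proxy should be used.
        proxy_configuration = await Actor.create_proxy_configuration(
            groups=['RESIDENTIAL'],
            country_code='US',
        )
        if proxy_configuration is not None:
            proxy_url = await proxy_configuration.new_url()
            Actor.log.info(f'Using proxy: {proxy_url}')
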
- Returns: - disc: Encrypted password and value. + Returns: Encrypted password and value. """ key_bytes = crypto_random_object_id(ENCRYPTION_KEY_LENGTH).encode('utf-8') initialized_vector_bytes = crypto_random_object_id(ENCRYPTION_IV_LENGTH).encode('utf-8') @@ -65,12 +64,11 @@ def private_decrypt( """Decrypts the given encrypted value using the private key and password. Args: - encrypted_password (str): Password used to encrypt the private key encoded as base64 string. - encrypted_value (str): Encrypted value to decrypt as base64 string. - private_key (RSAPrivateKey): Private key to use for decryption. + encrypted_password: Password used to encrypt the private key encoded as base64 string. + encrypted_value: Encrypted value to decrypt as base64 string. + private_key: Private key to use for decryption. - Returns: - str: Decrypted value. + Returns: Decrypted value. """ encrypted_password_bytes = base64.b64decode(encrypted_password.encode('utf-8')) encrypted_value_bytes = base64.b64decode(encrypted_value.encode('utf-8')) diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py index dc352c8c..b7c56c91 100644 --- a/src/apify/_platform_event_manager.py +++ b/src/apify/_platform_event_manager.py @@ -129,8 +129,8 @@ def __init__(self, config: Configuration, **kwargs: Unpack[EventManagerOptions]) """Create an instance of the EventManager. Args: - config (Configuration): The Actor configuration to be used in this event manager. - kwargs (EventManagerOptions): Event manager options - forwarded to the base class + config: The Actor configuration to be used in this event manager. + kwargs: Event manager options - forwarded to the base class """ super().__init__(**kwargs) diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index 223f47e2..da9681ed 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -117,12 +117,12 @@ def __init__( """Create a ProxyConfiguration instance. It is highly recommended to use `Actor.create_proxy_configuration()` instead of this. Args: - password (str, optional): Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. - groups (list of str, optional): Proxy groups which the Apify Proxy should use, if provided. - country_code (str, optional): Country which the Apify Proxy should use, if provided. - proxy_urls (list of str, optional): Custom proxy server URLs which should be rotated through. - new_url_function (Callable, optional): Function which returns a custom proxy URL to be used. - tiered_proxy_urls (list of list of str, optional): Proxy URLs arranged into tiers + password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. + groups: Proxy groups which the Apify Proxy should use, if provided. + country_code: Country which the Apify Proxy should use, if provided. + proxy_urls: Custom proxy server URLs which should be rotated through. + new_url_function: Function which returns a custom proxy URL to be used. + tiered_proxy_urls: Proxy URLs arranged into tiers """ _actor_config = _actor_config or Configuration.get_global_configuration() @@ -193,15 +193,14 @@ async def new_proxy_info( If you need the URL string only, use `ProxyConfiguration.new_url`. Args: - session_id (int or str, optional): Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). + session_id: Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). 
All the HTTP requests going through the proxy with the same session identifier will use the same target proxy server (i.e. the same IP address). The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. - request (Request, optional): request for which the proxy info is being issued, used in proxy tier handling - proxy_tier (int, optional): allows forcing the proxy tier to be used + request: request for which the proxy info is being issued, used in proxy tier handling + proxy_tier: allows forcing the proxy tier to be used - Returns: - ProxyInfo: Dictionary that represents information about the proxy and its configuration. + Returns: Dictionary that represents information about the proxy and its configuration. """ if session_id is not None: _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX) diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py index 769d157e..d02d18cb 100644 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ b/src/apify/apify_storage_client/_key_value_store_client.py @@ -86,7 +86,7 @@ async def get_public_url(self, key: str) -> str: """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. Args: - key (str): The key for which the URL should be generated. + key: The key for which the URL should be generated. """ public_api_url = self._api_public_base_url diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index 3c0b9efc..3120f972 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -42,11 +42,9 @@ def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> Apify """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler. Args: - cls: Class type. crawler: Scrapy Crawler object. - Returns: - ApifyHttpProxyMiddleware: Instance of the class. + Returns: Instance of the class. """ proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS') @@ -73,9 +71,6 @@ async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spid Raises: ValueError: If username and password are not provided in the proxy URL. - - Returns: - None: The request is processed and middleware pipeline can continue. """ Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}') url = await self._get_new_proxy_url() @@ -122,8 +117,7 @@ async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult: Raises: NotConfigured: If creation of the proxy configuration fails. - Returns: - ParseResult: New proxy URL. + Returns: New proxy URL. """ # Get proxy configuration, creating it if necessary proxy_cfg = ( diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 72d992ba..366ff3c8 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -153,13 +153,12 @@ async def _make_actor( You have to pass exactly one of the `main_func`, `main_py` and `source_files` arguments. Args: - actor_label (str): The label which will be a part of the generated Actor name - main_func (Callable, optional): The main function of the Actor. - main_py (str, optional): The `src/main.py` file of the Actor. - source_files (dict, optional): A dictionary of the source files of the Actor. 
+ actor_label: The label which will be a part of the generated Actor name + main_func: The main function of the Actor. + main_py: The `src/main.py` file of the Actor. + source_files: A dictionary of the source files of the Actor. - Returns: - ActorClientAsync: A resource client for the created Actor. + Returns: A resource client for the created Actor. """ if not (main_func or main_py or source_files): raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified') diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 551b8656..661a7e52 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -82,13 +82,13 @@ def patch( One of `return_value` and `replacement_method` arguments must be specified. Args: - method (str): Which root method to patch in the ApifyClientAsync. - submethod (str): Which submethod to patch in the root method's result. - return_value (optional, Any): What should the patched method return. - replacement_method (optional, Callable): What method should the original method be replaced by. - is_async (optional, bool): Whether the return value or replacement method should be wrapped by an async wrapper, - in order to not break any `await` statements. - If not passed, it is automatically detected from the type of the method which is being replaced. + method: Which root method to patch in the ApifyClientAsync. + submethod: Which submethod to patch in the root method's result. + return_value: What should the patched method return. + replacement_method: What method should the original method be replaced by. + is_async: Whether the return value or replacement method should be wrapped by an async wrapper, + in order to not break any `await` statements. + If not passed, it is automatically detected from the type of the method which is being replaced. """ client_method = getattr(ApifyClientAsync, method, None) From 64159dc1a2260d2620f4f1f867fc314acbce20e2 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 27 Aug 2024 15:42:43 +0200 Subject: [PATCH 65/68] Add Configuration field descriptions --- src/apify/_configuration.py | 186 ++++++++++++++++++++++++++++++------ 1 file changed, 155 insertions(+), 31 deletions(-) diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 9612265a..e65c7102 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -24,7 +24,8 @@ class Configuration(CrawleeConfiguration): 'actor_id', 'apify_actor_id', 'apify_act_id', - ) + ), + description='ID of the Actor', ), ] = None @@ -35,7 +36,8 @@ class Configuration(CrawleeConfiguration): 'actor_run_id', 'apify_actor_run_id', 'apify_act_run_id', - ) + ), + description='ID of the Actor run', ), ] = None @@ -45,7 +47,8 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_build_id', 'apify_actor_build_id', - ) + ), + description='ID of the Actor build used in the run', ), ] = None @@ -55,7 +58,8 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_build_number', 'apify_actor_build_number', - ) + ), + description='Build number of the Actor build used in the run', ), ] = None @@ -65,7 +69,8 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_task_id', 'apify_actor_task_id', - ) + ), + description='ID of the Actor task. Empty if Actor is run outside of any task, e.g. 
directly using the API', ), ] = None @@ -75,19 +80,41 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_events_websocket_url', 'apify_actor_events_ws_url', - ) + ), + description='Websocket URL where Actor may listen for events from Actor platform', ), ] = None - api_base_url: Annotated[str, Field(alias='apify_api_base_url')] = 'https://api.apify.com' + api_base_url: Annotated[ + str, + Field( + alias='apify_api_base_url', + description='Internal URL of the Apify API. May be used to interact with the platform programmatically', + ), + ] = 'https://api.apify.com' - api_public_base_url: Annotated[str, Field(alias='apify_api_public_base_url')] = 'https://api.apify.com' + api_public_base_url: Annotated[ + str, + Field( + alias='apify_api_public_base_url', + description='Public URL of the Apify API. May be used to link to REST API resources', + ), + ] = 'https://api.apify.com' - dedicated_cpus: Annotated[float | None, Field(alias='apify_dedicated_cpus')] = None + dedicated_cpus: Annotated[ + float | None, + Field( + alias='apify_dedicated_cpus', + description='Number of CPU cores reserved for the actor, based on allocated memory', + ), + ] = None disable_outdated_warning: Annotated[ bool, - Field(alias='apify_disable_outdated_warning'), + Field( + alias='apify_disable_outdated_warning', + description='Controls the display of outdated SDK version warnings', + ), BeforeValidator(lambda val: val or False), ] = False @@ -100,37 +127,105 @@ class Configuration(CrawleeConfiguration): 'actor_input_key', 'apify_input_key', 'crawlee_input_key', - ) + ), + description='Key of the record in the default key-value store that holds the Actor input', ), ] = 'INPUT' - input_secrets_private_key_file: Annotated[str | None, Field(alias='apify_input_secrets_private_key_file')] = None + input_secrets_private_key_file: Annotated[ + str | None, + Field( + alias='apify_input_secrets_private_key_file', + description='Path to the secret key used to decrypt Secret inputs.', + ), + ] = None - input_secrets_private_key_passphrase: Annotated[str | None, Field(alias='apify_input_secrets_private_key_passphrase')] = None + input_secrets_private_key_passphrase: Annotated[ + str | None, + Field( + alias='apify_input_secrets_private_key_passphrase', + description='Passphrase for the input secret key', + ), + ] = None - is_at_home: Annotated[bool, Field(alias='apify_is_at_home')] = False + is_at_home: Annotated[ + bool, + Field( + alias='apify_is_at_home', + description='True if the Actor is running on Apify servers', + ), + ] = False - latest_sdk_version: Annotated[str | None, Field(alias='apify_sdk_latest_version', deprecated=True)] = None + latest_sdk_version: Annotated[ + str | None, + Field( + alias='apify_sdk_latest_version', + deprecated=True, + description='Specifies the most recent release version of the Apify SDK for Javascript. Used for checking for updates.', + ), + ] = None - log_format: Annotated[str | None, Field(alias='apify_log_format', deprecated=True)] = None + log_format: Annotated[ + str | None, + Field(alias='apify_log_format', deprecated=True), + ] = None max_paid_dataset_items: Annotated[ int | None, - Field(alias='actor_max_paid_dataset_items'), + Field( + alias='actor_max_paid_dataset_items', + description='For paid-per-result Actors, the user-set limit on returned results. 
Do not exceed this limit', + ), BeforeValidator(lambda val: val or None), ] = None - meta_origin: Annotated[str | None, Field(alias='apify_meta_origin')] = None + meta_origin: Annotated[ + str | None, + Field( + alias='apify_meta_origin', + description='Specifies how an Actor run was started', + ), + ] = None - metamorph_after_sleep: Annotated[timedelta_ms, Field(alias='apify_metamorph_after_sleep_millis')] = timedelta(minutes=5) + metamorph_after_sleep: Annotated[ + timedelta_ms, + Field( + alias='apify_metamorph_after_sleep_millis', + description='How long the Actor needs to wait before exiting after triggering a metamorph', + ), + ] = timedelta(minutes=5) - proxy_hostname: Annotated[str, Field(alias='apify_proxy_hostname')] = 'proxy.apify.com' + proxy_hostname: Annotated[ + str, + Field( + alias='apify_proxy_hostname', + description='Hostname of the Apify proxy', + ), + ] = 'proxy.apify.com' - proxy_password: Annotated[str | None, Field(alias='apify_proxy_password')] = None + proxy_password: Annotated[ + str | None, + Field( + alias='apify_proxy_password', + description='Password to the Apify proxy', + ), + ] = None - proxy_port: Annotated[int, Field(alias='apify_proxy_port')] = 8000 + proxy_port: Annotated[ + int, + Field( + alias='apify_proxy_port', + description='Port to communicate with the Apify proxy', + ), + ] = 8000 - proxy_status_url: Annotated[str, Field(alias='apify_proxy_status_url')] = 'http://proxy.apify.com' + proxy_status_url: Annotated[ + str, + Field( + alias='apify_proxy_status_url', + description='URL for retrieving proxy status information', + ), + ] = 'http://proxy.apify.com' started_at: Annotated[ datetime | None, @@ -138,7 +233,8 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_started_at', 'apify_started_at', - ) + ), + description='Date when the Actor was started', ), ] = None @@ -148,15 +244,34 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_timeout_at', 'apify_timeout_at', - ) + ), + description='Date when the Actor will time out', ), ] = None - standby_port: Annotated[int, Field(alias='actor_standby_port')] = 4322 + standby_port: Annotated[ + int, + Field( + alias='actor_standby_port', + description='TCP port for the Actor to start an HTTP server to receive messages in the Actor Standby mode', + ), + ] = 4322 - token: Annotated[str | None, Field(alias='apify_token')] = None + token: Annotated[ + str | None, + Field( + alias='apify_token', + description='API token of the user who started the Actor', + ), + ] = None - user_id: Annotated[str | None, Field(alias='apify_user_id')] = None + user_id: Annotated[ + str | None, + Field( + alias='apify_user_id', + description='ID of the user who started the Actor. 
May differ from the Actor owner', + ), + ] = None web_server_port: Annotated[ int, @@ -164,7 +279,9 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_web_server_port', 'apify_container_port', - ) + ), + description='TCP port for the Actor to start an HTTP server on' + 'This server can be used to receive external messages or expose monitoring and control interfaces', ), ] = 4321 @@ -174,11 +291,18 @@ class Configuration(CrawleeConfiguration): validation_alias=AliasChoices( 'actor_web_server_url', 'apify_container_url', - ) + ), + description='Unique public URL for accessing the Actor run web server from the outside world', ), ] = 'http://localhost:4321' - workflow_key: Annotated[str | None, Field(alias='apify_workflow_key')] = None + workflow_key: Annotated[ + str | None, + Field( + alias='apify_workflow_key', + description='Identifier used for grouping related runs and API calls together', + ), + ] = None # Monkey-patch the base class so that it works with the extended configuration From e4261ae4527aee71ca23ff6b1716889885f0a02e Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 27 Aug 2024 15:58:51 +0200 Subject: [PATCH 66/68] Shuffle around some imports --- pyproject.toml | 3 ++- src/apify/__init__.py | 3 ++- src/apify/_actor.py | 11 ++++++----- src/apify/_crypto.py | 3 ++- src/apify/_platform_event_manager.py | 3 ++- src/apify/_proxy_configuration.py | 5 +++-- .../apify_storage_client/_apify_storage_client.py | 7 ++++--- src/apify/scrapy/requests.py | 3 ++- src/apify/scrapy/scheduler.py | 3 ++- tests/integration/actor_source_base/src/__main__.py | 3 +-- tests/integration/conftest.py | 2 +- tests/integration/test_actor_api_helpers.py | 2 +- tests/integration/test_actor_dataset.py | 2 +- tests/integration/test_actor_events.py | 3 ++- tests/integration/test_actor_key_value_store.py | 2 +- tests/integration/test_actor_request_queue.py | 2 +- tests/integration/test_fixtures.py | 3 ++- .../actor/test_actor_create_proxy_configuration.py | 3 ++- tests/unit/actor/test_actor_dataset.py | 3 ++- tests/unit/actor/test_actor_env_helpers.py | 3 ++- tests/unit/actor/test_actor_helpers.py | 5 +++-- tests/unit/actor/test_actor_key_value_store.py | 6 +++--- tests/unit/actor/test_actor_lifecycle.py | 5 +++-- tests/unit/actor/test_actor_log.py | 3 ++- tests/unit/conftest.py | 3 ++- tests/unit/scrapy/requests/test_to_scrapy_request.py | 3 ++- tests/unit/test_event_manager.py | 5 +++-- tests/unit/test_proxy_configuration.py | 3 ++- 28 files changed, 61 insertions(+), 41 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 97214ec0..f630f420 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,7 +157,8 @@ docstring-quotes = "double" inline-quotes = "single" [tool.ruff.lint.isort] -known-first-party = ["apify", "apify_client", "apify_shared", "crawlee"] +known-local-folder = ["apify"] +known-first-party = ["apify_client", "apify_shared", "crawlee"] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/src/apify/__init__.py b/src/apify/__init__.py index 57cd6994..71ca3d2a 100644 --- a/src/apify/__init__.py +++ b/src/apify/__init__.py @@ -1,9 +1,10 @@ from importlib import metadata +from crawlee.events._types import Event + from apify._actor import Actor from apify._configuration import Configuration from apify._proxy_configuration import ProxyConfiguration, ProxyInfo -from crawlee.events._types import Event __version__ = metadata.version('apify') diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 1536d0d5..d2b85a7b 100644 --- 
a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -10,6 +10,12 @@ from pydantic import AliasChoices from typing_extensions import Self +from apify_client import ApifyClientAsync +from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType +from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value +from crawlee import service_container +from crawlee.events._types import Event, EventPersistStateData + from apify._configuration import Configuration from apify._consts import EVENT_LISTENERS_TIMEOUT from apify._crypto import decrypt_input_secrets, load_private_key @@ -19,11 +25,6 @@ from apify._utils import get_system_info, is_running_in_ipython from apify.apify_storage_client import ApifyStorageClient from apify.storages import Dataset, KeyValueStore, RequestQueue -from apify_client import ApifyClientAsync -from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType -from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value -from crawlee import service_container -from crawlee.events._types import Event, EventPersistStateData if TYPE_CHECKING: import logging diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index f0c79593..499beaa0 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -8,10 +8,11 @@ from cryptography.hazmat.primitives.asymmetric import padding, rsa from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes -from apify._consts import ENCRYPTED_INPUT_VALUE_REGEXP from apify_shared.utils import ignore_docs from crawlee._utils.crypto import crypto_random_object_id +from apify._consts import ENCRYPTED_INPUT_VALUE_REGEXP + ENCRYPTION_KEY_LENGTH = 32 ENCRYPTION_IV_LENGTH = 16 ENCRYPTION_AUTH_TAG_LENGTH = 16 diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py index b7c56c91..0eb0dda6 100644 --- a/src/apify/_platform_event_manager.py +++ b/src/apify/_platform_event_manager.py @@ -8,12 +8,13 @@ from pydantic import BaseModel, Discriminator, Field, TypeAdapter from typing_extensions import Self, Unpack, override -from apify._log import logger from apify_shared.utils import ignore_docs from crawlee.events._event_manager import EventManager, EventManagerOptions from crawlee.events._local_event_manager import LocalEventManager from crawlee.events._types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData +from apify._log import logger + if TYPE_CHECKING: from types import TracebackType diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index da9681ed..47347c8b 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -9,14 +9,15 @@ import httpx -from apify._configuration import Configuration -from apify._log import logger from apify_shared.consts import ApifyEnvVars from apify_shared.utils import ignore_docs from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo from crawlee.proxy_configuration import _NewUrlFunction +from apify._configuration import Configuration +from apify._log import logger + if TYPE_CHECKING: from apify_client import ApifyClientAsync from crawlee import Request diff --git a/src/apify/apify_storage_client/_apify_storage_client.py b/src/apify/apify_storage_client/_apify_storage_client.py index 0b7a0c29..1153e95d 100644 --- 
a/src/apify/apify_storage_client/_apify_storage_client.py +++ b/src/apify/apify_storage_client/_apify_storage_client.py @@ -1,5 +1,9 @@ from typing_extensions import override +from apify_client import ApifyClientAsync +from crawlee._utils.crypto import crypto_random_object_id +from crawlee.base_storage_client import BaseStorageClient + from apify._configuration import Configuration from apify.apify_storage_client._dataset_client import DatasetClient from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient @@ -7,9 +11,6 @@ from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient from apify.apify_storage_client._request_queue_client import RequestQueueClient from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient -from apify_client import ApifyClientAsync -from crawlee._utils.crypto import crypto_random_object_id -from crawlee.base_storage_client import BaseStorageClient class ApifyStorageClient(BaseStorageClient): diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 4ed40079..6d2fd348 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -13,11 +13,12 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify import Actor from crawlee import Request as CrawleeRequest from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id +from apify import Actor + def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False. diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index a25045bb..db8f6ad0 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -15,11 +15,12 @@ 'To use this module, you need to install the "scrapy" extra. 
Run "pip install apify[scrapy]".', ) from exc +from crawlee._utils.crypto import crypto_random_object_id + from apify import Actor from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import nested_event_loop from apify.storages import RequestQueue -from crawlee._utils.crypto import crypto_random_object_id class ApifyScheduler(BaseScheduler): diff --git a/tests/integration/actor_source_base/src/__main__.py b/tests/integration/actor_source_base/src/__main__.py index 48646f91..f6228448 100644 --- a/tests/integration/actor_source_base/src/__main__.py +++ b/tests/integration/actor_source_base/src/__main__.py @@ -3,9 +3,8 @@ import asyncio import logging -from apify._log import ActorLogFormatter - from .main import main +from apify._log import ActorLogFormatter handler = logging.StreamHandler() handler.setFormatter(ActorLogFormatter()) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 366ff3c8..e152da58 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -12,10 +12,10 @@ import pytest from filelock import FileLock -import apify._actor from apify_client import ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType +import apify._actor from ._utils import generate_unique_resource_name if TYPE_CHECKING: diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index db9f970b..589528fa 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -4,10 +4,10 @@ import json from typing import TYPE_CHECKING -from apify import Actor from crawlee._utils.crypto import crypto_random_object_id from ._utils import generate_unique_resource_name +from apify import Actor if TYPE_CHECKING: from apify_client import ApifyClientAsync diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index 1486dbca..e61446db 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -2,10 +2,10 @@ from typing import TYPE_CHECKING -from apify import Actor from apify_shared.consts import ApifyEnvVars from ._utils import generate_unique_resource_name +from apify import Actor if TYPE_CHECKING: import pytest diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index 19f3339f..460f456b 100644 --- a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -3,10 +3,11 @@ import asyncio from typing import TYPE_CHECKING -from apify import Actor from apify_shared.consts import ActorEventTypes from crawlee.events._types import Event +from apify import Actor + if TYPE_CHECKING: from .conftest import ActorFactory diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 4e5f525b..6a27ef40 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -2,10 +2,10 @@ from typing import TYPE_CHECKING -from apify import Actor from apify_shared.consts import ApifyEnvVars from ._utils import generate_unique_resource_name +from apify import Actor if TYPE_CHECKING: import pytest diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 33d8082c..adc5784a 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -2,11 +2,11 @@ from typing import TYPE_CHECKING 
-from apify import Actor from apify_shared.consts import ApifyEnvVars from crawlee import Request from ._utils import generate_unique_resource_name +from apify import Actor if TYPE_CHECKING: import pytest diff --git a/tests/integration/test_fixtures.py b/tests/integration/test_fixtures.py index a0f256ad..93ff5588 100644 --- a/tests/integration/test_fixtures.py +++ b/tests/integration/test_fixtures.py @@ -3,9 +3,10 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING -from apify import Actor from crawlee._utils.crypto import crypto_random_object_id +from apify import Actor + if TYPE_CHECKING: from apify_client import ApifyClientAsync diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index 29c6e928..8dd4db95 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -5,10 +5,11 @@ import httpx import pytest -from apify import Actor from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars +from apify import Actor + if TYPE_CHECKING: from respx import MockRouter diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index 7e0b384b..10400069 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -4,9 +4,10 @@ import pytest -from apify import Actor from apify_shared.consts import ActorEnvVars +from apify import Actor + if TYPE_CHECKING: from crawlee.memory_storage_client import MemoryStorageClient diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index 44483cd0..36a5268f 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -7,9 +7,10 @@ from pydantic_core import TzInfo -from apify import Actor from apify_shared.consts import BOOL_ENV_VARS, DATETIME_ENV_VARS, FLOAT_ENV_VARS, INTEGER_ENV_VARS, STRING_ENV_VARS, ActorEnvVars, ApifyEnvVars +from apify import Actor + if TYPE_CHECKING: import pytest diff --git a/tests/unit/actor/test_actor_helpers.py b/tests/unit/actor/test_actor_helpers.py index 9f185c79..0d6a08d0 100644 --- a/tests/unit/actor/test_actor_helpers.py +++ b/tests/unit/actor/test_actor_helpers.py @@ -2,11 +2,12 @@ from typing import TYPE_CHECKING -from apify import Actor -from apify._actor import _ActorType from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars, WebhookEventType +from apify import Actor +from apify._actor import _ActorType + if TYPE_CHECKING: import pytest diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index a9d7a37b..5d855f36 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -4,13 +4,13 @@ import pytest -from apify import Actor -from apify._consts import ENCRYPTED_INPUT_VALUE_PREFIX -from apify._crypto import public_encrypt from apify_shared.consts import ApifyEnvVars from apify_shared.utils import json_dumps from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY +from apify import Actor +from apify._consts import ENCRYPTED_INPUT_VALUE_PREFIX +from apify._crypto import public_encrypt if TYPE_CHECKING: from crawlee.memory_storage_client import MemoryStorageClient diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index 94a6c81c..8053f2f3 100644 --- 
a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -9,11 +9,12 @@ import websockets.server from lazy_object_proxy import Proxy +from apify_shared.consts import ApifyEnvVars +from crawlee.events._types import Event, EventPersistStateData + import apify._actor from apify import Actor from apify._actor import _ActorType -from apify_shared.consts import ApifyEnvVars -from crawlee.events._types import Event, EventPersistStateData class TestActorInit: diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index ee544bec..9103c7c3 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -5,9 +5,10 @@ import sys from typing import TYPE_CHECKING +from apify_client import __version__ as apify_client_version + from apify import Actor, __version__ from apify._log import logger -from apify_client import __version__ as apify_client_version if TYPE_CHECKING: import pytest diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 661a7e52..2c441883 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -8,12 +8,13 @@ import pytest -import apify._actor from apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.memory_storage_client import MemoryStorageClient +import apify._actor + if TYPE_CHECKING: from pathlib import Path diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index 253f316b..8c9ebe4f 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -6,9 +6,10 @@ from scrapy import Request, Spider from scrapy.http.headers import Headers -from apify.scrapy.requests import to_scrapy_request from crawlee import Request as CrawleeRequest +from apify.scrapy.requests import to_scrapy_request + class DummySpider(Spider): name = 'dummy_spider' diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 1f8c80bb..80977e97 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -11,11 +11,12 @@ import websockets import websockets.server -from apify import Configuration -from apify._platform_event_manager import EventManager, PlatformEventManager, SystemInfoEventData from apify_shared.consts import ActorEnvVars from crawlee.events._types import Event +from apify import Configuration +from apify._platform_event_manager import EventManager, PlatformEventManager, SystemInfoEventData + class TestEventManagerLocal: async def test_lifecycle_local(self, caplog: pytest.LogCaptureFixture) -> None: diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index b9ee787a..7074e395 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -9,10 +9,11 @@ import httpx import pytest -from apify._proxy_configuration import ProxyConfiguration, is_url from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars +from apify._proxy_configuration import ProxyConfiguration, is_url + if TYPE_CHECKING: from respx import MockRouter From 671b2cefe06724ea40b64bf35ac4f25bb1458b6d Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 27 Aug 2024 16:17:37 +0200 Subject: [PATCH 67/68] Remove obsolete stuff --- src/apify/_utils.py | 5 ----- src/apify/storages/__init__.py | 1 - 2 files changed, 6 
deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index ff1426fd..687bf93c 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -3,11 +3,6 @@ import builtins import sys from importlib import metadata -from logging import getLogger -from typing import TypeVar - -T = TypeVar('T') -logger = getLogger(__name__) def get_system_info() -> dict: diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index 3e168046..2ed85e84 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,4 +1,3 @@ -# ruff: noqa: PLC0414 from crawlee.storages import Dataset, KeyValueStore, RequestQueue __all__ = ['Dataset', 'KeyValueStore', 'RequestQueue'] From 53c06a6e521a688121b5e951aee8c9de7567fd8a Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 27 Aug 2024 16:39:46 +0200 Subject: [PATCH 68/68] Fix integration tests --- tests/integration/conftest.py | 1 - tests/integration/test_actor_events.py | 4 ++-- tests/integration/test_actor_log.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index e152da58..040bc71a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -177,7 +177,6 @@ async def _make_actor( 'import asyncio', '', 'from apify import Actor', - 'from crawlee.events.types import Event', '', '', '', diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index 460f456b..f1a89ace 100644 --- a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING from apify_shared.consts import ActorEventTypes -from crawlee.events._types import Event from apify import Actor @@ -20,7 +19,7 @@ async def main() -> None: from typing import Any, Callable from apify_shared.consts import ActorEventTypes, ApifyEnvVars - from crawlee.events._types import EventSystemInfoData + from crawlee.events._types import Event, EventSystemInfoData os.environ[ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS] = '900' @@ -71,6 +70,7 @@ async def main() -> None: import os from apify_shared.consts import ApifyEnvVars + from crawlee.events._types import Event os.environ[ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS] = '100' diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index 9c2c83be..dcfe8d8d 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -80,7 +80,7 @@ async def main() -> None: assert run_log_lines.pop(0) == '[apify] ERROR Error message' assert run_log_lines.pop(0) == '[apify] ERROR Exception message' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' - assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 36, in main' + assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 35, in main' assert run_log_lines.pop(0) == " raise ValueError('Dummy ValueError')" assert run_log_lines.pop(0) == ' ValueError: Dummy ValueError' assert run_log_lines.pop(0) == '[apify] INFO Multi' @@ -89,7 +89,7 @@ async def main() -> None: assert run_log_lines.pop(0) == 'message' assert run_log_lines.pop(0) == '[apify] ERROR Actor failed with an exception' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' - assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 44, in main' + assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 43, in main' assert run_log_lines.pop(0) == " raise RuntimeError('Dummy 
RuntimeError')" assert run_log_lines.pop(0) == ' RuntimeError: Dummy RuntimeError' assert run_log_lines.pop(0) == '[apify] INFO Exiting Actor ({"exit_code": 91})'