diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a7cb99ad..bb92f3e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,7 +50,7 @@ tests with HTML coverage report execute `make unit-tests-cov`. ## Integration tests -We have integration tests which build and run actors using the Python SDK on the Apify Platform. To run these tests, +We have integration tests which build and run Actors using the Python SDK on the Apify Platform. To run these tests, you need to set the `APIFY_TEST_USER_API_TOKEN` environment variable to the API token of the Apify user you want to use for the tests, and then start them with `make integration-tests`. diff --git a/docs/02-guides/02-beautiful-soup.mdx b/docs/02-guides/02-beautiful-soup.mdx index a625741f..a7ebdc84 100644 --- a/docs/02-guides/02-beautiful-soup.mdx +++ b/docs/02-guides/02-beautiful-soup.mdx @@ -36,7 +36,7 @@ async def main(): max_depth = actor_input.get('max_depth', 1) if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Enqueue the starting URLs in the default request queue diff --git a/docs/02-guides/03-playwright.mdx b/docs/02-guides/03-playwright.mdx index 8094e621..a46f578f 100644 --- a/docs/02-guides/03-playwright.mdx +++ b/docs/02-guides/03-playwright.mdx @@ -29,7 +29,7 @@ To create Actors which use Playwright, start from the [Playwright & Python](http On the Apify platform, the Actor will already have Playwright and the necessary browsers preinstalled in its Docker image, including the tools and setup necessary to run browsers in headful mode. -When running the Actor locally, you'll need to finish the Playwright setup yourself before you can run the actor. +When running the Actor locally, you'll need to finish the Playwright setup yourself before you can run the Actor. 
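For orientation, the guide hunks above and below all touch the same start-URL check and only reword "actor input" to "Actor input" in the log message. A minimal sketch of that pattern, assembled from the guide code visible in these hunks (the `apify` entry point and the default start URL are assumptions for illustration, not part of this patch):

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Read the Actor input from the default key-value store
        actor_input = await Actor.get_input() or {}
        # Default start URL is illustrative only
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)  # used later when enqueueing crawled links

        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()
```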
@@ -69,7 +69,7 @@ async def main(): max_depth = actor_input.get('max_depth', 1) if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Enqueue the starting URLs in the default request queue diff --git a/docs/02-guides/04-selenium.mdx b/docs/02-guides/04-selenium.mdx index 3efa5149..3fb77d7c 100644 --- a/docs/02-guides/04-selenium.mdx +++ b/docs/02-guides/04-selenium.mdx @@ -53,7 +53,7 @@ async def main(): max_depth = actor_input.get('max_depth', 1) if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Enqueue the starting URLs in the default request queue diff --git a/docs/02-guides/05-scrapy.mdx b/docs/02-guides/05-scrapy.mdx index f73c4a3c..ea9825d0 100644 --- a/docs/02-guides/05-scrapy.mdx +++ b/docs/02-guides/05-scrapy.mdx @@ -87,7 +87,7 @@ class TitleSpider(scrapy.Spider): if link_url.startswith(('http://', 'https://')): yield scrapy.Request(link_url) -# Pushes the scraped items into the actor's default dataset +# Pushes the scraped items into the Actor's default dataset class ActorDatasetPushPipeline: async def process_item(self, item, spider): item_dict = ItemAdapter(item).asdict() diff --git a/docs/03-concepts/04-actor-events.mdx b/docs/03-concepts/04-actor-events.mdx index 8d795cab..c041035b 100644 --- a/docs/03-concepts/04-actor-events.mdx +++ b/docs/03-concepts/04-actor-events.mdx @@ -91,7 +91,7 @@ async def main(): # Save the state when the `PERSIST_STATE` event happens async def save_state(event_data): nonlocal processed_items - Actor.log.info('Saving actor state', extra=event_data) + Actor.log.info('Saving Actor state', extra=event_data) await Actor.set_value('STATE', processed_items) Actor.on(ActorEventTypes.PERSIST_STATE, save_state) diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index df472b2f..00000000 --- a/mypy.ini +++ /dev/null @@ -1,22 +0,0 @@ -[mypy] -python_version = 3.8 -files = - scripts, - src, - tests -check_untyped_defs = True -disallow_incomplete_defs = True -disallow_untyped_calls = True -disallow_untyped_decorators = True -disallow_untyped_defs = True -no_implicit_optional = True -warn_redundant_casts = True -warn_return_any = True -warn_unreachable = True -warn_unused_ignores = True - -[mypy-scrapy.*] -ignore_missing_imports = True - -[mypy-sortedcollections.*] -ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index 7688aaa4..f630f420 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "apify" -version = "1.7.3" +version = "2.0.0" description = "Apify SDK for Python" readme = "README.md" license = { text = "Apache Software License" } @@ -20,7 +20,7 @@ classifiers = [ "Topic :: Software Development :: Libraries", ] -requires-python = ">=3.8" +requires-python = ">=3.9" # We use inclusive ordered comparison clause for non-Apify packages intentionally in order to enhance the Apify SDK's # compatibility with a wide range of external packages. 
This decision was discussed in detail in the following PR: @@ -31,8 +31,10 @@ dependencies = [ "aiofiles >= 22.1.0", "aioshutil >= 1.0", "colorama >= 0.4.6", + "crawlee >= 0.3.0", "cryptography >= 39.0.0", "httpx >= 0.24.0", + "lazy-object-proxy >= 1.10.0", "psutil >= 5.9.0", "pyee >= 11.0.0", "sortedcollections >= 2.0.0", @@ -90,12 +92,16 @@ line-length = 150 [tool.ruff.lint] select = ["ALL"] ignore = [ + "A002", # Argument is shadowing a Python builtin + "ANN101", # Missing type annotation for `self` in method + "ANN102", # Missing type annotation for `cls` in method "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename} "BLE001", # Do not catch blind exception "C901", # `{name}` is too complex "COM812", # This rule may cause conflicts when used with the formatter "D100", # Missing docstring in public module "D104", # Missing docstring in public package + "D107", # Missing docstring in `__init__` "EM", # flake8-errmsg "G004", # Logging statement uses f-string "ISC001", # This rule may cause conflicts when used with the formatter @@ -152,6 +158,33 @@ inline-quotes = "single" [tool.ruff.lint.isort] known-local-folder = ["apify"] +known-first-party = ["apify_client", "apify_shared", "crawlee"] [tool.ruff.lint.pydocstyle] convention = "google" + +[tool.basedpyright] +typeCheckingMode = "standard" + +[tool.pytest.ini_options] +asyncio_mode = "auto" +timeout = 1200 + +[tool.mypy] +python_version = "3.9" +plugins = ["pydantic.mypy"] +files = ["scripts", "src", "tests"] +check_untyped_defs = true +disallow_incomplete_defs = true +disallow_untyped_calls = true +disallow_untyped_decorators = true +disallow_untyped_defs = true +no_implicit_optional = true +warn_redundant_casts = true +warn_return_any = true +warn_unreachable = true +warn_unused_ignores = true + +[[tool.mypy.overrides]] +module = ['scrapy', 'scrapy.*', 'sortedcollections', 'lazy_object_proxy'] +ignore_missing_imports = true diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 5231254c..00000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -asyncio_mode = auto -timeout = 1200 diff --git a/src/apify/__init__.py b/src/apify/__init__.py index d9a2e5b2..71ca3d2a 100644 --- a/src/apify/__init__.py +++ b/src/apify/__init__.py @@ -1,9 +1,11 @@ from importlib import metadata -from .actor import Actor -from .config import Configuration -from .proxy_configuration import ProxyConfiguration, ProxyInfo +from crawlee.events._types import Event + +from apify._actor import Actor +from apify._configuration import Configuration +from apify._proxy_configuration import ProxyConfiguration, ProxyInfo __version__ = metadata.version('apify') -__all__ = ['Actor', 'Configuration', 'ProxyConfiguration', 'ProxyInfo', '__version__'] +__all__ = ['Actor', 'Event', 'Configuration', 'ProxyConfiguration', 'ProxyInfo', '__version__'] diff --git a/src/apify/_actor.py b/src/apify/_actor.py new file mode 100644 index 00000000..d2b85a7b --- /dev/null +++ b/src/apify/_actor.py @@ -0,0 +1,919 @@ +from __future__ import annotations + +import asyncio +import os +import sys +from datetime import timedelta +from typing import TYPE_CHECKING, Any, Callable, TypeVar, cast + +from lazy_object_proxy import Proxy +from pydantic import AliasChoices +from typing_extensions import Self + +from apify_client import ApifyClientAsync +from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars, WebhookEventType +from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value +from crawlee import 
service_container +from crawlee.events._types import Event, EventPersistStateData + +from apify._configuration import Configuration +from apify._consts import EVENT_LISTENERS_TIMEOUT +from apify._crypto import decrypt_input_secrets, load_private_key +from apify._log import logger +from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager +from apify._proxy_configuration import ProxyConfiguration +from apify._utils import get_system_info, is_running_in_ipython +from apify.apify_storage_client import ApifyStorageClient +from apify.storages import Dataset, KeyValueStore, RequestQueue + +if TYPE_CHECKING: + import logging + from types import TracebackType + + from crawlee.proxy_configuration import _NewUrlFunction + + +MainReturnType = TypeVar('MainReturnType') + + +class _ActorType: + """The class of `Actor`. Only make a new instance if you're absolutely sure you need to.""" + + _apify_client: ApifyClientAsync + _configuration: Configuration + _is_exiting = False + + def __init__(self, config: Configuration | None = None) -> None: + """Create an Actor instance. + + Note that you don't have to do this, all the functionality is accessible using the default instance (e.g. `Actor.open_dataset()`). + + Args: + config: The Actor configuration to be used. If not passed, a new Configuration instance will be created. + """ + self._configuration = config or Configuration.get_global_configuration() + self._apify_client = self.new_client() + + self._event_manager: EventManager + if self._configuration.is_at_home: + self._event_manager = PlatformEventManager( + config=self._configuration, + persist_state_interval=self._configuration.persist_state_interval, + ) + else: + self._event_manager = LocalEventManager( + system_info_interval=self._configuration.system_info_interval, + persist_state_interval=self._configuration.persist_state_interval, + ) + + self._is_initialized = False + + @ignore_docs + async def __aenter__(self) -> Self: + """Initialize the Actor. + + Automatically initializes the Actor instance when you use it in an `async with ...` statement. + + When you exit the `async with` block, the `Actor.exit()` method is called, + and if any exception happens while executing the block code, + the `Actor.fail` method is called. + """ + await self.init() + return self + + @ignore_docs + async def __aexit__( + self, + _exc_type: type[BaseException] | None, + exc_value: BaseException | None, + _exc_traceback: TracebackType | None, + ) -> None: + """Exit the Actor, handling any exceptions properly. + + When you exit the `async with` block, the `Actor.exit()` method is called, + and if any exception happens while executing the block code, + the `Actor.fail` method is called. 
+ """ + if not self._is_exiting: + if exc_value: + await self.fail( + exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value, + exception=exc_value, + ) + else: + await self.exit() + + def __repr__(self) -> str: + if self is cast(Proxy, Actor).__wrapped__: + return '' + + return super().__repr__() + + def __call__(self, config: Configuration) -> Self: + """Make a new Actor instance with a non-default configuration.""" + return self.__class__(config=config) + + @property + def apify_client(self) -> ApifyClientAsync: + """The ApifyClientAsync instance the Actor instance uses.""" + return self._apify_client + + @property + def config(self) -> Configuration: + """The Configuration instance the Actor instance uses.""" + return self._configuration + + @property + def event_manager(self) -> EventManager: + """The EventManager instance the Actor instance uses.""" + return self._event_manager + + @property + def log(self) -> logging.Logger: + """The logging.Logger instance the Actor uses.""" + return logger + + def _raise_if_not_initialized(self) -> None: + if not self._is_initialized: + raise RuntimeError('The Actor was not initialized!') + + async def init(self) -> None: + """Initialize the Actor instance. + + This initializes the Actor instance. + It configures the right storage client based on whether the Actor is running locally or on the Apify platform, + it initializes the event manager for processing Actor events, + and starts an interval for regularly sending `PERSIST_STATE` events, + so that the Actor can regularly persist its state in response to these events. + + This method should be called immediately before performing any additional Actor actions, + and it should be called only once. + """ + if self._is_initialized: + raise RuntimeError('The Actor was already initialized!') + + if self._configuration.token: + service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration)) + + if self._configuration.is_at_home: + service_container.set_default_storage_client_type('cloud') + else: + service_container.set_default_storage_client_type('local') + + service_container.set_event_manager(self._event_manager) + + self._is_exiting = False + self._was_final_persist_state_emitted = False + + self.log.info('Initializing Actor...') + self.log.info('System info', extra=get_system_info()) + + # TODO: Print outdated SDK version warning (we need a new env var for this) + # https://github.com/apify/apify-sdk-python/issues/146 + + await self._event_manager.__aenter__() + + self._is_initialized = True + + async def exit( + self, + *, + exit_code: int = 0, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, + status_message: str | None = None, + cleanup_timeout: timedelta = timedelta(seconds=30), + ) -> None: + """Exit the Actor instance. + + This stops the Actor instance. + It cancels all the intervals for regularly sending `PERSIST_STATE` events, + sends a final `PERSIST_STATE` event, + waits for all the event listeners to finish, + and stops the event manager. + + Args: + exit_code: The exit code with which the Actor should fail (defaults to `0`). + event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting. + status_message: The final status message that the Actor should display. + cleanup_timeout: How long we should wait for event listeners. 
+ """ + self._raise_if_not_initialized() + + self._is_exiting = True + + exit_code = maybe_extract_enum_member_value(exit_code) + + self.log.info('Exiting Actor', extra={'exit_code': exit_code}) + + async def finalize() -> None: + if status_message is not None: + await self.set_status_message(status_message, is_terminal=True) + + # Sleep for a bit so that the listeners have a chance to trigger + await asyncio.sleep(0.1) + + if event_listeners_timeout: + await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout) + + await self._event_manager.__aexit__(None, None, None) + cast(dict, service_container._services).clear() + + await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) + self._is_initialized = False + + if is_running_in_ipython(): + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in IPython') + elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508 + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in an unit test') + elif hasattr(asyncio, '_nest_patched'): + self.log.debug(f'Not calling sys.exit({exit_code}) because Actor is running in a nested event loop') + else: + sys.exit(exit_code) + + async def fail( + self, + *, + exit_code: int = 1, + exception: BaseException | None = None, + status_message: str | None = None, + ) -> None: + """Fail the Actor instance. + + This performs all the same steps as Actor.exit(), + but it additionally sets the exit code to `1` (by default). + + Args: + exit_code: The exit code with which the Actor should fail (defaults to `1`). + exception: The exception with which the Actor failed. + status_message: The final status message that the Actor should display. + """ + self._raise_if_not_initialized() + + # In IPython, we don't run `sys.exit()` during Actor exits, + # so the exception traceback will be printed on its own + if exception and not is_running_in_ipython(): + self.log.exception('Actor failed with an exception', exc_info=exception) + + await self.exit(exit_code=exit_code, status_message=status_message) + + def new_client( + self, + *, + token: str | None = None, + api_url: str | None = None, + max_retries: int | None = None, + min_delay_between_retries: timedelta | None = None, + timeout: timedelta | None = None, + ) -> ApifyClientAsync: + """Return a new instance of the Apify API client. + + The `ApifyClientAsync` class is provided by the [apify-client](https://github.com/apify/apify-client-python) package, + and it is automatically configured using the `APIFY_API_BASE_URL` and `APIFY_TOKEN` environment variables. + + You can override the token via the available options. + That's useful if you want to use the client as a different Apify user than the SDK internals are using. + + Args: + token: The Apify API token + api_url: The URL of the Apify API server to which to connect to. 
Defaults to https://api.apify.com + max_retries: How many times to retry a failed request at most + min_delay_between_retries: How long will the client wait between retrying requests (increases exponentially from this value) + timeout: The socket timeout of the HTTP requests sent to the Apify API + """ + token = token or self._configuration.token + api_url = api_url or self._configuration.api_base_url + return ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=max_retries, + min_delay_between_retries_millis=int(min_delay_between_retries.total_seconds() * 1000) if min_delay_between_retries is not None else None, + timeout_secs=int(timeout.total_seconds()) if timeout else None, + ) + + async def open_dataset( + self, + *, + id: str | None = None, + name: str | None = None, + force_cloud: bool = False, + ) -> Dataset: + """Open a dataset. + + Datasets are used to store structured data where each object stored has the same attributes, + such as online store products or real estate offers. + The actual data is stored either on the local filesystem or in the Apify cloud. + + Args: + id: ID of the dataset to be opened. + If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. + name: Name of the dataset to be opened. + If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. + force_cloud: If set to `True` then the Apify cloud storage is always used. + This way it is possible to combine local and cloud storage. + + Returns: An instance of the `Dataset` class for the given ID or name. + + """ + self._raise_if_not_initialized() + + return await Dataset.open( + id=id, + name=name, + configuration=self._configuration, + storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + ) + + async def open_key_value_store( + self, + *, + id: str | None = None, + name: str | None = None, + force_cloud: bool = False, + ) -> KeyValueStore: + """Open a key-value store. + + Key-value stores are used to store records or files, along with their MIME content type. + The records are stored and retrieved using a unique key. + The actual data is stored either on a local filesystem or in the Apify cloud. + + Args: + id: ID of the key-value store to be opened. + If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. + name: Name of the key-value store to be opened. + If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. + force_cloud: If set to `True` then the Apify cloud storage is always used. + This way it is possible to combine local and cloud storage. + + Returns: An instance of the `KeyValueStore` class for the given ID or name. + """ + self._raise_if_not_initialized() + + return await KeyValueStore.open( + id=id, + name=name, + configuration=self._configuration, + storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + ) + + async def open_request_queue( + self, + *, + id: str | None = None, + name: str | None = None, + force_cloud: bool = False, + ) -> RequestQueue: + """Open a request queue. + + Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in the Apify cloud. + The queue is used for deep crawling of websites, where you start with several URLs and then + recursively follow links to other pages. 
The data structure supports both breadth-first + and depth-first crawling orders. + + Args: + id: ID of the request queue to be opened. + If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. + name: Name of the request queue to be opened. + If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. + force_cloud: If set to `True` then the Apify cloud storage is always used. + This way it is possible to combine local and cloud storage. + + Returns: An instance of the `RequestQueue` class for the given ID or name. + """ + self._raise_if_not_initialized() + + return await RequestQueue.open( + id=id, + name=name, + configuration=self._configuration, + storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + ) + + async def push_data(self, data: dict | list[dict]) -> None: + """Store an object or a list of objects to the default dataset of the current Actor run. + + Args: + data: The data to push to the default dataset. + """ + self._raise_if_not_initialized() + + if not data: + return + + dataset = await self.open_dataset() + await dataset.push_data(data) + + async def get_input(self) -> Any: + """Get the Actor input value from the default key-value store associated with the current Actor run.""" + self._raise_if_not_initialized() + + input_value = await self.get_value(self._configuration.input_key) + input_secrets_private_key = self._configuration.input_secrets_private_key_file + input_secrets_key_passphrase = self._configuration.input_secrets_private_key_passphrase + if input_secrets_private_key and input_secrets_key_passphrase: + private_key = load_private_key( + input_secrets_private_key, + input_secrets_key_passphrase, + ) + input_value = decrypt_input_secrets(private_key, input_value) + + return input_value + + async def get_value(self, key: str, default_value: Any = None) -> Any: + """Get a value from the default key-value store associated with the current Actor run. + + Args: + key: The key of the record which to retrieve. + default_value: Default value returned in case the record does not exist. + """ + self._raise_if_not_initialized() + + key_value_store = await self.open_key_value_store() + return await key_value_store.get_value(key, default_value) + + async def set_value( + self, + key: str, + value: Any, + *, + content_type: str | None = None, + ) -> None: + """Set or delete a value in the default key-value store associated with the current Actor run. + + Args: + key: The key of the record which to set. + value: The value of the record which to set, or None, if the record should be deleted. + content_type: The content type which should be set to the value. + """ + self._raise_if_not_initialized() + + key_value_store = await self.open_key_value_store() + return await key_value_store.set_value(key, value, content_type=content_type) + + def on(self, event_name: Event, listener: Callable) -> Callable: + """Add an event listener to the Actor's event manager. + + The following events can be emitted: + - `Event.SYSTEM_INFO`: + Emitted every minute, the event data contains info about the resource usage of the Actor. + - `Event.MIGRATING`: + Emitted when the Actor running on the Apify platform is going to be migrated to another worker server soon. + You can use it to persist the state of the Actor and gracefully stop your in-progress tasks, + so that they are not interrupted by the migration.. 
+ - `Event.PERSIST_STATE`: + Emitted in regular intervals (by default 60 seconds) to notify the Actor that it should persist its state, + in order to avoid repeating all work when the Actor restarts. + This event is automatically emitted together with the migrating event, + in which case the `isMigrating` flag in the event data is set to True, otherwise the flag is False. + Note that this event is provided merely for your convenience, + you can achieve the same effect using an interval and listening for the migrating event. + - `Event.ABORTING`: + When a user aborts an Actor run on the Apify platform, + they can choose to abort it gracefully, to allow the Actor some time before getting terminated. + This graceful abort emits the aborting event, which you can use to clean up the Actor state. + + Args: + event_name: The Actor event for which to listen to. + listener: The function which is to be called when the event is emitted (can be async). + """ + self._raise_if_not_initialized() + + self._event_manager.on(event=event_name, listener=listener) + return listener + + def off(self, event_name: Event, listener: Callable | None = None) -> None: + """Remove a listener, or all listeners, from an Actor event. + + Args: + event_name: The Actor event for which to remove listeners. + listener: The listener which is supposed to be removed. If not passed, all listeners of this event are removed. + """ + self._raise_if_not_initialized() + + self._event_manager.off(event=event_name, listener=listener) + + def is_at_home(self) -> bool: + """Return `True` when the Actor is running on the Apify platform, and `False` otherwise (for example when running locally).""" + return self._configuration.is_at_home + + def get_env(self) -> dict: + """Return a dictionary with information parsed from all the `APIFY_XXX` environment variables. + + For a list of all the environment variables, + see the [Actor documentation](https://docs.apify.com/actors/development/environment-variables). + If some variables are not defined or are invalid, the corresponding value in the resulting dictionary will be None. + """ + self._raise_if_not_initialized() + + config = dict[str, Any]() + for field_name, field in Configuration.model_fields.items(): + if field.deprecated: + continue + + if field.alias: + aliases = [field.alias] + elif isinstance(field.validation_alias, str): + aliases = [field.validation_alias] + elif isinstance(field.validation_alias, AliasChoices): + aliases = cast(list[str], field.validation_alias.choices) + else: + aliases = [field_name] + + for alias in aliases: + config[alias] = getattr(self._configuration, field_name) + + env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]} + return {option_name: config[env_var] for env_var, option_name in env_vars.items() if env_var in config} + + async def start( + self, + actor_id: str, + run_input: Any = None, + *, + token: str | None = None, + content_type: str | None = None, + build: str | None = None, + memory_mbytes: int | None = None, + timeout: timedelta | None = None, + wait_for_finish: int | None = None, + webhooks: list[dict] | None = None, + ) -> dict: + """Run an Actor on the Apify platform. + + Unlike `Actor.call`, this method just starts the run without waiting for finish. + + Args: + actor_id: The ID of the Actor to be run. + run_input: The input to pass to the Actor run. + token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). 
+ content_type: The content type of the input. + build: Specifies the Actor build to run. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. + By default, the run uses a memory limit specified in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. + By default, the run uses timeout specified in the default run configuration for the Actor. + wait_for_finish: The maximum number of seconds the server waits for the run to finish. By default, it is 0, the maximum value is 300. + webhooks: Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) associated with the Actor run which can be used to + receive a notification, e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor or task, + you do not have to add it again here. Each webhook is represented by a dictionary containing these items: + * ``event_types``: list of ``WebhookEventType`` values which trigger the webhook + * ``request_url``: URL to which to send the webhook HTTP request + * ``payload_template`` (optional): Optional template for the request payload + + Returns: Info about the started Actor run + """ + self._raise_if_not_initialized() + + client = self.new_client(token=token) if token else self._apify_client + + return await client.actor(actor_id).start( + run_input=run_input, + content_type=content_type, + build=build, + memory_mbytes=memory_mbytes, + timeout_secs=int(timeout.total_seconds()) if timeout is not None else None, + wait_for_finish=wait_for_finish, + webhooks=webhooks, + ) + + async def abort( + self, + run_id: str, + *, + token: str | None = None, + status_message: str | None = None, + gracefully: bool | None = None, + ) -> dict: + """Abort given Actor run on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable). + + Args: + run_id: The ID of the Actor run to be aborted. + token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). + status_message: Status message of the Actor to be set on the platform. + gracefully: If True, the Actor run will abort gracefully. + It will send ``aborting`` and ``persistStates`` events into the run and force-stop the run after 30 seconds. + It is helpful in cases where you plan to resurrect the run later. + + Returns: Info about the aborted Actor run + """ + self._raise_if_not_initialized() + + client = self.new_client(token=token) if token else self._apify_client + + if status_message: + await client.run(run_id).update(status_message=status_message) + + return await client.run(run_id).abort(gracefully=gracefully) + + async def call( + self, + actor_id: str, + run_input: Any = None, + *, + token: str | None = None, + content_type: str | None = None, + build: str | None = None, + memory_mbytes: int | None = None, + timeout: timedelta | None = None, + webhooks: list[dict] | None = None, + wait: timedelta | None = None, + ) -> dict | None: + """Start an Actor on the Apify Platform and wait for it to finish before returning. + + It waits indefinitely, unless the wait argument is provided. + + Args: + actor_id: The ID of the Actor to be run. + run_input: The input to pass to the Actor run. + token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). 
+ content_type: The content type of the input. + build: Specifies the Actor build to run. It can be either a build tag or build number. By default, the run uses the build specified in + the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. + By default, the run uses a memory limit specified in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. + By default, the run uses timeout specified in the default run configuration for the Actor. + webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can be used to receive a notification, + e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor, you do not have to add it again here. + wait: The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. + + Returns: Info about the started Actor run + """ + self._raise_if_not_initialized() + + client = self.new_client(token=token) if token else self._apify_client + + return await client.actor(actor_id).call( + run_input=run_input, + content_type=content_type, + build=build, + memory_mbytes=memory_mbytes, + timeout_secs=int(timeout.total_seconds()) if timeout is not None else None, + webhooks=webhooks, + wait_secs=int(wait.total_seconds()) if wait is not None else None, + ) + + async def call_task( + self, + task_id: str, + task_input: dict | None = None, + *, + build: str | None = None, + memory_mbytes: int | None = None, + timeout: timedelta | None = None, + webhooks: list[dict] | None = None, + wait: timedelta | None = None, + token: str | None = None, + ) -> dict | None: + """Start an Actor task on the Apify Platform and wait for it to finish before returning. + + It waits indefinitely, unless the wait argument is provided. + + Note that an Actor task is a saved input configuration and options for an Actor. + If you want to run an Actor directly rather than an Actor task, please use `Actor.call` instead. + + Args: + task_id: The ID of the Actor task to be run. + task_input: Overrides the input to pass to the Actor run. + token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). + build: Specifies the Actor build to run. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. + By default, the run uses a memory limit specified in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. + By default, the run uses timeout specified in the default run configuration for the Actor. + webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can be used to receive a notification, + e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor, you do not have to add it again here. + wait: The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. 
+ + Returns: Info about the started Actor run + """ + self._raise_if_not_initialized() + + client = self.new_client(token=token) if token else self._apify_client + + return await client.task(task_id).call( + task_input=task_input, + build=build, + memory_mbytes=memory_mbytes, + timeout_secs=int(timeout.total_seconds()) if timeout is not None else None, + webhooks=webhooks, + wait_secs=int(wait.total_seconds()) if wait is not None else None, + ) + + async def metamorph( + self, + target_actor_id: str, + run_input: Any = None, + *, + target_actor_build: str | None = None, + content_type: str | None = None, + custom_after_sleep: timedelta | None = None, + ) -> None: + """Transform this Actor run to an Actor run of a different Actor. + + The platform stops the current Actor container and starts a new container with the new Actor instead. + All the default storages are preserved, + and the new input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. + + Args: + target_actor_id: ID of the target Actor that the run should be transformed into + run_input: The input to pass to the new run. + target_actor_build: The build of the target Actor. It can be either a build tag or build number. + By default, the run uses the build specified in the default run configuration for the target Actor (typically the latest build). + content_type: The content type of the input. + custom_after_sleep: How long to sleep for after the metamorph, to wait for the container to be stopped. + """ + self._raise_if_not_initialized() + + if not self.is_at_home(): + self.log.error('Actor.metamorph() is only supported when running on the Apify platform.') + return + + if not custom_after_sleep: + custom_after_sleep = self._configuration.metamorph_after_sleep + + # If is_at_home() is True, config.actor_run_id is always set + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') + + await self._apify_client.run(self._configuration.actor_run_id).metamorph( + target_actor_id=target_actor_id, + run_input=run_input, + target_actor_build=target_actor_build, + content_type=content_type, + ) + + if custom_after_sleep: + await asyncio.sleep(custom_after_sleep.total_seconds()) + + async def reboot( + self, + *, + event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, # noqa: ARG002 + custom_after_sleep: timedelta | None = None, + ) -> None: + """Internally reboot this Actor. + + The system stops the current container and starts a new one, with the same run ID and default storages. + + Args: + event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting + custom_after_sleep: How long to sleep for after the reboot, to wait for the container to be stopped. 
+ """ + self._raise_if_not_initialized() + + if not self.is_at_home(): + self.log.error('Actor.reboot() is only supported when running on the Apify platform.') + return + + if not custom_after_sleep: + custom_after_sleep = self._configuration.metamorph_after_sleep + + self._event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True)) + + await self._event_manager.__aexit__(None, None, None) + + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') + + await self._apify_client.run(self._configuration.actor_run_id).reboot() + + if custom_after_sleep: + await asyncio.sleep(custom_after_sleep.total_seconds()) + + async def add_webhook( + self, + *, + event_types: list[WebhookEventType], + request_url: str, + payload_template: str | None = None, + ignore_ssl_errors: bool | None = None, + do_not_retry: bool | None = None, + idempotency_key: str | None = None, + ) -> dict | None: + """Create an ad-hoc webhook for the current Actor run. + + This webhook lets you receive a notification when the Actor run finished or failed. + + Note that webhooks are only supported for Actors running on the Apify platform. + When running the Actor locally, the function will print a warning and have no effect. + + For more information about Apify Actor webhooks, please see the [documentation](https://docs.apify.com/webhooks). + + Args: + event_types: List of event types that should trigger the webhook. At least one is required. + request_url: URL that will be invoked once the webhook is triggered. + payload_template: Specification of the payload that will be sent to request_url + ignore_ssl_errors: Whether the webhook should ignore SSL errors returned by request_url + do_not_retry: Whether the webhook should retry sending the payload to request_url upon failure. + idempotency_key: A unique identifier of a webhook. You can use it to ensure that you won't create the same webhook multiple times. + + Returns: The created webhook + """ + self._raise_if_not_initialized() + + if not self.is_at_home(): + self.log.error('Actor.add_webhook() is only supported when running on the Apify platform.') + return None + + # If is_at_home() is True, config.actor_run_id is always set + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') + + return await self._apify_client.webhooks().create( + actor_run_id=self._configuration.actor_run_id, + event_types=event_types, + request_url=request_url, + payload_template=payload_template, + ignore_ssl_errors=ignore_ssl_errors, + do_not_retry=do_not_retry, + idempotency_key=idempotency_key, + ) + + async def set_status_message( + self, + status_message: str, + *, + is_terminal: bool | None = None, + ) -> dict | None: + """Set the status message for the current Actor run. + + Args: + status_message: The status message to set to the run. + is_terminal: Set this flag to True if this is the final status message of the Actor run. 
+ + Returns: The updated Actor run object + """ + self._raise_if_not_initialized() + + if not self.is_at_home(): + title = 'Terminal status message' if is_terminal else 'Status message' + self.log.info(f'[{title}]: {status_message}') + return None + + # If is_at_home() is True, config.actor_run_id is always set + if not self._configuration.actor_run_id: + raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') + + return await self._apify_client.run(self._configuration.actor_run_id).update( + status_message=status_message, is_status_message_terminal=is_terminal + ) + + async def create_proxy_configuration( + self, + *, + actor_proxy_input: dict | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here + password: str | None = None, + groups: list[str] | None = None, + country_code: str | None = None, + proxy_urls: list[str] | None = None, + new_url_function: _NewUrlFunction | None = None, + ) -> ProxyConfiguration | None: + """Create a ProxyConfiguration object with the passed proxy configuration. + + Configures connection to a proxy server with the provided options. + Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or blacklists. + + For more details and code examples, see the `ProxyConfiguration` class. + + Args: + actor_proxy_input: Proxy configuration field from the Actor input, if input has such input field. + If you pass this argument, all the other arguments will be inferred from it. + password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. + groups: Proxy groups which the Apify Proxy should use, if provided. + country_code: Country which the Apify Proxy should use, if provided. + proxy_urls: Custom proxy server URLs which should be rotated through. + new_url_function: Function which returns a custom proxy URL to be used. + + Returns: ProxyConfiguration object with the passed configuration, or None, if no proxy should be used based on the configuration. 
+ """ + self._raise_if_not_initialized() + + if actor_proxy_input is not None: + if actor_proxy_input.get('useApifyProxy', False): + country_code = country_code or actor_proxy_input.get('apifyProxyCountry') + groups = groups or actor_proxy_input.get('apifyProxyGroups') + else: + proxy_urls = actor_proxy_input.get('proxyUrls', []) + if not proxy_urls: + return None + + proxy_configuration = ProxyConfiguration( + password=password, + groups=groups, + country_code=country_code, + proxy_urls=proxy_urls, + new_url_function=new_url_function, + _actor_config=self._configuration, + _apify_client=self._apify_client, + ) + + await proxy_configuration.initialize() + + return proxy_configuration + + +Actor = cast(_ActorType, Proxy(_ActorType)) +"""The entry point of the SDK, through which all the Actor operations should be done.""" diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py new file mode 100644 index 00000000..e65c7102 --- /dev/null +++ b/src/apify/_configuration.py @@ -0,0 +1,309 @@ +# ruff: noqa: TCH001 TCH002 TCH003 (so that pydantic annotations work) +from __future__ import annotations + +from datetime import datetime, timedelta +from typing import Annotated + +from pydantic import AliasChoices, BeforeValidator, Field + +from crawlee._utils.models import timedelta_ms +from crawlee.configuration import Configuration as CrawleeConfiguration + + +class Configuration(CrawleeConfiguration): + """A class for specifying the configuration of an Actor. + + Can be used either globally via `Configuration.get_global_configuration()`, + or it can be specific to each `Actor` instance on the `actor.config` property. + """ + + actor_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_id', + 'apify_actor_id', + 'apify_act_id', + ), + description='ID of the Actor', + ), + ] = None + + actor_run_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_run_id', + 'apify_actor_run_id', + 'apify_act_run_id', + ), + description='ID of the Actor run', + ), + ] = None + + actor_build_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_build_id', + 'apify_actor_build_id', + ), + description='ID of the Actor build used in the run', + ), + ] = None + + actor_build_number: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_build_number', + 'apify_actor_build_number', + ), + description='Build number of the Actor build used in the run', + ), + ] = None + + actor_task_id: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_task_id', + 'apify_actor_task_id', + ), + description='ID of the Actor task. Empty if Actor is run outside of any task, e.g. directly using the API', + ), + ] = None + + actor_events_ws_url: Annotated[ + str | None, + Field( + validation_alias=AliasChoices( + 'actor_events_websocket_url', + 'apify_actor_events_ws_url', + ), + description='Websocket URL where Actor may listen for events from Actor platform', + ), + ] = None + + api_base_url: Annotated[ + str, + Field( + alias='apify_api_base_url', + description='Internal URL of the Apify API. May be used to interact with the platform programmatically', + ), + ] = 'https://api.apify.com' + + api_public_base_url: Annotated[ + str, + Field( + alias='apify_api_public_base_url', + description='Public URL of the Apify API. 
May be used to link to REST API resources', + ), + ] = 'https://api.apify.com' + + dedicated_cpus: Annotated[ + float | None, + Field( + alias='apify_dedicated_cpus', + description='Number of CPU cores reserved for the actor, based on allocated memory', + ), + ] = None + + disable_outdated_warning: Annotated[ + bool, + Field( + alias='apify_disable_outdated_warning', + description='Controls the display of outdated SDK version warnings', + ), + BeforeValidator(lambda val: val or False), + ] = False + + fact: Annotated[str | None, Field(alias='apify_fact')] = None + + input_key: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_input_key', + 'apify_input_key', + 'crawlee_input_key', + ), + description='Key of the record in the default key-value store that holds the Actor input', + ), + ] = 'INPUT' + + input_secrets_private_key_file: Annotated[ + str | None, + Field( + alias='apify_input_secrets_private_key_file', + description='Path to the secret key used to decrypt Secret inputs.', + ), + ] = None + + input_secrets_private_key_passphrase: Annotated[ + str | None, + Field( + alias='apify_input_secrets_private_key_passphrase', + description='Passphrase for the input secret key', + ), + ] = None + + is_at_home: Annotated[ + bool, + Field( + alias='apify_is_at_home', + description='True if the Actor is running on Apify servers', + ), + ] = False + + latest_sdk_version: Annotated[ + str | None, + Field( + alias='apify_sdk_latest_version', + deprecated=True, + description='Specifies the most recent release version of the Apify SDK for Javascript. Used for checking for updates.', + ), + ] = None + + log_format: Annotated[ + str | None, + Field(alias='apify_log_format', deprecated=True), + ] = None + + max_paid_dataset_items: Annotated[ + int | None, + Field( + alias='actor_max_paid_dataset_items', + description='For paid-per-result Actors, the user-set limit on returned results. 
Do not exceed this limit', + ), + BeforeValidator(lambda val: val or None), + ] = None + + meta_origin: Annotated[ + str | None, + Field( + alias='apify_meta_origin', + description='Specifies how an Actor run was started', + ), + ] = None + + metamorph_after_sleep: Annotated[ + timedelta_ms, + Field( + alias='apify_metamorph_after_sleep_millis', + description='How long the Actor needs to wait before exiting after triggering a metamorph', + ), + ] = timedelta(minutes=5) + + proxy_hostname: Annotated[ + str, + Field( + alias='apify_proxy_hostname', + description='Hostname of the Apify proxy', + ), + ] = 'proxy.apify.com' + + proxy_password: Annotated[ + str | None, + Field( + alias='apify_proxy_password', + description='Password to the Apify proxy', + ), + ] = None + + proxy_port: Annotated[ + int, + Field( + alias='apify_proxy_port', + description='Port to communicate with the Apify proxy', + ), + ] = 8000 + + proxy_status_url: Annotated[ + str, + Field( + alias='apify_proxy_status_url', + description='URL for retrieving proxy status information', + ), + ] = 'http://proxy.apify.com' + + started_at: Annotated[ + datetime | None, + Field( + validation_alias=AliasChoices( + 'actor_started_at', + 'apify_started_at', + ), + description='Date when the Actor was started', + ), + ] = None + + timeout_at: Annotated[ + datetime | None, + Field( + validation_alias=AliasChoices( + 'actor_timeout_at', + 'apify_timeout_at', + ), + description='Date when the Actor will time out', + ), + ] = None + + standby_port: Annotated[ + int, + Field( + alias='actor_standby_port', + description='TCP port for the Actor to start an HTTP server to receive messages in the Actor Standby mode', + ), + ] = 4322 + + token: Annotated[ + str | None, + Field( + alias='apify_token', + description='API token of the user who started the Actor', + ), + ] = None + + user_id: Annotated[ + str | None, + Field( + alias='apify_user_id', + description='ID of the user who started the Actor. 
May differ from the Actor owner', + ), + ] = None + + web_server_port: Annotated[ + int, + Field( + validation_alias=AliasChoices( + 'actor_web_server_port', + 'apify_container_port', + ), + description='TCP port for the Actor to start an HTTP server on. ' + 'This server can be used to receive external messages or expose monitoring and control interfaces', + ), + ] = 4321 + + web_server_url: Annotated[ + str, + Field( + validation_alias=AliasChoices( + 'actor_web_server_url', + 'apify_container_url', + ), + description='Unique public URL for accessing the Actor run web server from the outside world', + ), + ] = 'http://localhost:4321' + + workflow_key: Annotated[ + str | None, + Field( + alias='apify_workflow_key', + description='Identifier used for grouping related runs and API calls together', + ), + ] = None + + +# Monkey-patch the base class so that it works with the extended configuration +CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore diff --git a/src/apify/_consts.py b/src/apify/_consts.py new file mode 100644 index 00000000..71f373a0 --- /dev/null +++ b/src/apify/_consts.py @@ -0,0 +1,10 @@ +from __future__ import annotations + +import re +from datetime import timedelta + +EVENT_LISTENERS_TIMEOUT = timedelta(seconds=5) + +BASE64_REGEXP = '[-A-Za-z0-9+/]*={0,3}' +ENCRYPTED_INPUT_VALUE_PREFIX = 'ENCRYPTED_VALUE' +ENCRYPTED_INPUT_VALUE_REGEXP = re.compile(f'^{ENCRYPTED_INPUT_VALUE_PREFIX}:({BASE64_REGEXP}):({BASE64_REGEXP})$') diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index 237bc8cd..499beaa0 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -1,16 +1,17 @@ from __future__ import annotations import base64 -import secrets from typing import Any -from apify_shared.utils import ignore_docs from cryptography.exceptions import InvalidTag as InvalidTagException from cryptography.hazmat.primitives import hashes, serialization from cryptography.hazmat.primitives.asymmetric import padding, rsa from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes -from apify.consts import ENCRYPTED_INPUT_VALUE_REGEXP +from apify_shared.utils import ignore_docs +from crawlee._utils.crypto import crypto_random_object_id + +from apify._consts import ENCRYPTED_INPUT_VALUE_REGEXP ENCRYPTION_KEY_LENGTH = 32 ENCRYPTION_IV_LENGTH = 16 @@ -25,11 +26,10 @@ def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict: It returns the encrypted password and encrypted value in BASE64 format. Args: - value (str): The value which should be encrypted. - public_key (RSAPublicKey): Public key to use for encryption. + value: The value which should be encrypted. + public_key: Public key to use for encryption. - Returns: - disc: Encrypted password and value. + Returns: Encrypted password and value. """ key_bytes = crypto_random_object_id(ENCRYPTION_KEY_LENGTH).encode('utf-8') initialized_vector_bytes = crypto_random_object_id(ENCRYPTION_IV_LENGTH).encode('utf-8') @@ -65,12 +65,11 @@ def private_decrypt( """Decrypts the given encrypted value using the private key and password. Args: - encrypted_password (str): Password used to encrypt the private key encoded as base64 string. - encrypted_value (str): Encrypted value to decrypt as base64 string. - private_key (RSAPrivateKey): Private key to use for decryption. 
- Returns: - str: Decrypted value. + Returns: Decrypted value. """ encrypted_password_bytes = base64.b64decode(encrypted_password.encode('utf-8')) encrypted_value_bytes = base64.b64decode(encrypted_value.encode('utf-8')) @@ -125,13 +124,7 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey: return public_key -def crypto_random_object_id(length: int = 17) -> str: - """Python reimplementation of cryptoRandomObjectId from `@apify/utilities`.""" - chars = 'abcdefghijklmnopqrstuvwxyzABCEDFGHIJKLMNOPQRSTUVWXYZ0123456789' - return ''.join(secrets.choice(chars) for _ in range(length)) - - -def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input: Any) -> Any: # noqa: A002 +def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input: Any) -> Any: """Decrypt input secrets.""" if not isinstance(input, dict): return input diff --git a/src/apify/_log.py b/src/apify/_log.py new file mode 100644 index 00000000..c799420a --- /dev/null +++ b/src/apify/_log.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import logging + +from crawlee._log_config import CrawleeLogFormatter + +# Name of the logger used throughout the library (resolves to 'apify') +logger_name = __name__.split('.')[0] + +# Logger used throughout the library +logger = logging.getLogger(logger_name) + + +class ActorLogFormatter(CrawleeLogFormatter): # Inherited from parent class + pass diff --git a/src/apify/_memory_storage/__init__.py b/src/apify/_memory_storage/__init__.py deleted file mode 100644 index 6b51836d..00000000 --- a/src/apify/_memory_storage/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .memory_storage_client import MemoryStorageClient - -__all__ = ['MemoryStorageClient'] diff --git a/src/apify/_memory_storage/file_storage_utils.py b/src/apify/_memory_storage/file_storage_utils.py deleted file mode 100644 index 64645001..00000000 --- a/src/apify/_memory_storage/file_storage_utils.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import annotations - -import os - -import aiofiles -from aiofiles.os import makedirs -from apify_shared.utils import json_dumps - -from apify._utils import force_remove - - -async def update_metadata(*, data: dict, entity_directory: str, write_metadata: bool) -> None: - # Skip writing the actual metadata file. 
This is done after ensuring the directory exists so we have the directory present - if not write_metadata: - return - - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - # Write the metadata to the file - file_path = os.path.join(entity_directory, '__metadata__.json') - async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json_dumps(data).encode('utf-8')) - - -async def _update_dataset_items( - *, - data: list[tuple[str, dict]], - entity_directory: str, - persist_storage: bool, -) -> None: - # Skip writing files to the disk if the client has the option set to false - if not persist_storage: - return - - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - # Save all the new items to the disk - for idx, item in data: - file_path = os.path.join(entity_directory, f'{idx}.json') - async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json_dumps(item).encode('utf-8')) - - -async def update_request_queue_item( - *, - request_id: str, - request: dict, - entity_directory: str, - persist_storage: bool, -) -> None: - # Skip writing files to the disk if the client has the option set to false - if not persist_storage: - return - - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - # Write the request to the file - file_path = os.path.join(entity_directory, f'{request_id}.json') - async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json_dumps(request).encode('utf-8')) - - -async def delete_request(*, request_id: str, entity_directory: str) -> None: - # Ensure the directory for the entity exists - await makedirs(entity_directory, exist_ok=True) - - file_path = os.path.join(entity_directory, f'{request_id}.json') - await force_remove(file_path) diff --git a/src/apify/_memory_storage/memory_storage_client.py b/src/apify/_memory_storage/memory_storage_client.py deleted file mode 100644 index ed55cc46..00000000 --- a/src/apify/_memory_storage/memory_storage_client.py +++ /dev/null @@ -1,219 +0,0 @@ -from __future__ import annotations - -import asyncio -import contextlib -import os -from pathlib import Path - -import aioshutil -from aiofiles import ospath -from aiofiles.os import rename, scandir -from apify_shared.consts import ApifyEnvVars -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.dataset import DatasetClient -from apify._memory_storage.resource_clients.dataset_collection import DatasetCollectionClient -from apify._memory_storage.resource_clients.key_value_store import KeyValueStoreClient -from apify._memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient -from apify._memory_storage.resource_clients.request_queue import RequestQueueClient -from apify._memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient -from apify._utils import maybe_parse_bool - -""" -Memory storage emulates data storages that are available on the Apify platform. -Specifically, it emulates clients for datasets, key-value stores and request queues. -The data are held in-memory and persisted locally if `persist_storage` is True. -The metadata of the storages is also persisted if `write_metadata` is True. 
-""" - - -@ignore_docs -class MemoryStorageClient: - """Class representing an in-memory storage.""" - - _local_data_directory: str - _datasets_directory: str - _key_value_stores_directory: str - _request_queues_directory: str - _write_metadata: bool - _persist_storage: bool - _datasets_handled: list[DatasetClient] - _key_value_stores_handled: list[KeyValueStoreClient] - _request_queues_handled: list[RequestQueueClient] - - _purged_on_start: bool = False - _purge_lock: asyncio.Lock - - """Indicates whether a purge was already performed on this instance""" - - def __init__( - self: MemoryStorageClient, - *, - local_data_directory: str | None = None, - write_metadata: bool | None = None, - persist_storage: bool | None = None, - ) -> None: - """Initialize the MemoryStorageClient. - - Args: - local_data_directory (str, optional): A local directory where all data will be persisted - persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory - write_metadata (bool, optional): Whether to persist metadata of the storages as well - """ - self._local_data_directory = local_data_directory or os.getenv(ApifyEnvVars.LOCAL_STORAGE_DIR) or './storage' - self._datasets_directory = os.path.join(self._local_data_directory, 'datasets') - self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores') - self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues') - self._write_metadata = write_metadata if write_metadata is not None else '*' in os.getenv('DEBUG', '') - self._persist_storage = persist_storage if persist_storage is not None else maybe_parse_bool(os.getenv(ApifyEnvVars.PERSIST_STORAGE, 'true')) - self._datasets_handled = [] - self._key_value_stores_handled = [] - self._request_queues_handled = [] - self._purge_lock = asyncio.Lock() - - def datasets(self: MemoryStorageClient) -> DatasetCollectionClient: - """Retrieve the sub-client for manipulating datasets.""" - return DatasetCollectionClient(base_storage_directory=self._datasets_directory, memory_storage_client=self) - - def dataset(self: MemoryStorageClient, dataset_id: str) -> DatasetClient: - """Retrieve the sub-client for manipulating a single dataset. - - Args: - dataset_id (str): ID of the dataset to be manipulated - """ - return DatasetClient(base_storage_directory=self._datasets_directory, memory_storage_client=self, id=dataset_id) - - def key_value_stores(self: MemoryStorageClient) -> KeyValueStoreCollectionClient: - """Retrieve the sub-client for manipulating key-value stores.""" - return KeyValueStoreCollectionClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self) - - def key_value_store(self: MemoryStorageClient, key_value_store_id: str) -> KeyValueStoreClient: - """Retrieve the sub-client for manipulating a single key-value store. 
- - Args: - key_value_store_id (str): ID of the key-value store to be manipulated - """ - return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, memory_storage_client=self, id=key_value_store_id) - - def request_queues(self: MemoryStorageClient) -> RequestQueueCollectionClient: - """Retrieve the sub-client for manipulating request queues.""" - return RequestQueueCollectionClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self) - - def request_queue( - self: MemoryStorageClient, - request_queue_id: str, - *, - client_key: str | None = None, # noqa: ARG002 - ) -> RequestQueueClient: - """Retrieve the sub-client for manipulating a single request queue. - - Args: - request_queue_id (str): ID of the request queue to be manipulated - client_key (str): A unique identifier of the client accessing the request queue - """ - return RequestQueueClient(base_storage_directory=self._request_queues_directory, memory_storage_client=self, id=request_queue_id) - - async def _purge_on_start(self: MemoryStorageClient) -> None: - # Optimistic, non-blocking check - if self._purged_on_start is True: - return - - async with self._purge_lock: - # Another check under the lock just to be sure - if self._purged_on_start is True: - return # type: ignore[unreachable] # Mypy doesn't understand that the _purged_on_start can change while we're getting the async lock - - await self._purge() - self._purged_on_start = True - - async def _purge(self: MemoryStorageClient) -> None: - """Clean up the default storage directories before the run starts. - - Specifically, `purge` cleans up: - - local directory containing the default dataset; - - all records from the default key-value store in the local directory, except for the "INPUT" key; - - local directory containing the default request queue. 
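# A small, standalone sketch of the purge-on-start pattern shown above: an
# optimistic check first, then a second check under an asyncio.Lock so that
# concurrent callers run the (potentially expensive) cleanup only once.
# The class and method names here are illustrative only.
import asyncio


class PurgeOnce:
    def __init__(self) -> None:
        self._purged = False
        self._lock = asyncio.Lock()
        self.purge_count = 0

    async def ensure_purged(self) -> None:
        # Optimistic, non-blocking check.
        if self._purged:
            return
        async with self._lock:
            # Re-check under the lock; another task may have finished already.
            if self._purged:
                return
            self.purge_count += 1  # stands in for the actual cleanup work
            self._purged = True


async def main() -> None:
    purger = PurgeOnce()
    await asyncio.gather(*(purger.ensure_purged() for _ in range(10)))
    print(purger.purge_count)  # -> 1


asyncio.run(main())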
- """ - # Key-value stores - if await ospath.exists(self._key_value_stores_directory): - key_value_store_folders = await scandir(self._key_value_stores_directory) - for key_value_store_folder in key_value_store_folders: - if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'): - await self._batch_remove_files(key_value_store_folder.path) - elif key_value_store_folder.name == 'default': - await self._handle_default_key_value_store(key_value_store_folder.path) - - # Datasets - if await ospath.exists(self._datasets_directory): - dataset_folders = await scandir(self._datasets_directory) - for dataset_folder in dataset_folders: - if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'): - await self._batch_remove_files(dataset_folder.path) - # Request queues - if await ospath.exists(self._request_queues_directory): - request_queue_folders = await scandir(self._request_queues_directory) - for request_queue_folder in request_queue_folders: - if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'): - await self._batch_remove_files(request_queue_folder.path) - - async def _handle_default_key_value_store(self: MemoryStorageClient, folder: str) -> None: - """Remove everything from the default key-value store folder except `possible_input_keys`.""" - folder_exists = await ospath.exists(folder) - temporary_path = os.path.normpath(os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__')) - - # For optimization, we want to only attempt to copy a few files from the default key-value store - possible_input_keys = [ - 'INPUT', - 'INPUT.json', - 'INPUT.bin', - 'INPUT.txt', - ] - - if folder_exists: - # Create a temporary folder to save important files in - Path(temporary_path).mkdir(parents=True, exist_ok=True) - - # Go through each file and save the ones that are important - for entity in possible_input_keys: - original_file_path = os.path.join(folder, entity) - temp_file_path = os.path.join(temporary_path, entity) - with contextlib.suppress(Exception): - await rename(original_file_path, temp_file_path) - - # Remove the original folder and all its content - counter = 0 - temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) - done = False - try: - while not done: - await rename(folder, temp_path_for_old_folder) - done = True - except Exception: - counter += 1 - temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) - - # Replace the temporary folder with the original folder - await rename(temporary_path, folder) - - # Remove the old folder - await self._batch_remove_files(temp_path_for_old_folder) - - async def _batch_remove_files(self: MemoryStorageClient, folder: str, counter: int = 0) -> None: - folder_exists = await ospath.exists(folder) - - if folder_exists: - temporary_folder = ( - folder - if os.path.basename(folder).startswith('__APIFY_TEMPORARY_') - else os.path.normpath(os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__')) - ) - - try: - # Rename the old folder to the new one to allow background deletions - await rename(folder, temporary_folder) - except Exception: - # Folder exists already, try again with an incremented counter - return await self._batch_remove_files(folder, counter + 1) - - await aioshutil.rmtree(temporary_folder, ignore_errors=True) - return None diff --git a/src/apify/_memory_storage/resource_clients/__init__.py 
b/src/apify/_memory_storage/resource_clients/__init__.py deleted file mode 100644 index 0a79ebe3..00000000 --- a/src/apify/_memory_storage/resource_clients/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from .base_resource_client import BaseResourceClient -from .base_resource_collection_client import BaseResourceCollectionClient -from .dataset import DatasetClient -from .dataset_collection import DatasetCollectionClient -from .key_value_store import KeyValueStoreClient -from .key_value_store_collection import KeyValueStoreCollectionClient -from .request_queue import RequestQueueClient -from .request_queue_collection import RequestQueueCollectionClient - -__all__ = [ - 'BaseResourceClient', - 'BaseResourceCollectionClient', - 'DatasetClient', - 'DatasetCollectionClient', - 'KeyValueStoreClient', - 'KeyValueStoreCollectionClient', - 'RequestQueueClient', - 'RequestQueueCollectionClient', -] diff --git a/src/apify/_memory_storage/resource_clients/base_resource_client.py b/src/apify/_memory_storage/resource_clients/base_resource_client.py deleted file mode 100644 index e877b330..00000000 --- a/src/apify/_memory_storage/resource_clients/base_resource_client.py +++ /dev/null @@ -1,141 +0,0 @@ -from __future__ import annotations - -import json -import os -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -if TYPE_CHECKING: - from typing_extensions import Self - - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -@ignore_docs -class BaseResourceClient(ABC): - """Base class for resource clients.""" - - _id: str - _name: str | None - _resource_directory: str - - @abstractmethod - def __init__( - self: BaseResourceClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the BaseResourceClient.""" - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - async def get(self: BaseResourceClient) -> dict | None: - """Retrieve the storage. 
- - Returns: - dict, optional: The retrieved storage, or None, if it does not exist - """ - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_storages_dir(cls: type[BaseResourceClient], memory_storage_client: MemoryStorageClient) -> str: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_storage_client_cache( - cls, - memory_storage_client: MemoryStorageClient, - ) -> list[Self]: - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - def _to_resource_info(self: BaseResourceClient) -> dict: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _create_from_directory( - cls, - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> Self: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - def _find_or_create_client_by_id_or_name( - cls, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> Self | None: - assert id is not None or name is not None # noqa: S101 - - storage_client_cache = cls._get_storage_client_cache(memory_storage_client) - storages_dir = cls._get_storages_dir(memory_storage_client) - - # First check memory cache - found = next( - ( - storage_client - for storage_client in storage_client_cache - if storage_client._id == id or (storage_client._name and name and storage_client._name.lower() == name.lower()) - ), - None, - ) - - if found is not None: - return found - - storage_path = None - - # First try to find the storage by looking up the directory by name - if name: - possible_storage_path = os.path.join(storages_dir, name) - if os.access(possible_storage_path, os.F_OK): - storage_path = possible_storage_path - - # If it's not found, try going through the storages dir and finding it by metadata - if not storage_path and os.access(storages_dir, os.F_OK): - for entry in os.scandir(storages_dir): - if not entry.is_dir(): - continue - metadata_path = os.path.join(entry.path, '__metadata__.json') - if not os.access(metadata_path, os.F_OK): - continue - with open(metadata_path, encoding='utf-8') as metadata_file: - metadata = json.load(metadata_file) - if id and id == metadata.get('id'): - storage_path = entry.path - name = metadata.get(name) - break - if name and name == metadata.get('name'): - storage_path = entry.path - id = metadata.get(id) # noqa: A001 - break - - # As a last resort, try to check if the accessed storage is the default one, - # and the folder has no metadata - # TODO: make this respect the APIFY_DEFAULT_XXX_ID env var - # https://github.com/apify/apify-sdk-python/issues/149 - if id == 'default': - possible_storage_path = os.path.join(storages_dir, id) - if os.access(possible_storage_path, os.F_OK): - storage_path = possible_storage_path - - if not storage_path: - return None - - resource_client = cls._create_from_directory(storage_path, memory_storage_client, id, name) - - storage_client_cache.append(resource_client) - - return resource_client diff --git a/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py b/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py deleted file mode 100644 index 2f41876a..00000000 --- 
a/src/apify/_memory_storage/resource_clients/base_resource_collection_client.py +++ /dev/null @@ -1,114 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from operator import itemgetter -from typing import TYPE_CHECKING, Generic, TypeVar, cast - -from apify_shared.models import ListPage -from apify_shared.utils import ignore_docs - -from apify._memory_storage.file_storage_utils import update_metadata -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient - -if TYPE_CHECKING: - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -ResourceClientType = TypeVar('ResourceClientType', bound=BaseResourceClient, contravariant=True) # noqa: PLC0105 - - -@ignore_docs -class BaseResourceCollectionClient(ABC, Generic[ResourceClientType]): - """Base class for resource collection clients.""" - - _base_storage_directory: str - _memory_storage_client: MemoryStorageClient - - def __init__( - self: BaseResourceCollectionClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - ) -> None: - """Initialize the DatasetCollectionClient with the passed arguments.""" - self._base_storage_directory = base_storage_directory - self._memory_storage_client = memory_storage_client - - @abstractmethod - def _get_storage_client_cache(self: BaseResourceCollectionClient) -> list[ResourceClientType]: - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - def _get_resource_client_class(self: BaseResourceCollectionClient) -> type[ResourceClientType]: - raise NotImplementedError('You must override this method in the subclass!') - - @abstractmethod - async def list(self: BaseResourceCollectionClient) -> ListPage: - """List the available storages. - - Returns: - ListPage: The list of available storages matching the specified filters. - """ - storage_client_cache = self._get_storage_client_cache() - - items = [storage._to_resource_info() for storage in storage_client_cache] - - return ListPage( - { - 'total': len(items), - 'count': len(items), - 'offset': 0, - 'limit': len(items), - 'desc': False, - 'items': sorted(items, key=itemgetter('createdAt')), - } - ) - - @abstractmethod - async def get_or_create( - self: BaseResourceCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named storage, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the storage to retrieve or create. - schema (Dict, optional): The schema of the storage - - Returns: - dict: The retrieved or newly-created storage. 
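# Sketch of the get-or-create flow described above: reuse a cached client when
# one matches by id or by case-insensitive name, otherwise create a new entry
# and cache it. A plain dict stands in for the real resource client here.
from __future__ import annotations

import uuid

_cache: list[dict] = []


def get_or_create(*, id: str | None = None, name: str | None = None) -> dict:
    for client in _cache:
        if (id is not None and client['id'] == id) or (
            name is not None and (client['name'] or '').lower() == name.lower()
        ):
            return client
    client = {'id': id or uuid.uuid4().hex[:17], 'name': name}
    _cache.append(client)
    return client


first = get_or_create(name='my-dataset')
second = get_or_create(name='MY-DATASET')
print(first is second)  # -> True: the name lookup is case-insensitive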
- """ - resource_client_class = self._get_resource_client_class() - storage_client_cache = self._get_storage_client_cache() - - if name or _id: - found = resource_client_class._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, - name=name, - id=_id, - ) - if found: - resource_info = found._to_resource_info() - return cast(dict, resource_info) - - new_resource = resource_client_class( - id=_id, - name=name, - base_storage_directory=self._base_storage_directory, - memory_storage_client=self._memory_storage_client, - ) - storage_client_cache.append(new_resource) - - resource_info = new_resource._to_resource_info() - - # Write to the disk - await update_metadata( - data=resource_info, - entity_directory=new_resource._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - return cast(dict, resource_info) diff --git a/src/apify/_memory_storage/resource_clients/dataset.py b/src/apify/_memory_storage/resource_clients/dataset.py deleted file mode 100644 index f8c82655..00000000 --- a/src/apify/_memory_storage/resource_clients/dataset.py +++ /dev/null @@ -1,452 +0,0 @@ -from __future__ import annotations - -import asyncio -import json -import os -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, AsyncIterator - -import aioshutil -from apify_shared.models import ListPage -from apify_shared.utils import ignore_docs - -from apify._crypto import crypto_random_object_id -from apify._memory_storage.file_storage_utils import _update_dataset_items, update_metadata -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient -from apify._utils import force_rename, raise_on_duplicate_storage, raise_on_non_existing_storage -from apify.consts import StorageTypes - -if TYPE_CHECKING: - from apify_shared.types import JSONSerializable - - from apify._memory_storage.memory_storage_client import MemoryStorageClient - -# This is what API returns in the x-apify-pagination-limit -# header when no limit query parameter is used. -LIST_ITEMS_LIMIT = 999_999_999_999 - -# Number of characters of the dataset item file names. -# E.g.: 000000019.json - 9 digits -LOCAL_ENTRY_NAME_DIGITS = 9 - - -@ignore_docs -class DatasetClient(BaseResourceClient): - """Sub-client for manipulating a single dataset.""" - - _id: str - _resource_directory: str - _memory_storage_client: MemoryStorageClient - _name: str | None - _dataset_entries: dict[str, dict] - _created_at: datetime - _accessed_at: datetime - _modified_at: datetime - _item_count = 0 - _file_operation_lock: asyncio.Lock - - def __init__( - self: DatasetClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the DatasetClient.""" - self._id = id or crypto_random_object_id() - self._resource_directory = os.path.join(base_storage_directory, name or self._id) - self._memory_storage_client = memory_storage_client - self._name = name - self._dataset_entries = {} - self._created_at = datetime.now(timezone.utc) - self._accessed_at = datetime.now(timezone.utc) - self._modified_at = datetime.now(timezone.utc) - self._file_operation_lock = asyncio.Lock() - - async def get(self: DatasetClient) -> dict | None: - """Retrieve the dataset. 
- - Returns: - dict, optional: The retrieved dataset, or None, if it does not exist - """ - found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name) - - if found: - async with found._file_operation_lock: - await found._update_timestamps(has_been_modified=False) - return found._to_resource_info() - - return None - - async def update(self: DatasetClient, *, name: str | None = None) -> dict: - """Update the dataset with specified fields. - - Args: - name (str, optional): The new name for the dataset - - Returns: - dict: The updated dataset - """ - # Check by id - existing_dataset_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, - id=self._id, - name=self._name, - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self._id) - - # Skip if no changes - if name is None: - return existing_dataset_by_id._to_resource_info() - - async with existing_dataset_by_id._file_operation_lock: - # Check that name is not in use already - existing_dataset_by_name = next( - (dataset for dataset in self._memory_storage_client._datasets_handled if dataset._name and dataset._name.lower() == name.lower()), - None, - ) - - if existing_dataset_by_name is not None: - raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name) - - existing_dataset_by_id._name = name - - previous_dir = existing_dataset_by_id._resource_directory - - existing_dataset_by_id._resource_directory = os.path.join(self._memory_storage_client._datasets_directory, name) - - await force_rename(previous_dir, existing_dataset_by_id._resource_directory) - - # Update timestamps - await existing_dataset_by_id._update_timestamps(has_been_modified=True) - - return existing_dataset_by_id._to_resource_info() - - async def delete(self: DatasetClient) -> None: - """Delete the dataset.""" - dataset = next((dataset for dataset in self._memory_storage_client._datasets_handled if dataset._id == self._id), None) - - if dataset is not None: - async with dataset._file_operation_lock: - self._memory_storage_client._datasets_handled.remove(dataset) - dataset._item_count = 0 - dataset._dataset_entries.clear() - - if os.path.exists(dataset._resource_directory): - await aioshutil.rmtree(dataset._resource_directory) - - async def list_items( - self: DatasetClient, - *, - offset: int | None = 0, - limit: int | None = LIST_ITEMS_LIMIT, - clean: bool | None = None, # noqa: ARG002 - desc: bool | None = None, - fields: list[str] | None = None, # noqa: ARG002 - omit: list[str] | None = None, # noqa: ARG002 - unwind: str | None = None, # noqa: ARG002 - skip_empty: bool | None = None, # noqa: ARG002 - skip_hidden: bool | None = None, # noqa: ARG002 - flatten: list[str] | None = None, # noqa: ARG002 - view: str | None = None, # noqa: ARG002 - ) -> ListPage: - """List the items of the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. - clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. 
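# Illustration of the 'clean' shortcut documented above (skip_hidden plus
# skip_empty). In this deleted local client the filtering flags were accepted
# but unused (see the 'noqa: ARG002' markers); the sketch below only shows what
# the flags mean as described in the docstring.
def clean_items(items: list) -> list:
    cleaned = []
    for item in items:
        # skip_hidden: drop fields whose names start with '#'.
        visible = {key: value for key, value in item.items() if not key.startswith('#')}
        # skip_empty: drop items that end up with no visible fields.
        if visible:
            cleaned.append(visible)
    return cleaned


print(clean_items([{'#debug': True}, {'title': 'Example', '#error': None}]))
# -> [{'title': 'Example'}]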
- Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. - Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - flatten (list of str, optional): A list of fields that should be flattened - view (str, optional): Name of the dataset view to be used - - Returns: - ListPage: A page of the list of dataset items according to the specified filters. - """ - # Check by id - existing_dataset_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, - id=self._id, - name=self._name, - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self._id) - - async with existing_dataset_by_id._file_operation_lock: - start, end = existing_dataset_by_id._get_start_and_end_indexes( - max(existing_dataset_by_id._item_count - (offset or 0) - (limit or LIST_ITEMS_LIMIT), 0) if desc else offset or 0, - limit, - ) - - items = [] - - for idx in range(start, end): - entry_number = self._generate_local_entry_name(idx) - items.append(existing_dataset_by_id._dataset_entries[entry_number]) - - await existing_dataset_by_id._update_timestamps(has_been_modified=False) - - if desc: - items.reverse() - - return ListPage( - { - 'count': len(items), - 'desc': desc or False, - 'items': items, - 'limit': limit or LIST_ITEMS_LIMIT, - 'offset': offset or 0, - 'total': existing_dataset_by_id._item_count, - } - ) - - async def iterate_items( - self: DatasetClient, - *, - offset: int = 0, - limit: int | None = None, - clean: bool | None = None, # noqa: ARG002 - desc: bool | None = None, - fields: list[str] | None = None, # noqa: ARG002 - omit: list[str] | None = None, # noqa: ARG002 - unwind: str | None = None, # noqa: ARG002 - skip_empty: bool | None = None, # noqa: ARG002 - skip_hidden: bool | None = None, # noqa: ARG002 - ) -> AsyncIterator[dict]: - """Iterate over the items in the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. - clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. 
fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. - Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. - Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - - Yields: - dict: An item from the dataset - """ - cache_size = 1000 - first_item = offset - - # If there is no limit, set last_item to None until we get the total from the first API response - last_item = None if limit is None else offset + limit - - current_offset = first_item - while last_item is None or current_offset < last_item: - current_limit = cache_size if last_item is None else min(cache_size, last_item - current_offset) - - current_items_page = await self.list_items( - offset=current_offset, - limit=current_limit, - desc=desc, - ) - - current_offset += current_items_page.count - if last_item is None or current_items_page.total < last_item: - last_item = current_items_page.total - - for item in current_items_page.items: - yield item - - async def get_items_as_bytes(self: DatasetClient, *_args: Any, **_kwargs: Any) -> bytes: - raise NotImplementedError('This method is not supported in local memory storage.') - - async def stream_items(self: DatasetClient, *_args: Any, **_kwargs: Any) -> AsyncIterator: - raise NotImplementedError('This method is not supported in local memory storage.') - - async def push_items(self: DatasetClient, items: JSONSerializable) -> None: - """Push items to the dataset. - - Args: - items: The items which to push in the dataset. Either a stringified JSON, a dictionary, or a list of strings or dictionaries. 
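# Standalone sketch of the batched paging that iterate_items above performs:
# fetch items page by page, clamp the end once the total size is known, and
# yield items until either the caller's limit or the dataset end is reached.
# 'fetch_page' is a stand-in for the real list_items call.
from __future__ import annotations

from typing import Iterator

DATA = [{'n': i} for i in range(25)]


def fetch_page(offset: int, limit: int) -> tuple[list, int]:
    return DATA[offset:offset + limit], len(DATA)  # (items, total item count)


def iterate_items(offset: int = 0, limit: int | None = None, page_size: int = 10) -> Iterator[dict]:
    last_item = None if limit is None else offset + limit
    current_offset = offset
    while last_item is None or current_offset < last_item:
        current_limit = page_size if last_item is None else min(page_size, last_item - current_offset)
        batch, total = fetch_page(current_offset, current_limit)
        if last_item is None or total < last_item:
            last_item = total  # never page past the end of the dataset
        if not batch:
            break
        current_offset += len(batch)
        yield from batch


print(sum(1 for _ in iterate_items(offset=5, limit=30)))  # -> 20 items available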
- """ - # Check by id - existing_dataset_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_dataset_by_id is None: - raise_on_non_existing_storage(StorageTypes.DATASET, self._id) - - normalized = self._normalize_items(items) - - added_ids: list[str] = [] - for entry in normalized: - existing_dataset_by_id._item_count += 1 - idx = self._generate_local_entry_name(existing_dataset_by_id._item_count) - - existing_dataset_by_id._dataset_entries[idx] = entry - added_ids.append(idx) - - data_entries = [(id, existing_dataset_by_id._dataset_entries[id]) for id in added_ids] # noqa: A001 - - async with existing_dataset_by_id._file_operation_lock: - await existing_dataset_by_id._update_timestamps(has_been_modified=True) - - await _update_dataset_items( - data=data_entries, - entity_directory=existing_dataset_by_id._resource_directory, - persist_storage=self._memory_storage_client._persist_storage, - ) - - def _to_resource_info(self: DatasetClient) -> dict: - """Retrieve the dataset info.""" - return { - 'id': self._id, - 'name': self._name, - 'itemCount': self._item_count, - 'accessedAt': self._accessed_at, - 'createdAt': self._created_at, - 'modifiedAt': self._modified_at, - } - - async def _update_timestamps(self: DatasetClient, has_been_modified: bool) -> None: # noqa: FBT001 - """Update the timestamps of the dataset.""" - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - dataset_info = self._to_resource_info() - await update_metadata( - data=dataset_info, - entity_directory=self._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - def _get_start_and_end_indexes(self: DatasetClient, offset: int, limit: int | None = None) -> tuple[int, int]: - actual_limit = limit or self._item_count - start = offset + 1 - end = min(offset + actual_limit, self._item_count) + 1 - return (start, end) - - def _generate_local_entry_name(self: DatasetClient, idx: int) -> str: - return str(idx).zfill(LOCAL_ENTRY_NAME_DIGITS) - - def _normalize_items(self: DatasetClient, items: JSONSerializable) -> list[dict]: - def normalize_item(item: Any) -> dict | None: - if isinstance(item, str): - item = json.loads(item) - - if isinstance(item, list): - received = ',\n'.join(item) - raise TypeError(f'Each dataset item can only be a single JSON object, not an array. Received: [{received}]') - - if (not isinstance(item, dict)) and item is not None: - raise TypeError(f'Each dataset item must be a JSON object. Received: {item}') - - return item - - if isinstance(items, str): - items = json.loads(items) - - result = list(map(normalize_item, items)) if isinstance(items, list) else [normalize_item(items)] - # filter(None, ..) 
returns items that are True - return list(filter(None, result)) - - @classmethod - def _get_storages_dir(cls: type[DatasetClient], memory_storage_client: MemoryStorageClient) -> str: - return memory_storage_client._datasets_directory - - @classmethod - def _get_storage_client_cache( - cls: type[DatasetClient], - memory_storage_client: MemoryStorageClient, - ) -> list[DatasetClient]: - return memory_storage_client._datasets_handled - - @classmethod - def _create_from_directory( - cls: type[DatasetClient], - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> DatasetClient: - item_count = 0 - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - entries: dict[str, dict] = {} - - has_seen_metadata_file = False - - # Access the dataset folder - for entry in os.scandir(storage_directory): - if entry.is_file(): - if entry.name == '__metadata__.json': - has_seen_metadata_file = True - - # We have found the dataset's metadata file, build out information based on it - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - metadata = json.load(f) - id = metadata['id'] # noqa: A001 - name = metadata['name'] - item_count = metadata['itemCount'] - created_at = datetime.fromisoformat(metadata['createdAt']) - accessed_at = datetime.fromisoformat(metadata['accessedAt']) - modified_at = datetime.fromisoformat(metadata['modifiedAt']) - - continue - - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - entry_content = json.load(f) - entry_name = entry.name.split('.')[0] - - entries[entry_name] = entry_content - - if not has_seen_metadata_file: - item_count += 1 - - new_client = DatasetClient( - base_storage_directory=memory_storage_client._datasets_directory, - memory_storage_client=memory_storage_client, - id=id, - name=name, - ) - - # Overwrite properties - new_client._accessed_at = accessed_at - new_client._created_at = created_at - new_client._modified_at = modified_at - new_client._item_count = item_count - - for entry_id, content in entries.items(): - new_client._dataset_entries[entry_id] = content - - return new_client diff --git a/src/apify/_memory_storage/resource_clients/dataset_collection.py b/src/apify/_memory_storage/resource_clients/dataset_collection.py deleted file mode 100644 index 0ef7b3f0..00000000 --- a/src/apify/_memory_storage/resource_clients/dataset_collection.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient -from apify._memory_storage.resource_clients.dataset import DatasetClient - -if TYPE_CHECKING: - from apify_shared.models import ListPage - - -@ignore_docs -class DatasetCollectionClient(BaseResourceCollectionClient): - """Sub-client for manipulating datasets.""" - - def _get_storage_client_cache(self: DatasetCollectionClient) -> list[DatasetClient]: - return self._memory_storage_client._datasets_handled - - def _get_resource_client_class(self: DatasetCollectionClient) -> type[DatasetClient]: - return DatasetClient - - async def list(self: DatasetCollectionClient) -> ListPage: - """List the available datasets. - - Returns: - ListPage: The list of available datasets matching the specified filters. 
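# Counterpart sketch to _create_from_directory above: rebuild a dataset from a
# local storage directory by treating '__metadata__.json' specially and loading
# every other JSON file as an item keyed by its zero-padded file name.
# The directory path is an assumption based on the default local layout.
import json
from pathlib import Path


def read_local_dataset(dataset_dir: str) -> tuple:
    metadata: dict = {}
    entries: dict = {}
    directory = Path(dataset_dir)
    if not directory.is_dir():
        return metadata, entries
    for path in directory.iterdir():
        if not path.is_file() or path.suffix != '.json':
            continue
        content = json.loads(path.read_text(encoding='utf-8'))
        if path.name == '__metadata__.json':
            metadata = content
        else:
            entries[path.stem] = content
    return metadata, entries


metadata, entries = read_local_dataset('./storage/datasets/default')
print(metadata.get('itemCount', 0), len(entries))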
- """ - return await super().list() - - async def get_or_create( - self: DatasetCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named dataset, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the dataset to retrieve or create. - schema (dict, optional): The schema of the dataset - - Returns: - dict: The retrieved or newly-created dataset. - """ - return await super().get_or_create(name=name, schema=schema, _id=_id) diff --git a/src/apify/_memory_storage/resource_clients/key_value_store.py b/src/apify/_memory_storage/resource_clients/key_value_store.py deleted file mode 100644 index 2920089d..00000000 --- a/src/apify/_memory_storage/resource_clients/key_value_store.py +++ /dev/null @@ -1,533 +0,0 @@ -from __future__ import annotations - -import asyncio -import io -import json -import mimetypes -import os -import pathlib -from datetime import datetime, timezone -from operator import itemgetter -from typing import TYPE_CHECKING, Any, AsyncIterator, TypedDict - -import aiofiles -import aioshutil -from aiofiles.os import makedirs -from apify_shared.utils import ignore_docs, is_file_or_bytes, json_dumps - -from apify._crypto import crypto_random_object_id -from apify._memory_storage.file_storage_utils import update_metadata -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient -from apify._utils import ( - force_remove, - force_rename, - guess_file_extension, - maybe_parse_body, - raise_on_duplicate_storage, - raise_on_non_existing_storage, -) -from apify.consts import DEFAULT_API_PARAM_LIMIT, StorageTypes -from apify.log import logger - -if TYPE_CHECKING: - from typing_extensions import NotRequired - - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -class KeyValueStoreRecord(TypedDict): - key: str - value: Any - contentType: str | None - filename: NotRequired[str] - - -def _filename_from_record(record: KeyValueStoreRecord) -> str: - if record.get('filename') is not None: - return record['filename'] - - content_type = record.get('contentType') - if not content_type or content_type == 'application/octet-stream': - return record['key'] - - extension = guess_file_extension(content_type) - if record['key'].endswith(f'.{extension}'): - return record['key'] - - return f'{record["key"]}.{extension}' - - -@ignore_docs -class KeyValueStoreClient(BaseResourceClient): - """Sub-client for manipulating a single key-value store.""" - - _id: str - _resource_directory: str - _memory_storage_client: MemoryStorageClient - _name: str | None - _records: dict[str, KeyValueStoreRecord] - _created_at: datetime - _accessed_at: datetime - _modified_at: datetime - _file_operation_lock: asyncio.Lock - - def __init__( - self: KeyValueStoreClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the KeyValueStoreClient.""" - self._id = id or crypto_random_object_id() - self._resource_directory = os.path.join(base_storage_directory, name or self._id) - self._memory_storage_client = memory_storage_client - self._name = name - self._records = {} - self._created_at = datetime.now(timezone.utc) - self._accessed_at = datetime.now(timezone.utc) - self._modified_at = datetime.now(timezone.utc) - self._file_operation_lock = asyncio.Lock() - - async def get(self: KeyValueStoreClient) -> dict | None: - """Retrieve 
the key-value store. - - Returns: - dict, optional: The retrieved key-value store, or None if it does not exist - """ - found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name) - - if found: - async with found._file_operation_lock: - await found._update_timestamps(has_been_modified=False) - return found._to_resource_info() - - return None - - async def update(self: KeyValueStoreClient, *, name: str | None = None) -> dict: - """Update the key-value store with specified fields. - - Args: - name (str, optional): The new name for key-value store - - Returns: - dict: The updated key-value store - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - # Skip if no changes - if name is None: - return existing_store_by_id._to_resource_info() - - async with existing_store_by_id._file_operation_lock: - # Check that name is not in use already - existing_store_by_name = next( - (store for store in self._memory_storage_client._key_value_stores_handled if store._name and store._name.lower() == name.lower()), - None, - ) - - if existing_store_by_name is not None: - raise_on_duplicate_storage(StorageTypes.KEY_VALUE_STORE, 'name', name) - - existing_store_by_id._name = name - - previous_dir = existing_store_by_id._resource_directory - - existing_store_by_id._resource_directory = os.path.join(self._memory_storage_client._key_value_stores_directory, name) - - await force_rename(previous_dir, existing_store_by_id._resource_directory) - - # Update timestamps - await existing_store_by_id._update_timestamps(has_been_modified=True) - - return existing_store_by_id._to_resource_info() - - async def delete(self: KeyValueStoreClient) -> None: - """Delete the key-value store.""" - store = next((store for store in self._memory_storage_client._key_value_stores_handled if store._id == self._id), None) - - if store is not None: - async with store._file_operation_lock: - self._memory_storage_client._key_value_stores_handled.remove(store) - store._records.clear() - - if os.path.exists(store._resource_directory): - await aioshutil.rmtree(store._resource_directory) - - async def list_keys( - self: KeyValueStoreClient, - *, - limit: int = DEFAULT_API_PARAM_LIMIT, - exclusive_start_key: str | None = None, - ) -> dict: - """List the keys in the key-value store. - - Args: - limit (int, optional): Number of keys to be returned. 
Maximum value is 1000 - exclusive_start_key (str, optional): All keys up to this one (including) are skipped from the result - - Returns: - dict: The list of keys in the key-value store matching the given arguments - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - items = [] - - for record in existing_store_by_id._records.values(): - size = len(record['value']) - items.append( - { - 'key': record['key'], - 'size': size, - } - ) - - if len(items) == 0: - return { - 'count': len(items), - 'limit': limit, - 'exclusiveStartKey': exclusive_start_key, - 'isTruncated': False, - 'nextExclusiveStartKey': None, - 'items': items, - } - - # Lexically sort to emulate the API - items = sorted(items, key=itemgetter('key')) - - truncated_items = items - if exclusive_start_key is not None: - key_pos = next((idx for idx, i in enumerate(items) if i['key'] == exclusive_start_key), None) - if key_pos is not None: - truncated_items = items[(key_pos + 1) :] - - limited_items = truncated_items[:limit] - - last_item_in_store = items[-1] - last_selected_item = limited_items[-1] - is_last_selected_item_absolutely_last = last_item_in_store == last_selected_item - next_exclusive_start_key = None if is_last_selected_item_absolutely_last else last_selected_item['key'] - - async with existing_store_by_id._file_operation_lock: - await existing_store_by_id._update_timestamps(has_been_modified=False) - - return { - 'count': len(items), - 'limit': limit, - 'exclusiveStartKey': exclusive_start_key, - 'isTruncated': not is_last_selected_item_absolutely_last, - 'nextExclusiveStartKey': next_exclusive_start_key, - 'items': limited_items, - } - - async def _get_record_internal( - self: KeyValueStoreClient, - key: str, - as_bytes: bool = False, # noqa: FBT001, FBT002 - ) -> dict | None: - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - stored_record = existing_store_by_id._records.get(key) - - if stored_record is None: - return None - - record = { - 'key': stored_record['key'], - 'value': stored_record['value'], - 'contentType': stored_record.get('contentType'), - } - - if not as_bytes: - try: - record['value'] = maybe_parse_body(record['value'], record['contentType']) - except ValueError: - logger.exception('Error parsing key-value store record') - - async with existing_store_by_id._file_operation_lock: - await existing_store_by_id._update_timestamps(has_been_modified=False) - - return record - - async def get_record(self: KeyValueStoreClient, key: str) -> dict | None: - """Retrieve the given record from the key-value store. - - Args: - key (str): Key of the record to retrieve - - Returns: - dict, optional: The requested record, or None, if the record does not exist - """ - return await self._get_record_internal(key) - - async def get_record_as_bytes(self: KeyValueStoreClient, key: str) -> dict | None: - """Retrieve the given record from the key-value store, without parsing it. 
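# Sketch of the exclusive-start-key paging emulated by list_keys above: keys
# are sorted lexically, everything up to and including the start key is
# skipped, and a next start key is reported unless the page ends the listing.
from __future__ import annotations


def list_keys(keys: list, *, limit: int, exclusive_start_key: str | None = None) -> dict:
    ordered = sorted(keys)
    if exclusive_start_key is not None and exclusive_start_key in ordered:
        ordered = ordered[ordered.index(exclusive_start_key) + 1:]
    page = ordered[:limit]
    is_last_page = not ordered[limit:]
    return {
        'items': page,
        'isTruncated': not is_last_page,
        'nextExclusiveStartKey': None if is_last_page else page[-1],
    }


first_page = list_keys(['b', 'a', 'd', 'c'], limit=2)
second_page = list_keys(['b', 'a', 'd', 'c'], limit=2, exclusive_start_key=first_page['nextExclusiveStartKey'])
print(first_page['items'], second_page['items'])  # -> ['a', 'b'] ['c', 'd']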
- - Args: - key (str): Key of the record to retrieve - - Returns: - dict, optional: The requested record, or None, if the record does not exist - """ - return await self._get_record_internal(key, as_bytes=True) - - async def stream_record(self: KeyValueStoreClient, _key: str) -> AsyncIterator[dict | None]: - raise NotImplementedError('This method is not supported in local memory storage.') - - async def set_record(self: KeyValueStoreClient, key: str, value: Any, content_type: str | None = None) -> None: - """Set a value to the given record in the key-value store. - - Args: - key (str): The key of the record to save the value to - value (Any): The value to save into the record - content_type (str, optional): The content type of the saved value - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - if isinstance(value, io.IOBase): - raise NotImplementedError('File-like values are not supported in local memory storage') - - if content_type is None: - if is_file_or_bytes(value): - content_type = 'application/octet-stream' - elif isinstance(value, str): - content_type = 'text/plain; charset=utf-8' - else: - content_type = 'application/json; charset=utf-8' - - if 'application/json' in content_type and not is_file_or_bytes(value) and not isinstance(value, str): - value = json_dumps(value).encode('utf-8') - - async with existing_store_by_id._file_operation_lock: - await existing_store_by_id._update_timestamps(has_been_modified=True) - record: KeyValueStoreRecord = { - 'key': key, - 'value': value, - 'contentType': content_type, - } - - old_record = existing_store_by_id._records.get(key) - existing_store_by_id._records[key] = record - - if self._memory_storage_client._persist_storage: - if old_record is not None and _filename_from_record(old_record) != _filename_from_record(record): - await existing_store_by_id._delete_persisted_record(old_record) - - await existing_store_by_id._persist_record(record) - - async def _persist_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None: - store_directory = self._resource_directory - record_filename = _filename_from_record(record) - record['filename'] = record_filename - - # Ensure the directory for the entity exists - await makedirs(store_directory, exist_ok=True) - - # Create files for the record - record_path = os.path.join(store_directory, record_filename) - record_metadata_path = os.path.join(store_directory, record_filename + '.__metadata__.json') - - # Convert to bytes if string - if isinstance(record['value'], str): - record['value'] = record['value'].encode('utf-8') - - async with aiofiles.open(record_path, mode='wb') as f: - await f.write(record['value']) - - if self._memory_storage_client._write_metadata: - async with aiofiles.open(record_metadata_path, mode='wb') as f: - await f.write( - json_dumps( - { - 'key': record['key'], - 'contentType': record['contentType'], - } - ).encode('utf-8') - ) - - async def delete_record(self: KeyValueStoreClient, key: str) -> None: - """Delete the specified record from the key-value store. 
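# Sketch of the content-type defaulting performed by set_record above: bytes
# default to octet-stream, strings to text/plain, anything else is serialized
# as JSON. The helper name is illustrative; json.dumps stands in for the
# json_dumps utility used by the deleted client.
from __future__ import annotations

import json


def prepare_record_value(value: object, content_type: str | None = None) -> tuple:
    if content_type is None:
        if isinstance(value, (bytes, bytearray)):
            content_type = 'application/octet-stream'
        elif isinstance(value, str):
            content_type = 'text/plain; charset=utf-8'
        else:
            content_type = 'application/json; charset=utf-8'

    if 'application/json' in content_type and not isinstance(value, (bytes, bytearray, str)):
        value = json.dumps(value, ensure_ascii=False)

    body = value if isinstance(value, (bytes, bytearray)) else str(value).encode('utf-8')
    return bytes(body), content_type


print(prepare_record_value({'hello': 'world'}))
# -> (b'{"hello": "world"}', 'application/json; charset=utf-8')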
- - Args: - key (str): The key of the record which to delete - """ - # Check by id - existing_store_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_store_by_id is None: - raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - - record = existing_store_by_id._records.get(key) - - if record is not None: - async with existing_store_by_id._file_operation_lock: - del existing_store_by_id._records[key] - await existing_store_by_id._update_timestamps(has_been_modified=True) - if self._memory_storage_client._persist_storage: - await existing_store_by_id._delete_persisted_record(record) - - async def _delete_persisted_record(self: KeyValueStoreClient, record: KeyValueStoreRecord) -> None: - store_directory = self._resource_directory - record_filename = _filename_from_record(record) - - # Ensure the directory for the entity exists - await makedirs(store_directory, exist_ok=True) - - # Create files for the record - record_path = os.path.join(store_directory, record_filename) - record_metadata_path = os.path.join(store_directory, record_filename + '.__metadata__.json') - - await force_remove(record_path) - await force_remove(record_metadata_path) - - def _to_resource_info(self: KeyValueStoreClient) -> dict: - """Retrieve the key-value store info.""" - return { - 'id': self._id, - 'name': self._name, - 'accessedAt': self._accessed_at, - 'createdAt': self._created_at, - 'modifiedAt': self._modified_at, - 'userId': '1', - } - - async def _update_timestamps(self: KeyValueStoreClient, has_been_modified: bool) -> None: # noqa: FBT001 - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - kv_store_info = self._to_resource_info() - await update_metadata( - data=kv_store_info, - entity_directory=self._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - @classmethod - def _get_storages_dir(cls: type[KeyValueStoreClient], memory_storage_client: MemoryStorageClient) -> str: - return memory_storage_client._key_value_stores_directory - - @classmethod - def _get_storage_client_cache( - cls: type[KeyValueStoreClient], - memory_storage_client: MemoryStorageClient, - ) -> list[KeyValueStoreClient]: - return memory_storage_client._key_value_stores_handled - - @classmethod - def _create_from_directory( - cls: type[KeyValueStoreClient], - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> KeyValueStoreClient: - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - - store_metadata_path = os.path.join(storage_directory, '__metadata__.json') - if os.path.exists(store_metadata_path): - with open(store_metadata_path, encoding='utf-8') as f: - metadata = json.load(f) - id = metadata['id'] # noqa: A001 - name = metadata['name'] - created_at = datetime.fromisoformat(metadata['createdAt']) - accessed_at = datetime.fromisoformat(metadata['accessedAt']) - modified_at = datetime.fromisoformat(metadata['modifiedAt']) - - new_client = KeyValueStoreClient( - base_storage_directory=memory_storage_client._key_value_stores_directory, - memory_storage_client=memory_storage_client, - id=id, - name=name, - ) - - # Overwrite internal properties - new_client._accessed_at = accessed_at - new_client._created_at = created_at - new_client._modified_at = 
modified_at - - # Scan the key value store folder, check each entry in there and parse it as a store record - for entry in os.scandir(storage_directory): - if not entry.is_file(): - continue - - # Ignore metadata files on their own - if entry.name.endswith('__metadata__.json'): - continue - - with open(os.path.join(storage_directory, entry.name), 'rb') as f: - file_content = f.read() - - # Try checking if this file has a metadata file associated with it - metadata = None - if os.path.exists(os.path.join(storage_directory, entry.name + '.__metadata__.json')): - with open(os.path.join(storage_directory, entry.name + '.__metadata__.json'), encoding='utf-8') as metadata_file: - try: - metadata = json.load(metadata_file) - assert metadata.get('key') is not None # noqa: S101 - assert metadata.get('contentType') is not None # noqa: S101 - except Exception: - logger.warning( - f"""Metadata of key-value store entry "{entry.name}" for store {name or id} could not be parsed.""" - 'The metadata file will be ignored.', - exc_info=True, - ) - - if not metadata: - content_type, _ = mimetypes.guess_type(entry.name) - if content_type is None: - content_type = 'application/octet-stream' - - metadata = { - 'key': pathlib.Path(entry.name).stem, - 'contentType': content_type, - } - - try: - maybe_parse_body(file_content, metadata['contentType']) - except Exception: - metadata['contentType'] = 'application/octet-stream' - logger.warning( - f"""Key-value store entry "{metadata['key']}" for store {name or id} could not be parsed.""" - 'The entry will be assumed as binary.', - exc_info=True, - ) - - new_client._records[metadata['key']] = { - 'key': metadata['key'], - 'contentType': metadata['contentType'], - 'filename': entry.name, - 'value': file_content, - } - - return new_client diff --git a/src/apify/_memory_storage/resource_clients/key_value_store_collection.py b/src/apify/_memory_storage/resource_clients/key_value_store_collection.py deleted file mode 100644 index 9acb156e..00000000 --- a/src/apify/_memory_storage/resource_clients/key_value_store_collection.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient -from apify._memory_storage.resource_clients.key_value_store import KeyValueStoreClient - -if TYPE_CHECKING: - from apify_shared.models import ListPage - - -@ignore_docs -class KeyValueStoreCollectionClient(BaseResourceCollectionClient): - """Sub-client for manipulating key-value stores.""" - - def _get_storage_client_cache(self: KeyValueStoreCollectionClient) -> list[KeyValueStoreClient]: - return self._memory_storage_client._key_value_stores_handled - - def _get_resource_client_class(self: KeyValueStoreCollectionClient) -> type[KeyValueStoreClient]: - return KeyValueStoreClient - - async def list(self: KeyValueStoreCollectionClient) -> ListPage: - """List the available key-value stores. - - Returns: - ListPage: The list of available key-value stores matching the specified filters. - """ - return await super().list() - - async def get_or_create( - self: KeyValueStoreCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named key-value store, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the key-value store to retrieve or create. 
- schema (Dict, optional): The schema of the key-value store - - Returns: - dict: The retrieved or newly-created key-value store. - """ - return await super().get_or_create(name=name, schema=schema, _id=_id) diff --git a/src/apify/_memory_storage/resource_clients/request_queue.py b/src/apify/_memory_storage/resource_clients/request_queue.py deleted file mode 100644 index 1798c586..00000000 --- a/src/apify/_memory_storage/resource_clients/request_queue.py +++ /dev/null @@ -1,466 +0,0 @@ -from __future__ import annotations - -import asyncio -import json -import os -from datetime import datetime, timezone -from decimal import Decimal -from typing import TYPE_CHECKING - -import aioshutil -from apify_shared.utils import filter_out_none_values_recursively, ignore_docs, json_dumps -from sortedcollections import ValueSortedDict - -from apify._crypto import crypto_random_object_id -from apify._memory_storage.file_storage_utils import delete_request, update_metadata, update_request_queue_item -from apify._memory_storage.resource_clients.base_resource_client import BaseResourceClient -from apify._utils import force_rename, raise_on_duplicate_storage, raise_on_non_existing_storage, unique_key_to_request_id -from apify.consts import StorageTypes - -if TYPE_CHECKING: - from apify._memory_storage.memory_storage_client import MemoryStorageClient - - -@ignore_docs -class RequestQueueClient(BaseResourceClient): - """Sub-client for manipulating a single request queue.""" - - _id: str - _resource_directory: str - _memory_storage_client: MemoryStorageClient - _name: str | None - _requests: ValueSortedDict - _created_at: datetime - _accessed_at: datetime - _modified_at: datetime - _handled_request_count = 0 - _pending_request_count = 0 - _last_used_timestamp = Decimal(0.0) - _file_operation_lock: asyncio.Lock - - def __init__( - self: RequestQueueClient, - *, - base_storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> None: - """Initialize the RequestQueueClient.""" - self._id = id or crypto_random_object_id() - self._resource_directory = os.path.join(base_storage_directory, name or self._id) - self._memory_storage_client = memory_storage_client - self._name = name - self._requests = ValueSortedDict(lambda req: req.get('orderNo') or -float('inf')) - self._created_at = datetime.now(timezone.utc) - self._accessed_at = datetime.now(timezone.utc) - self._modified_at = datetime.now(timezone.utc) - self._file_operation_lock = asyncio.Lock() - - async def get(self: RequestQueueClient) -> dict | None: - """Retrieve the request queue. - - Returns: - dict, optional: The retrieved request queue, or None, if it does not exist - """ - found = self._find_or_create_client_by_id_or_name(memory_storage_client=self._memory_storage_client, id=self._id, name=self._name) - - if found: - async with found._file_operation_lock: - await found._update_timestamps(has_been_modified=False) - return found._to_resource_info() - - return None - - async def update(self: RequestQueueClient, *, name: str | None = None) -> dict: - """Update the request queue with specified fields. 
- - Args: - name (str, optional): The new name for the request queue - - Returns: - dict: The updated request queue - """ - # Check by id - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - # Skip if no changes - if name is None: - return existing_queue_by_id._to_resource_info() - - async with existing_queue_by_id._file_operation_lock: - # Check that name is not in use already - existing_queue_by_name = next( - (queue for queue in self._memory_storage_client._request_queues_handled if queue._name and queue._name.lower() == name.lower()), None - ) - - if existing_queue_by_name is not None: - raise_on_duplicate_storage(StorageTypes.REQUEST_QUEUE, 'name', name) - - existing_queue_by_id._name = name - - previous_dir = existing_queue_by_id._resource_directory - - existing_queue_by_id._resource_directory = os.path.join(self._memory_storage_client._request_queues_directory, name) - - await force_rename(previous_dir, existing_queue_by_id._resource_directory) - - # Update timestamps - await existing_queue_by_id._update_timestamps(has_been_modified=True) - - return existing_queue_by_id._to_resource_info() - - async def delete(self: RequestQueueClient) -> None: - """Delete the request queue.""" - queue = next((queue for queue in self._memory_storage_client._request_queues_handled if queue._id == self._id), None) - - if queue is not None: - async with queue._file_operation_lock: - self._memory_storage_client._request_queues_handled.remove(queue) - queue._pending_request_count = 0 - queue._handled_request_count = 0 - queue._requests.clear() - - if os.path.exists(queue._resource_directory): - await aioshutil.rmtree(queue._resource_directory) - - async def list_head(self: RequestQueueClient, *, limit: int | None = None) -> dict: - """Retrieve a given number of requests from the beginning of the queue. - - Args: - limit (int, optional): How many requests to retrieve - - Returns: - dict: The desired number of requests from the beginning of the queue. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - async with existing_queue_by_id._file_operation_lock: - await existing_queue_by_id._update_timestamps(has_been_modified=False) - - items: list[dict] = [] - - # Iterate all requests in the queue which have sorted key larger than infinity, which means `orderNo` is not `None` - # This will iterate them in order of `orderNo` - for request_key in existing_queue_by_id._requests.irange_key(min_key=-float('inf'), inclusive=(False, True)): - if len(items) == limit: - break - - request = existing_queue_by_id._requests.get(request_key) - - # Check that the request still exists and was not handled, - # in case something deleted it or marked it as handled concurrenctly - if request and request['orderNo']: - items.append(request) - - return { - 'limit': limit, - 'hadMultipleClients': False, - 'queueModifiedAt': existing_queue_by_id._modified_at, - 'items': [self._json_to_request(item['json']) for item in items], - } - - async def add_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict: - """Add a request to the queue. 
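The removed request-queue client keeps requests in a value-sorted mapping keyed by `orderNo`: pending requests carry a timestamp-based `orderNo` (negative when added to the forefront, so they sort first), while handled requests carry `None` and are skipped by `list_head` (see `_calculate_order_no` further below). A dependency-free sketch of that ordering rule, with invented request data:

```python
from decimal import Decimal

# Illustrative requests; 'orderNo' is None once a request has been handled.
requests = {
    'req-a': {'id': 'req-a', 'orderNo': Decimal('1716391111000.000001')},
    'req-b': {'id': 'req-b', 'orderNo': None},                              # already handled
    'req-c': {'id': 'req-c', 'orderNo': Decimal('-1716391112000.000002')},  # forefront request
    'req-d': {'id': 'req-d', 'orderNo': Decimal('1716391113000.000003')},
}


def list_head_ids(requests: dict, limit: int) -> list[str]:
    """Return up to `limit` pending request IDs in queue order (forefront first)."""
    pending = [req for req in requests.values() if req['orderNo'] is not None]
    pending.sort(key=lambda req: req['orderNo'])
    return [req['id'] for req in pending[:limit]]


print(list_head_ids(requests, limit=10))  # ['req-c', 'req-a', 'req-d']
```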
- - Args: - request (dict): The request to add to the queue - forefront (bool, optional): Whether to add the request to the head or the end of the queue - - Returns: - dict: The added request. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - request_model = self._create_internal_request(request, forefront) - - async with existing_queue_by_id._file_operation_lock: - existing_request_with_id = existing_queue_by_id._requests.get(request_model['id']) - - # We already have the request present, so we return information about it - if existing_request_with_id is not None: - await existing_queue_by_id._update_timestamps(has_been_modified=False) - - return { - 'requestId': existing_request_with_id['id'], - 'wasAlreadyHandled': existing_request_with_id['orderNo'] is None, - 'wasAlreadyPresent': True, - } - - existing_queue_by_id._requests[request_model['id']] = request_model - if request_model['orderNo'] is None: - existing_queue_by_id._handled_request_count += 1 - else: - existing_queue_by_id._pending_request_count += 1 - await existing_queue_by_id._update_timestamps(has_been_modified=True) - await update_request_queue_item( - request=request_model, - request_id=request_model['id'], - entity_directory=existing_queue_by_id._resource_directory, - persist_storage=self._memory_storage_client._persist_storage, - ) - - return { - 'requestId': request_model['id'], - # We return wasAlreadyHandled: false even though the request may - # have been added as handled, because that's how API behaves. - 'wasAlreadyHandled': False, - 'wasAlreadyPresent': False, - } - - async def get_request(self: RequestQueueClient, request_id: str) -> dict | None: - """Retrieve a request from the queue. - - Args: - request_id (str): ID of the request to retrieve - - Returns: - dict, optional: The retrieved request, or None, if it did not exist. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - async with existing_queue_by_id._file_operation_lock: - await existing_queue_by_id._update_timestamps(has_been_modified=False) - - request = existing_queue_by_id._requests.get(request_id) - return self._json_to_request(request['json'] if request is not None else None) - - async def update_request(self: RequestQueueClient, request: dict, *, forefront: bool | None = None) -> dict: - """Update a request in the queue. - - Args: - request (dict): The updated request - forefront (bool, optional): Whether to put the updated request in the beginning or the end of the queue - - Returns: - dict: The updated request - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - request_model = self._create_internal_request(request, forefront) - - # First we need to check the existing request to be - # able to return information about its handled state. - - existing_request = existing_queue_by_id._requests.get(request_model['id']) - - # Undefined means that the request is not present in the queue. 
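`add_request` above is idempotent: the request ID is derived deterministically from `uniqueKey` (compare `unique_key_to_request_id` later in this diff), and re-adding an existing request only reports `wasAlreadyPresent` / `wasAlreadyHandled`. A small sketch of that contract, independent of the real storage client (the truncation length and in-memory dict are illustrative):

```python
from __future__ import annotations

from base64 import b64encode
from hashlib import sha256

queue: dict[str, dict] = {}


def request_id_from_unique_key(unique_key: str) -> str:
    """Deterministic request ID derived from the unique key (length chosen for illustration)."""
    digest = b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')
    return ''.join(ch for ch in digest if ch not in '+/=')[:15]


def add_request(request: dict) -> dict:
    request_id = request_id_from_unique_key(request['uniqueKey'])
    existing = queue.get(request_id)
    if existing is not None:
        return {
            'requestId': request_id,
            'wasAlreadyPresent': True,
            'wasAlreadyHandled': existing.get('handledAt') is not None,
        }
    queue[request_id] = dict(request)
    return {'requestId': request_id, 'wasAlreadyPresent': False, 'wasAlreadyHandled': False}


print(add_request({'url': 'https://example.com', 'uniqueKey': 'https://example.com'}))
print(add_request({'url': 'https://example.com', 'uniqueKey': 'https://example.com'}))  # deduplicated
```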
- # We need to insert it, to behave the same as API. - if existing_request is None: - return await self.add_request(request, forefront=forefront) - - async with existing_queue_by_id._file_operation_lock: - # When updating the request, we need to make sure that - # the handled counts are updated correctly in all cases. - existing_queue_by_id._requests[request_model['id']] = request_model - - pending_count_adjustment = 0 - is_request_handled_state_changing = not isinstance(existing_request['orderNo'], type(request_model['orderNo'])) - request_was_handled_before_update = existing_request['orderNo'] is None - - # We add 1 pending request if previous state was handled - if is_request_handled_state_changing: - pending_count_adjustment = 1 if request_was_handled_before_update else -1 - - existing_queue_by_id._pending_request_count += pending_count_adjustment - existing_queue_by_id._handled_request_count -= pending_count_adjustment - await existing_queue_by_id._update_timestamps(has_been_modified=True) - await update_request_queue_item( - request=request_model, - request_id=request_model['id'], - entity_directory=existing_queue_by_id._resource_directory, - persist_storage=self._memory_storage_client._persist_storage, - ) - - return { - 'requestId': request_model['id'], - 'wasAlreadyHandled': request_was_handled_before_update, - 'wasAlreadyPresent': True, - } - - async def delete_request(self: RequestQueueClient, request_id: str) -> None: - """Delete a request from the queue. - - Args: - request_id (str): ID of the request to delete. - """ - existing_queue_by_id = self._find_or_create_client_by_id_or_name( - memory_storage_client=self._memory_storage_client, id=self._id, name=self._name - ) - - if existing_queue_by_id is None: - raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - - async with existing_queue_by_id._file_operation_lock: - request = existing_queue_by_id._requests.get(request_id) - - if request: - del existing_queue_by_id._requests[request_id] - if request['orderNo'] is None: - existing_queue_by_id._handled_request_count -= 1 - else: - existing_queue_by_id._pending_request_count -= 1 - await existing_queue_by_id._update_timestamps(has_been_modified=True) - await delete_request(entity_directory=existing_queue_by_id._resource_directory, request_id=request_id) - - def _to_resource_info(self: RequestQueueClient) -> dict: - """Retrieve the request queue store info.""" - return { - 'accessedAt': self._accessed_at, - 'createdAt': self._created_at, - 'hadMultipleClients': False, - 'handledRequestCount': self._handled_request_count, - 'id': self._id, - 'modifiedAt': self._modified_at, - 'name': self._name, - 'pendingRequestCount': self._pending_request_count, - 'stats': {}, - 'totalRequestCount': len(self._requests), - 'userId': '1', - } - - async def _update_timestamps(self: RequestQueueClient, has_been_modified: bool) -> None: # noqa: FBT001 - self._accessed_at = datetime.now(timezone.utc) - - if has_been_modified: - self._modified_at = datetime.now(timezone.utc) - - request_queue_info = self._to_resource_info() - await update_metadata( - data=request_queue_info, - entity_directory=self._resource_directory, - write_metadata=self._memory_storage_client._write_metadata, - ) - - def _json_to_request(self: RequestQueueClient, request_json: str | None) -> dict | None: - if request_json is None: - return None - request = json.loads(request_json) - return filter_out_none_values_recursively(request) - - def _create_internal_request(self: RequestQueueClient, request: dict, forefront: 
bool | None) -> dict: - order_no = self._calculate_order_no(request, forefront) - id = unique_key_to_request_id(request['uniqueKey']) # noqa: A001 - - if request.get('id') is not None and request['id'] != id: - raise ValueError('Request ID does not match its unique_key.') - - json_request = json_dumps({**request, 'id': id}) - return { - 'id': id, - 'json': json_request, - 'method': request.get('method'), - 'orderNo': order_no, - 'retryCount': request.get('retryCount', 0), - 'uniqueKey': request['uniqueKey'], - 'url': request['url'], - } - - def _calculate_order_no(self: RequestQueueClient, request: dict, forefront: bool | None) -> Decimal | None: - if request.get('handledAt') is not None: - return None - - # Get the current timestamp in milliseconds - timestamp = Decimal(datetime.now(timezone.utc).timestamp()) * 1000 - timestamp = round(timestamp, 6) - - # Make sure that this timestamp was not used yet, so that we have unique orderNos - if timestamp <= self._last_used_timestamp: - timestamp = self._last_used_timestamp + Decimal(0.000001) - - self._last_used_timestamp = timestamp - - return -timestamp if forefront else timestamp - - @classmethod - def _get_storages_dir(cls: type[RequestQueueClient], memory_storage_client: MemoryStorageClient) -> str: - return memory_storage_client._request_queues_directory - - @classmethod - def _get_storage_client_cache( - cls: type[RequestQueueClient], - memory_storage_client: MemoryStorageClient, - ) -> list[RequestQueueClient]: - return memory_storage_client._request_queues_handled - - @classmethod - def _create_from_directory( - cls: type[RequestQueueClient], - storage_directory: str, - memory_storage_client: MemoryStorageClient, - id: str | None = None, # noqa: A002 - name: str | None = None, - ) -> RequestQueueClient: - created_at = datetime.now(timezone.utc) - accessed_at = datetime.now(timezone.utc) - modified_at = datetime.now(timezone.utc) - handled_request_count = 0 - pending_request_count = 0 - entries: list[dict] = [] - - # Access the request queue folder - for entry in os.scandir(storage_directory): - if entry.is_file(): - if entry.name == '__metadata__.json': - # We have found the queue's metadata file, build out information based on it - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - metadata = json.load(f) - id = metadata['id'] # noqa: A001 - name = metadata['name'] - created_at = datetime.fromisoformat(metadata['createdAt']) - accessed_at = datetime.fromisoformat(metadata['accessedAt']) - modified_at = datetime.fromisoformat(metadata['modifiedAt']) - handled_request_count = metadata['handledRequestCount'] - pending_request_count = metadata['pendingRequestCount'] - - continue - - with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f: - request = json.load(f) - if request.get('orderNo'): - request['orderNo'] = Decimal(request.get('orderNo')) - entries.append(request) - - new_client = cls( - base_storage_directory=memory_storage_client._request_queues_directory, - memory_storage_client=memory_storage_client, - id=id, - name=name, - ) - - # Overwrite properties - new_client._accessed_at = accessed_at - new_client._created_at = created_at - new_client._modified_at = modified_at - new_client._handled_request_count = handled_request_count - new_client._pending_request_count = pending_request_count - - for request in entries: - new_client._requests[request['id']] = request - - return new_client diff --git a/src/apify/_memory_storage/resource_clients/request_queue_collection.py 
b/src/apify/_memory_storage/resource_clients/request_queue_collection.py deleted file mode 100644 index dd69c918..00000000 --- a/src/apify/_memory_storage/resource_clients/request_queue_collection.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage.resource_clients.base_resource_collection_client import BaseResourceCollectionClient -from apify._memory_storage.resource_clients.request_queue import RequestQueueClient - -if TYPE_CHECKING: - from apify_shared.models import ListPage - - -@ignore_docs -class RequestQueueCollectionClient(BaseResourceCollectionClient): - """Sub-client for manipulating request queues.""" - - def _get_storage_client_cache(self: RequestQueueCollectionClient) -> list[RequestQueueClient]: - return self._memory_storage_client._request_queues_handled - - def _get_resource_client_class(self: RequestQueueCollectionClient) -> type[RequestQueueClient]: - return RequestQueueClient - - async def list(self: RequestQueueCollectionClient) -> ListPage: - """List the available request queues. - - Returns: - ListPage: The list of available request queues matching the specified filters. - """ - return await super().list() - - async def get_or_create( - self: RequestQueueCollectionClient, - *, - name: str | None = None, - schema: dict | None = None, - _id: str | None = None, - ) -> dict: - """Retrieve a named request queue, or create a new one when it doesn't exist. - - Args: - name (str, optional): The name of the request queue to retrieve or create. - schema (dict, optional): The schema of the request queue - - Returns: - dict: The retrieved or newly-created request queue. - """ - return await super().get_or_create(name=name, schema=schema, _id=_id) diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py new file mode 100644 index 00000000..0eb0dda6 --- /dev/null +++ b/src/apify/_platform_event_manager.py @@ -0,0 +1,201 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime # noqa: TCH003 +from typing import TYPE_CHECKING, Annotated, Any, Literal, Union + +import websockets.client +from pydantic import BaseModel, Discriminator, Field, TypeAdapter +from typing_extensions import Self, Unpack, override + +from apify_shared.utils import ignore_docs +from crawlee.events._event_manager import EventManager, EventManagerOptions +from crawlee.events._local_event_manager import LocalEventManager +from crawlee.events._types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData + +from apify._log import logger + +if TYPE_CHECKING: + from types import TracebackType + + from apify._configuration import Configuration + + +__all__ = ['EventManager', 'LocalEventManager', 'PlatformEventManager'] + + +class PersistStateEvent(BaseModel): + name: Literal[Event.PERSIST_STATE] + data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))] + + +class SystemInfoEventData(BaseModel): + mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')] + mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')] + mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')] + cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')] + cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')] + cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')] + is_cpu_overloaded: Annotated[bool, 
Field(alias='isCpuOverloaded')] + created_at: Annotated[datetime, Field(alias='createdAt')] + + def to_crawlee_format(self) -> EventSystemInfoData: + return EventSystemInfoData.model_validate( + { + 'cpu_info': { + 'used_ratio': self.cpu_current_usage, + 'created_at': self.created_at, + }, + 'memory_info': { + 'total_size': self.mem_max_bytes, + 'current_size': self.mem_current_bytes, + 'created_at': self.created_at, + }, + } + ) + + +class SystemInfoEvent(BaseModel): + name: Literal[Event.SYSTEM_INFO] + data: SystemInfoEventData + + +class MigratingEvent(BaseModel): + name: Literal[Event.MIGRATING] + data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)] + + +class AbortingEvent(BaseModel): + name: Literal[Event.ABORTING] + data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)] + + +class ExitEvent(BaseModel): + name: Literal[Event.EXIT] + data: Annotated[EventExitData, Field(default_factory=EventExitData)] + + +class EventWithoutData(BaseModel): + name: Literal[ + Event.SESSION_RETIRED, + Event.BROWSER_LAUNCHED, + Event.BROWSER_RETIRED, + Event.BROWSER_CLOSED, + Event.PAGE_CREATED, + Event.PAGE_CLOSED, + ] + data: Any = None + + +class UnknownEvent(BaseModel): + name: str + data: Annotated[dict[str, Any], Field(default_factory=dict)] + + +EventMessage = Union[ + PersistStateEvent, + SystemInfoEvent, + MigratingEvent, + AbortingEvent, + ExitEvent, + EventWithoutData, +] + + +event_data_adapter: TypeAdapter[EventMessage | UnknownEvent] = TypeAdapter( + Union[ + Annotated[ + EventMessage, + Discriminator('name'), + ], + UnknownEvent, + ] +) + + +@ignore_docs +class PlatformEventManager(EventManager): + """A class for managing Actor events. + + You shouldn't use this class directly, + but instead use it via the `Actor.on()` and `Actor.off()` methods. + """ + + _platform_events_websocket: websockets.client.WebSocketClientProtocol | None = None + _process_platform_messages_task: asyncio.Task | None = None + _send_system_info_interval_task: asyncio.Task | None = None + _connected_to_platform_websocket: asyncio.Future = asyncio.Future() + + def __init__(self, config: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None: + """Create an instance of the EventManager. + + Args: + config: The Actor configuration to be used in this event manager. 
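The new `_platform_event_manager.py` parses incoming websocket messages with a pydantic `TypeAdapter` over a union discriminated by the `name` field, falling back to an "unknown event" model for anything it does not recognise. A stripped-down, standalone sketch of that parsing pattern (event names and payload shapes here are simplified for illustration):

```python
from typing import Annotated, Literal, Union

from pydantic import BaseModel, Discriminator, Field, TypeAdapter


class PersistState(BaseModel):
    name: Literal['persistState']
    data: Annotated[dict, Field(default_factory=dict)]


class SystemInfo(BaseModel):
    name: Literal['systemInfo']
    data: dict


class Unknown(BaseModel):
    name: str
    data: Annotated[dict, Field(default_factory=dict)]


adapter = TypeAdapter(
    Union[
        Annotated[Union[PersistState, SystemInfo], Discriminator('name')],
        Unknown,
    ]
)

print(adapter.validate_json('{"name": "persistState", "data": {"isMigrating": false}}'))
print(adapter.validate_json('{"name": "somethingNew", "data": {"foo": 1}}'))  # falls back to Unknown
```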
+ kwargs: Event manager options - forwarded to the base class + """ + super().__init__(**kwargs) + + self._config = config + self._listener_tasks = set() + self._connected_to_platform_websocket = asyncio.Future[bool]() + + @override + async def __aenter__(self) -> Self: + await super().__aenter__() + self._connected_to_platform_websocket = asyncio.Future() + + # Run tasks but don't await them + if self._config.actor_events_ws_url: + self._process_platform_messages_task = asyncio.create_task(self._process_platform_messages(self._config.actor_events_ws_url)) + is_connected = await self._connected_to_platform_websocket + if not is_connected: + raise RuntimeError('Error connecting to platform events websocket!') + else: + logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.') + + return self + + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + if self._platform_events_websocket: + await self._platform_events_websocket.close() + + if self._process_platform_messages_task: + await self._process_platform_messages_task + + await super().__aexit__(exc_type, exc_value, exc_traceback) + + async def _process_platform_messages(self, ws_url: str) -> None: + try: + async with websockets.client.connect(ws_url) as websocket: + self._platform_events_websocket = websocket + self._connected_to_platform_websocket.set_result(True) + + async for message in websocket: + try: + parsed_message = event_data_adapter.validate_json(message) + + if isinstance(parsed_message, UnknownEvent): + logger.info(f'Unknown message received: event_name={parsed_message.name}, event_data={parsed_message.data}') + continue + + self.emit( + event=parsed_message.name, + event_data=parsed_message.data + if not isinstance(parsed_message.data, SystemInfoEventData) + else parsed_message.data.to_crawlee_format(), + ) + + if parsed_message.name == Event.MIGRATING: + await self._emit_persist_state_event_rec_task.stop() + self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True)) + except Exception: + logger.exception('Cannot parse Actor event', extra={'message': message}) + except Exception: + logger.exception('Error in websocket connection') + self._connected_to_platform_websocket.set_result(False) diff --git a/src/apify/proxy_configuration.py b/src/apify/_proxy_configuration.py similarity index 52% rename from src/apify/proxy_configuration.py rename to src/apify/_proxy_configuration.py index 51e6b7c0..47347c8b 100644 --- a/src/apify/proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -1,21 +1,26 @@ from __future__ import annotations -import inspect import ipaddress import re -from typing import TYPE_CHECKING, Any, Awaitable, Callable, Pattern, TypedDict +from dataclasses import dataclass, field +from re import Pattern +from typing import TYPE_CHECKING, Any from urllib.parse import urljoin, urlparse import httpx + from apify_shared.consts import ApifyEnvVars from apify_shared.utils import ignore_docs +from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration +from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo +from crawlee.proxy_configuration import _NewUrlFunction -from apify.config import Configuration -from apify.log import logger +from apify._configuration import Configuration +from apify._log import logger if TYPE_CHECKING: from apify_client import ApifyClientAsync - from 
typing_extensions import NotRequired + from crawlee import Request APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$') COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$') @@ -62,30 +67,16 @@ def _check( raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}') -class ProxyInfo(TypedDict): +@dataclass +class ProxyInfo(CrawleeProxyInfo): """Provides information about a proxy connection that is used for requests.""" - url: str - """The URL of the proxy.""" - - hostname: str - """The hostname of the proxy.""" - - port: int - """The proxy port.""" - - username: NotRequired[str] - """The username for the proxy.""" - - password: str - """The password for the proxy.""" - - groups: NotRequired[list[str]] + groups: list[str] = field(default_factory=list) """An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy). If not provided, the proxy will select the groups automatically. """ - country_code: NotRequired[str] + country_code: str | None = None """If set and relevant proxies are available in your Apify account, all proxied requests will use IP addresses that are geolocated to the specified country. For example `GB` for IPs from Great Britain. Note that online services often have their own rules for handling @@ -96,11 +87,8 @@ class ProxyInfo(TypedDict): This parameter is optional, by default, the proxy uses all available proxy servers from all countries. """ - session_id: NotRequired[str] - """The identifier of the used proxy session, if used. Using the same session ID guarantees getting the same proxy URL.""" - -class ProxyConfiguration: +class ProxyConfiguration(CrawleeProxyConfiguration): """Configures a connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or blacklists. @@ -112,87 +100,76 @@ class ProxyConfiguration: Your list of proxy URLs will be rotated by the configuration, if this option is provided. """ - is_man_in_the_middle = False - - _next_custom_url_index = 0 - _proxy_urls: list[str] - _used_proxy_urls: dict[str, str] - _new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None - _groups: list[str] - _country_code: str | None = None - _password: str | None = None - _hostname: str - _port: int - _uses_apify_proxy: bool | None = None - _actor_config: Configuration - _apify_client: ApifyClientAsync | None = None + _configuration: Configuration @ignore_docs def __init__( - self: ProxyConfiguration, + self, *, password: str | None = None, groups: list[str] | None = None, country_code: str | None = None, proxy_urls: list[str] | None = None, - new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None, + new_url_function: _NewUrlFunction | None = None, + tiered_proxy_urls: list[list[str]] | None = None, _actor_config: Configuration | None = None, _apify_client: ApifyClientAsync | None = None, ) -> None: """Create a ProxyConfiguration instance. It is highly recommended to use `Actor.create_proxy_configuration()` instead of this. Args: - password (str, optional): Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. - groups (list of str, optional): Proxy groups which the Apify Proxy should use, if provided. - country_code (str, optional): Country which the Apify Proxy should use, if provided. - proxy_urls (list of str, optional): Custom proxy server URLs which should be rotated through. 
- new_url_function (Callable, optional): Function which returns a custom proxy URL to be used. + password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. + groups: Proxy groups which the Apify Proxy should use, if provided. + country_code: Country which the Apify Proxy should use, if provided. + proxy_urls: Custom proxy server URLs which should be rotated through. + new_url_function: Function which returns a custom proxy URL to be used. + tiered_proxy_urls: Proxy URLs arranged into tiers """ + _actor_config = _actor_config or Configuration.get_global_configuration() + if groups: groups = [str(group) for group in groups] for group in groups: _check(group, label='groups', pattern=APIFY_PROXY_VALUE_REGEX) + if country_code: country_code = str(country_code) _check(country_code, label='country_code', pattern=COUNTRY_CODE_REGEX) - if proxy_urls: - for i, url in enumerate(proxy_urls): - if not is_url(url): - raise ValueError(f'proxy_urls[{i}] ("{url}") is not a valid URL') - # Validation - if proxy_urls and new_url_function: - raise ValueError('Cannot combine custom proxies in "proxy_urls" with custom generating function in "new_url_function".') - - if (proxy_urls or new_url_function) and (groups or country_code): + if (proxy_urls or new_url_function or tiered_proxy_urls) and (groups or country_code): raise ValueError( 'Cannot combine custom proxies with Apify Proxy!' ' It is not allowed to set "proxy_urls" or "new_url_function" combined with' ' "groups" or "country_code".' ) - # mypy has a bug with narrowing types for filter (https://github.com/python/mypy/issues/12682) - if proxy_urls and next(filter(lambda url: 'apify.com' in url, proxy_urls), None): # type: ignore + if proxy_urls and any('apify.com' in url for url in proxy_urls): logger.warning( 'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties instead of `proxy_urls`.\n' 'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration' ) - self._actor_config = _actor_config or Configuration._get_default_instance() + self._uses_apify_proxy = not (proxy_urls or new_url_function or tiered_proxy_urls) + + super().__init__( + proxy_urls=[f'http://{_actor_config.proxy_hostname}:{_actor_config.proxy_port}'] if self._uses_apify_proxy else proxy_urls, + new_url_function=new_url_function, + tiered_proxy_urls=tiered_proxy_urls, + ) + self._configuration = _actor_config + + self.is_man_in_the_middle = False + self._apify_client = _apify_client - self._hostname = self._actor_config.proxy_hostname - self._port = self._actor_config.proxy_port - self._password = password or self._actor_config.proxy_password + self._hostname = self._configuration.proxy_hostname + self._port = self._configuration.proxy_port + self._password = password or self._configuration.proxy_password - self._proxy_urls = list(proxy_urls) if proxy_urls else [] - self._used_proxy_urls = {} - self._new_url_function = new_url_function self._groups = list(groups) if groups else [] self._country_code = country_code - self._uses_apify_proxy = not (proxy_urls or new_url_function) - async def initialize(self: ProxyConfiguration) -> None: + async def initialize(self) -> None: """Load the Apify Proxy password if the API token is provided and check access to Apify Proxy and provided proxy groups. Only called if Apify Proxy configuration is used. 
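Since `ProxyConfiguration` now builds on crawlee's implementation, the usual way to obtain and use it inside an Actor stays the same: create it via `Actor.create_proxy_configuration()` and ask it for proxy info per session. A hedged usage sketch (the group and country values are examples and depend on what your Apify account has access to):

```python
import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        # Example values; RESIDENTIAL/US availability depends on your Apify plan.
        proxy_configuration = await Actor.create_proxy_configuration(
            groups=['RESIDENTIAL'],
            country_code='US',
        )

        if proxy_configuration is None:
            Actor.log.warning('No proxy configuration available.')
            return

        # Requests sharing a session_id are routed through the same proxy server.
        proxy_info = await proxy_configuration.new_proxy_info(session_id='my_session_1')
        if proxy_info is not None:
            Actor.log.info(f'Proxy URL for this session: {proxy_info.url}')


if __name__ == '__main__':
    asyncio.run(main())
```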
@@ -205,100 +182,65 @@ async def initialize(self: ProxyConfiguration) -> None: await self._maybe_fetch_password() await self._check_access() - async def new_url(self: ProxyConfiguration, session_id: int | str | None = None) -> str: - """Return a new proxy URL based on provided configuration options and the `sessionId` parameter. - - Args: - session_id (int or str, optional): Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). - All the HTTP requests going through the proxy with the same session identifier - will use the same target proxy server (i.e. the same IP address). - The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. - - Returns: - str: A string with a proxy URL, including authentication credentials and port number. - For example, `http://bob:password123@proxy.example.com:8000` - """ - if session_id is not None: - session_id = f'{session_id}' - _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX) - - if self._new_url_function: - try: - res = self._new_url_function(session_id) - if inspect.isawaitable(res): - res = await res - return str(res) - except Exception as exc: - raise ValueError('The provided "new_url_function" did not return a valid URL') from exc - - if self._proxy_urls: - if not session_id: - index = self._next_custom_url_index - self._next_custom_url_index = (self._next_custom_url_index + 1) % len(self._proxy_urls) - return self._proxy_urls[index] - - if session_id not in self._used_proxy_urls: - index = self._next_custom_url_index - self._next_custom_url_index = (self._next_custom_url_index + 1) % len(self._proxy_urls) - self._used_proxy_urls[session_id] = self._proxy_urls[index] - - return self._used_proxy_urls[session_id] - - username = self._get_username(session_id) - - return f'http://{username}:{self._password}@{self._hostname}:{self._port}' - - async def new_proxy_info(self: ProxyConfiguration, session_id: int | str | None = None) -> ProxyInfo: + async def new_proxy_info( + self, + session_id: str | None = None, + request: Request | None = None, + proxy_tier: int | None = None, + ) -> ProxyInfo | None: """Create a new ProxyInfo object. Use it if you want to work with a rich representation of a proxy URL. If you need the URL string only, use `ProxyConfiguration.new_url`. Args: - session_id (int or str, optional): Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). - All the HTTP requests going through the proxy with the same session identifier - will use the same target proxy server (i.e. the same IP address). - The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. - - Returns: - ProxyInfo: Dictionary that represents information about the proxy and its configuration. + session_id: Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). + All the HTTP requests going through the proxy with the same session identifier + will use the same target proxy server (i.e. the same IP address). + The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. + request: request for which the proxy info is being issued, used in proxy tier handling + proxy_tier: allows forcing the proxy tier to be used + + Returns: Dictionary that represents information about the proxy and its configuration. 
""" if session_id is not None: - session_id = f'{session_id}' _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX) - url = await self.new_url(session_id) - res: ProxyInfo + proxy_info = await super().new_proxy_info(session_id=session_id, request=request, proxy_tier=proxy_tier) + + if proxy_info is None: + return None + if self._uses_apify_proxy: - res = { - 'url': url, - 'hostname': self._hostname, - 'port': self._port, - 'username': self._get_username(session_id), - 'password': self._password or '', - 'groups': self._groups, - } - if self._country_code: - res['country_code'] = self._country_code - if session_id is not None: - res['session_id'] = session_id - return res - - parsed_url = urlparse(url) - assert parsed_url.hostname is not None # noqa: S101 - assert parsed_url.port is not None # noqa: S101 - res = { - 'url': url, - 'hostname': parsed_url.hostname, - 'port': parsed_url.port, - 'password': parsed_url.password or '', - } - if parsed_url.username: - res['username'] = parsed_url.username - return res - - async def _maybe_fetch_password(self: ProxyConfiguration) -> None: - token = self._actor_config.token + parsed_url = httpx.URL(proxy_info.url) + username = self._get_username(session_id) + + return ProxyInfo( + url=f'http://{username}:{self._password or ""}@{parsed_url.host}:{parsed_url.port}', + scheme='http', + hostname=proxy_info.hostname, + port=proxy_info.port, + username=username, + password=self._password or '', + session_id=proxy_info.session_id, + proxy_tier=proxy_info.proxy_tier, + groups=self._groups, + country_code=self._country_code or None, + ) + + return ProxyInfo( + url=proxy_info.url, + scheme=proxy_info.scheme, + hostname=proxy_info.hostname, + port=proxy_info.port, + username=proxy_info.username, + password=proxy_info.password, + session_id=proxy_info.session_id, + proxy_tier=proxy_info.proxy_tier, + ) + + async def _maybe_fetch_password(self) -> None: + token = self._configuration.token if token and self._apify_client: user_info = await self._apify_client.user().get() @@ -321,11 +263,15 @@ async def _maybe_fetch_password(self: ProxyConfiguration) -> None: f' If you add the "{ApifyEnvVars.TOKEN}" environment variable, the password will be automatically inferred.' ) - async def _check_access(self: ProxyConfiguration) -> None: - proxy_status_url = f'{self._actor_config.proxy_status_url}/?format=json' + async def _check_access(self) -> None: + proxy_status_url = f'{self._configuration.proxy_status_url}/?format=json' + proxy_info = await self.new_proxy_info() + + if proxy_info is None: + return status = None - async with httpx.AsyncClient(proxies=await self.new_url(), timeout=10) as client: + async with httpx.AsyncClient(proxies=proxy_info.url, timeout=10) as client: for _ in range(2): try: response = await client.get(proxy_status_url) @@ -346,7 +292,7 @@ async def _check_access(self: ProxyConfiguration) -> None: "If you see some, it most likely means you don't have access to either all or some of the proxies you're trying to use." 
) - def _get_username(self: ProxyConfiguration, session_id: int | str | None = None) -> str: + def _get_username(self, session_id: int | str | None = None) -> str: if session_id is not None: session_id = f'{session_id}' diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 6322f0ec..687bf93c 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -1,67 +1,8 @@ from __future__ import annotations -import asyncio import builtins -import contextlib -import functools -import inspect -import json -import mimetypes -import os -import re import sys -import time -from base64 import b64encode -from collections import OrderedDict -from collections.abc import MutableMapping -from datetime import datetime, timezone -from hashlib import sha256 from importlib import metadata -from logging import getLogger -from typing import ( - Any, - Callable, - Generic, - ItemsView, - Iterator, - NoReturn, - TypeVar, - ValuesView, - cast, - overload, -) -from typing import OrderedDict as OrderedDictType -from urllib.parse import parse_qsl, urlencode, urlparse - -import aioshutil -import psutil -from aiofiles import ospath -from aiofiles.os import remove, rename -from apify_shared.consts import ( - BOOL_ENV_VARS, - BOOL_ENV_VARS_TYPE, - DATETIME_ENV_VARS, - DATETIME_ENV_VARS_TYPE, - FLOAT_ENV_VARS, - FLOAT_ENV_VARS_TYPE, - INTEGER_ENV_VARS, - INTEGER_ENV_VARS_TYPE, - STRING_ENV_VARS_TYPE, - ActorEnvVars, - ApifyEnvVars, -) -from apify_shared.utils import ( - ignore_docs, - is_content_type_json, - is_content_type_text, - is_content_type_xml, - maybe_extract_enum_member_value, -) - -from apify.consts import REQUEST_ID_LENGTH, StorageTypes - -T = TypeVar('T') -logger = getLogger(__name__) def get_system_info() -> dict: @@ -80,443 +21,5 @@ def get_system_info() -> dict: return system_info -DualPropertyType = TypeVar('DualPropertyType') -DualPropertyOwner = TypeVar('DualPropertyOwner') - - -@ignore_docs -class dualproperty(Generic[DualPropertyType]): # noqa: N801 - """Descriptor combining `property` and `classproperty`. - - When accessing the decorated attribute on an instance, it calls the getter with the instance as the first argument, - and when accessing it on a class, it calls the getter with the class as the first argument. - """ - - def __init__(self: dualproperty, getter: Callable[..., DualPropertyType]) -> None: - """Initialize the dualproperty. - - Args: - getter (Callable): The getter of the property. - It should accept either an instance or a class as its first argument. - """ - self.getter = getter - - def __get__(self: dualproperty, obj: DualPropertyOwner | None, owner: type[DualPropertyOwner]) -> DualPropertyType: - """Call the getter with the right object. - - Args: - obj (T | None): The instance of class T on which the getter will be called - owner (type[T]): The class object of class T on which the getter will be called, if obj is None - - Returns: - The result of the getter. - """ - val = self.getter(obj or owner) - return cast(DualPropertyType, val) - - -@overload -def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE) -> bool | None: - ... - - -@overload -def fetch_and_parse_env_var(env_var: BOOL_ENV_VARS_TYPE, default: bool) -> bool: # noqa: FBT001 - ... - - -@overload -def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE) -> datetime | str | None: - ... - - -@overload -def fetch_and_parse_env_var(env_var: DATETIME_ENV_VARS_TYPE, default: datetime) -> datetime | str: - ... - - -@overload -def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE) -> float | None: - ... 
- - -@overload -def fetch_and_parse_env_var(env_var: FLOAT_ENV_VARS_TYPE, default: float) -> float: - ... - - -@overload -def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE) -> int | None: - ... - - -@overload -def fetch_and_parse_env_var(env_var: INTEGER_ENV_VARS_TYPE, default: int) -> int: - ... - - -@overload -def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE, default: str) -> str: - ... - - -@overload -def fetch_and_parse_env_var(env_var: STRING_ENV_VARS_TYPE) -> str | None: - ... - - -@overload -def fetch_and_parse_env_var(env_var: ActorEnvVars | ApifyEnvVars) -> Any: - ... - - -def fetch_and_parse_env_var(env_var: Any, default: Any = None) -> Any: - env_var_name = str(maybe_extract_enum_member_value(env_var)) - - val = os.getenv(env_var_name) - if not val: - return default - - if env_var in BOOL_ENV_VARS: - return maybe_parse_bool(val) - if env_var in FLOAT_ENV_VARS: - parsed_float = maybe_parse_float(val) - if parsed_float is None: - return default - return parsed_float - if env_var in INTEGER_ENV_VARS: - parsed_int = maybe_parse_int(val) - if parsed_int is None: - return default - return parsed_int - if env_var in DATETIME_ENV_VARS: - return maybe_parse_datetime(val) - return val - - -def get_cpu_usage_percent() -> float: - return psutil.cpu_percent() - - -def get_memory_usage_bytes() -> int: - current_process = psutil.Process(os.getpid()) - mem = int(current_process.memory_info().rss or 0) - for child in current_process.children(recursive=True): - with contextlib.suppress(psutil.NoSuchProcess): - mem += int(child.memory_info().rss or 0) - return mem - - -def maybe_parse_bool(val: str | None) -> bool: - return val in {'true', 'True', '1'} - - -def maybe_parse_datetime(val: str) -> datetime | str: - try: - return datetime.strptime(val, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc) - except ValueError: - return val - - -def maybe_parse_float(val: str) -> float | None: - try: - return float(val) - except ValueError: - return None - - -def maybe_parse_int(val: str) -> int | None: - try: - return int(val) - except ValueError: - return None - - -async def run_func_at_interval_async(func: Callable, interval_secs: float) -> None: - started_at = time.perf_counter() - sleep_until = started_at - while True: - now = time.perf_counter() - sleep_until += interval_secs - while sleep_until < now: - sleep_until += interval_secs - - sleep_for_secs = sleep_until - now - await asyncio.sleep(sleep_for_secs) - - res = func() - if inspect.isawaitable(res): - await res - - -async def force_remove(filename: str) -> None: - """JS-like rm(filename, { force: true }).""" - with contextlib.suppress(FileNotFoundError): - await remove(filename) - - -def raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn: # noqa: A002 - client_type = maybe_extract_enum_member_value(client_type) - raise ValueError(f'{client_type} with id "{id}" does not exist.') - - -def raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: - client_type = maybe_extract_enum_member_value(client_type) - raise ValueError(f'{client_type} with {key_name} "{value}" already exists.') - - -def guess_file_extension(content_type: str) -> str | None: - """Guess the file extension based on content type.""" - # e.g. mimetypes.guess_extension('application/json ') does not work... 
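Among the removed helpers is `run_func_at_interval_async`, which the old SDK used to emit `PERSIST_STATE` and `SYSTEM_INFO` events on a fixed cadence; that responsibility now lives in crawlee's event managers. A dependency-free sketch of the same drift-compensating interval loop, with an invented callback:

```python
from __future__ import annotations

import asyncio
import contextlib
import time
from typing import Callable


async def run_at_interval(func: Callable, interval_secs: float) -> None:
    """Call `func` roughly every `interval_secs`, skipping missed ticks instead of bursting."""
    sleep_until = time.perf_counter()
    while True:
        now = time.perf_counter()
        sleep_until += interval_secs
        while sleep_until < now:  # we fell behind, so catch up without firing a burst of calls
            sleep_until += interval_secs
        await asyncio.sleep(sleep_until - now)
        result = func()
        if asyncio.iscoroutine(result):
            await result


async def main() -> None:
    def emit_persist_state() -> None:
        print('PERSIST_STATE tick')

    task = asyncio.create_task(run_at_interval(emit_persist_state, interval_secs=0.2))
    await asyncio.sleep(1)  # prints a handful of ticks
    task.cancel()
    with contextlib.suppress(asyncio.CancelledError):
        await task


asyncio.run(main())
```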
- actual_content_type = content_type.split(';')[0].strip() - - # mimetypes.guess_extension returns 'xsl' in this case, because 'application/xxx' is "structured" - # ('text/xml' would be "unstructured" and return 'xml') - # we have to explicitly override it here - if actual_content_type == 'application/xml': - return 'xml' - - # Guess the extension from the mime type - ext = mimetypes.guess_extension(actual_content_type) - - # Remove the leading dot if extension successfully parsed - return ext[1:] if ext is not None else ext - - -def maybe_parse_body(body: bytes, content_type: str) -> Any: - if is_content_type_json(content_type): - return json.loads(body.decode('utf-8')) # Returns any - if is_content_type_xml(content_type) or is_content_type_text(content_type): - return body.decode('utf-8') - return body - - -def unique_key_to_request_id(unique_key: str) -> str: - """Generate request ID based on unique key in a deterministic way.""" - request_id = re.sub(r'(\+|\/|=)', '', b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) - return request_id[:REQUEST_ID_LENGTH] if len(request_id) > REQUEST_ID_LENGTH else request_id - - -async def force_rename(src_dir: str, dst_dir: str) -> None: - """Rename a directory. Checks for existence of soruce directory and removes destination directory if it exists.""" - # Make sure source directory exists - if await ospath.exists(src_dir): - # Remove destination directory if it exists - if await ospath.exists(dst_dir): - await aioshutil.rmtree(dst_dir, ignore_errors=True) - await rename(src_dir, dst_dir) - - -ImplementationType = TypeVar('ImplementationType', bound=Callable) -MetadataType = TypeVar('MetadataType', bound=Callable) - - -def wrap_internal(implementation: ImplementationType, metadata_source: MetadataType) -> MetadataType: - @functools.wraps(metadata_source) - def wrapper(*args: Any, **kwargs: Any) -> Any: - return implementation(*args, **kwargs) - - return cast(MetadataType, wrapper) - - -@ignore_docs -class LRUCache(MutableMapping, Generic[T]): - """Attempt to reimplement LRUCache from `@apify/datastructures` using `OrderedDict`.""" - - _cache: OrderedDictType[str, T] - - _max_length: int - - def __init__(self: LRUCache, max_length: int) -> None: - """Create a LRUCache with a specific max_length.""" - self._cache = OrderedDict() - self._max_length = max_length - - def __getitem__(self: LRUCache, key: str) -> T: - """Get an item from the cache. Move it to the end if present.""" - val = self._cache[key] - # No 'key in cache' condition since the previous line would raise KeyError - self._cache.move_to_end(key) - return cast(T, val) - - # Sadly TS impl returns bool indicating whether the key was already present or not - def __setitem__(self: LRUCache, key: str, value: T) -> None: - """Add an item to the cache. 
Remove least used item if max_length exceeded.""" - self._cache[key] = value - if len(self._cache) > self._max_length: - self._cache.popitem(last=False) - - def __delitem__(self: LRUCache, key: str) -> None: - """Remove an item from the cache.""" - del self._cache[key] - - def __iter__(self: LRUCache) -> Iterator[str]: - """Iterate over the keys of the cache in order of insertion.""" - return self._cache.__iter__() - - def __len__(self: LRUCache) -> int: - """Get the number of items in the cache.""" - return len(self._cache) - - def values(self: LRUCache) -> ValuesView[T]: # Needed so we don't mutate the cache by __getitem__ - """Iterate over the values in the cache in order of insertion.""" - return self._cache.values() - - def items(self: LRUCache) -> ItemsView[str, T]: # Needed so we don't mutate the cache by __getitem__ - """Iterate over the pairs of (key, value) in the cache in order of insertion.""" - return self._cache.items() - - def is_running_in_ipython() -> bool: return getattr(builtins, '__IPYTHON__', False) - - -@overload -def budget_ow(value: str | float | bool, predicate: tuple[type, bool], value_name: str) -> None: - ... - - -@overload -def budget_ow(value: dict, predicate: dict[str, tuple[type, bool]]) -> None: - ... - - -def budget_ow( - value: dict | str | float | bool, - predicate: dict[str, tuple[type, bool]] | tuple[type, bool], - value_name: str | None = None, -) -> None: - """Budget version of ow.""" - - def validate_single(field_value: Any, expected_type: type, required: bool, name: str) -> None: # noqa: FBT001 - if field_value is None and required: - raise ValueError(f'"{name}" is required!') - if (field_value is not None or required) and not isinstance(field_value, expected_type): - raise ValueError(f'"{name}" must be of type "{expected_type.__name__}" but it is "{type(field_value).__name__}"!') - - # Validate object - if isinstance(value, dict) and isinstance(predicate, dict): - for key, (field_type, required) in predicate.items(): - field_value = value.get(key) - validate_single(field_value, field_type, required, key) - # Validate "primitive" - elif isinstance(value, (int, str, float, bool)) and isinstance(predicate, tuple) and value_name is not None: - field_type, required = predicate - validate_single(value, field_type, required, value_name) - else: - raise ValueError('Wrong input!') - - -PARSE_DATE_FIELDS_MAX_DEPTH = 3 -PARSE_DATE_FIELDS_KEY_SUFFIX = 'At' -ListOrDictOrAny = TypeVar('ListOrDictOrAny', list, dict, Any) - - -def compute_short_hash(data: bytes, *, length: int = 8) -> str: - """Computes a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it. - - Args: - data: The binary data to be hashed. - length: The length of the hash to be returned. - - Returns: - A substring (prefix) of the hexadecimal hash of the data. - """ - hash_object = sha256(data) - return hash_object.hexdigest()[:length] - - -def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str: - """Normalizes a URL. - - This function cleans and standardizes a URL by removing leading and trailing whitespaces, - converting the scheme and netloc to lower case, stripping unwanted tracking parameters - (specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically, - and optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally - identical but differ in trivial ways (such as parameter order or casing) are treated as the same. - - Args: - url: The URL to be normalized. 
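The removed `LRUCache` above is essentially a thin wrapper over `OrderedDict`: reads move the key to the end, and writes evict the oldest entry once `max_length` is exceeded. A compact standalone sketch of the same idea (class and method names are illustrative, not the SDK type):

```python
from __future__ import annotations

from collections import OrderedDict


class SimpleLRUCache:
    """Minimal LRU cache in the spirit of the removed helper (illustrative only)."""

    def __init__(self, max_length: int) -> None:
        self._cache: OrderedDict[str, object] = OrderedDict()
        self._max_length = max_length

    def get(self, key: str) -> object | None:
        if key not in self._cache:
            return None
        self._cache.move_to_end(key)  # mark as most recently used
        return self._cache[key]

    def set(self, key: str, value: object) -> None:
        self._cache[key] = value
        if len(self._cache) > self._max_length:
            self._cache.popitem(last=False)  # evict the least recently used entry


cache = SimpleLRUCache(max_length=2)
cache.set('a', 1)
cache.set('b', 2)
cache.get('a')             # 'a' becomes most recently used
cache.set('c', 3)          # evicts 'b'
print(list(cache._cache))  # ['a', 'c']
```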
- keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained. - - Returns: - A string containing the normalized URL. - """ - # Parse the URL - parsed_url = urlparse(url.strip()) - search_params = dict(parse_qsl(parsed_url.query)) # Convert query to a dict - - # Remove any 'utm_' parameters - search_params = {k: v for k, v in search_params.items() if not k.startswith('utm_')} - - # Construct the new query string - sorted_keys = sorted(search_params.keys()) - sorted_query = urlencode([(k, search_params[k]) for k in sorted_keys]) - - # Construct the final URL - new_url = ( - parsed_url._replace( - query=sorted_query, - scheme=parsed_url.scheme, - netloc=parsed_url.netloc, - path=parsed_url.path.rstrip('/'), - ) - .geturl() - .lower() - ) - - # Retain the URL fragment if required - if not keep_url_fragment: - new_url = new_url.split('#')[0] - - return new_url - - -def compute_unique_key( - url: str, - method: str = 'GET', - payload: bytes | None = None, - *, - keep_url_fragment: bool = False, - use_extended_unique_key: bool = False, -) -> str: - """Computes a unique key for caching & deduplication of requests. - - This function computes a unique key by normalizing the provided URL and method. - If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and - included in the key. Otherwise, the unique key is just the normalized URL. - - Args: - url: The request URL. - method: The HTTP method, defaults to 'GET'. - payload: The request payload, defaults to None. - keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False. - use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False. - - Returns: - A string representing the unique key for the request. - """ - # Normalize the URL and method. - try: - normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment) - except Exception as exc: - logger.warning(f'Failed to normalize URL: {exc}') - normalized_url = url - - normalized_method = method.upper() - - # Compute and return the extended unique key if required. - if use_extended_unique_key: - payload_hash = compute_short_hash(payload) if payload else '' - return f'{normalized_method}({payload_hash}):{normalized_url}' - - # Log information if there is a non-GET request with a payload. - if normalized_method != 'GET' and payload: - logger.info( - f'We have encountered a {normalized_method} Request with a payload. This is fine. Just letting you know ' - 'that if your requests point to the same URL and differ only in method and payload, you should consider ' - 'using the "use_extended_unique_key" option.' - ) - - # Return the normalized URL as the unique key. 
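To make the behaviour of `normalize_url` and `compute_unique_key` concrete, here is a simplified worked example; the `normalize` helper below is a trimmed re-implementation for illustration, not the SDK function, and the extended-key format shown assumes the `METHOD(payload-hash):normalized-url` shape described above:

```python
from hashlib import sha256
from urllib.parse import parse_qsl, urlencode, urlparse


def normalize(url: str) -> str:
    """Simplified normalization: lowercase, drop utm_* params, sort the rest,
    strip the trailing slash and the fragment."""
    parsed = urlparse(url.strip())
    params = sorted((k, v) for k, v in parse_qsl(parsed.query) if not k.startswith('utm_'))
    return parsed._replace(query=urlencode(params), path=parsed.path.rstrip('/'), fragment='').geturl().lower()


url = 'https://Example.com/Path/?utm_source=news&b=2&a=1#section'
payload = b'{"query": "foo"}'
payload_hash = sha256(payload).hexdigest()[:8]  # short hash prefix, as in compute_short_hash

print(normalize(url))                            # https://example.com/path?a=1&b=2
print(f'POST({payload_hash}):{normalize(url)}')  # extended unique key for a POST with a payload
```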
- return normalized_url diff --git a/src/apify/actor.py b/src/apify/actor.py deleted file mode 100644 index 2c0b2239..00000000 --- a/src/apify/actor.py +++ /dev/null @@ -1,1357 +0,0 @@ -from __future__ import annotations - -import asyncio -import contextlib -import inspect -import os -import sys -from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING, Any, Awaitable, Callable, TypeVar, cast - -from apify_client import ApifyClientAsync -from apify_shared.consts import ActorEnvVars, ActorEventTypes, ActorExitCodes, ApifyEnvVars, WebhookEventType -from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value - -from apify._crypto import decrypt_input_secrets, load_private_key -from apify._utils import ( - dualproperty, - fetch_and_parse_env_var, - get_cpu_usage_percent, - get_memory_usage_bytes, - get_system_info, - is_running_in_ipython, - run_func_at_interval_async, - wrap_internal, -) -from apify.config import Configuration -from apify.consts import EVENT_LISTENERS_TIMEOUT_SECS -from apify.event_manager import EventManager -from apify.log import logger -from apify.proxy_configuration import ProxyConfiguration -from apify.storages import Dataset, KeyValueStore, RequestQueue, StorageClientManager - -if TYPE_CHECKING: - import logging - from types import TracebackType - - from apify._memory_storage import MemoryStorageClient - -T = TypeVar('T') -MainReturnType = TypeVar('MainReturnType') - -# This metaclass is needed so you can do `async with Actor: ...` instead of `async with Actor() as a: ...` -# and have automatic `Actor.init()` and `Actor.exit()` - - -class _ActorContextManager(type): - @staticmethod - async def __aenter__() -> type[Actor]: - await Actor.init() - return Actor - - @staticmethod - async def __aexit__( - _exc_type: type[BaseException] | None, - exc_value: BaseException | None, - _exc_traceback: TracebackType | None, - ) -> None: - if not Actor._get_default_instance()._is_exiting: - if exc_value: - await Actor.fail( - exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value, - exception=exc_value, - ) - else: - await Actor.exit() - - -class Actor(metaclass=_ActorContextManager): - """The main class of the SDK, through which all the actor operations should be done.""" - - _default_instance: Actor | None = None - _apify_client: ApifyClientAsync - _memory_storage_client: MemoryStorageClient - _config: Configuration - _event_manager: EventManager - _send_system_info_interval_task: asyncio.Task | None = None - _send_persist_state_interval_task: asyncio.Task | None = None - _is_exiting = False - _was_final_persist_state_emitted = False - - def __init__(self: Actor, config: Configuration | None = None) -> None: - """Create an Actor instance. - - Note that you don't have to do this, all the methods on this class function as classmethods too, - and that is their preferred usage. - - Args: - config (Configuration, optional): The actor configuration to be used. If not passed, a new Configuration instance will be created. - """ - # To have methods which work the same as classmethods and instance methods, - # so you can do both Actor.xxx() and Actor().xxx(), - # we need to have an `_xxx_internal` instance method which contains the actual implementation of the method, - # and then in the instance constructor overwrite the `xxx` classmethod with the `_xxx_internal` instance method, - # while copying the annotations, types and so on. 
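The `actor.py` module being deleted here relied on two tricks so that `Actor.foo()` and `Actor().foo()` behave the same: a metaclass that makes `async with Actor:` work on the class itself, and `wrap_internal`, which overwrites each classmethod on an instance with the bound implementation while keeping the original metadata. A minimal sketch of that pattern with a toy class (names are invented for illustration):

```python
from __future__ import annotations

import functools
from typing import Any, Callable


def wrap_internal(implementation: Callable, metadata_source: Callable) -> Callable:
    """Return `implementation` wearing the name and docstring of `metadata_source`."""
    @functools.wraps(metadata_source)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        return implementation(*args, **kwargs)
    return wrapper


class Toy:
    _default_instance: Toy | None = None

    def __init__(self, label: str) -> None:
        # Shadow the classmethod on this instance with the bound implementation.
        self.greet = wrap_internal(self._greet_internal, self.greet)
        self._label = label

    @classmethod
    def _get_default_instance(cls) -> Toy:
        if cls._default_instance is None:
            cls._default_instance = cls('default')
        return cls._default_instance

    @classmethod
    def greet(cls) -> str:
        """Greet from the default instance when called on the class."""
        return cls._get_default_instance().greet()

    def _greet_internal(self) -> str:
        return f'hello from {self._label}'


print(Toy.greet())            # 'hello from default'  (classmethod path)
print(Toy('custom').greet())  # 'hello from custom'   (instance path)
```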
- self.init = wrap_internal(self._init_internal, self.init) # type: ignore - self.exit = wrap_internal(self._exit_internal, self.exit) # type: ignore - self.fail = wrap_internal(self._fail_internal, self.fail) # type: ignore - self.main = wrap_internal(self._main_internal, self.main) # type: ignore - self.new_client = wrap_internal(self._new_client_internal, self.new_client) # type: ignore - - self.open_dataset = wrap_internal(self._open_dataset_internal, self.open_dataset) # type: ignore - self.open_key_value_store = wrap_internal(self._open_key_value_store_internal, self.open_key_value_store) # type: ignore - self.open_request_queue = wrap_internal(self._open_request_queue_internal, self.open_request_queue) # type: ignore - self.push_data = wrap_internal(self._push_data_internal, self.push_data) # type: ignore - self.get_input = wrap_internal(self._get_input_internal, self.get_input) # type: ignore - self.get_value = wrap_internal(self._get_value_internal, self.get_value) # type: ignore - self.set_value = wrap_internal(self._set_value_internal, self.set_value) # type: ignore - - self.on = wrap_internal(self._on_internal, self.on) # type: ignore - self.off = wrap_internal(self._off_internal, self.off) # type: ignore - - self.is_at_home = wrap_internal(self._is_at_home_internal, self.is_at_home) # type: ignore - self.get_env = wrap_internal(self._get_env_internal, self.get_env) # type: ignore - - self.start = wrap_internal(self._start_internal, self.start) # type: ignore - self.call = wrap_internal(self._call_internal, self.call) # type: ignore - self.call_task = wrap_internal(self._call_task_internal, self.call_task) # type: ignore - self.abort = wrap_internal(self._abort_internal, self.abort) # type: ignore - self.metamorph = wrap_internal(self._metamorph_internal, self.metamorph) # type: ignore - self.reboot = wrap_internal(self._reboot_internal, self.reboot) # type: ignore - self.add_webhook = wrap_internal(self._add_webhook_internal, self.add_webhook) # type: ignore - self.set_status_message = wrap_internal(self._set_status_message_internal, self.set_status_message) # type: ignore - self.create_proxy_configuration = wrap_internal(self._create_proxy_configuration_internal, self.create_proxy_configuration) # type: ignore - - self._config: Configuration = config or Configuration() - self._apify_client = self.new_client() - self._event_manager = EventManager(config=self._config) - - self._is_initialized = False - - @ignore_docs - async def __aenter__(self: Actor) -> Actor: - """Initialize the Actor. - - Automatically initializes the Actor instance when you use it in an `async with ...` statement. - - When you exit the `async with` block, the `Actor.exit()` method is called, - and if any exception happens while executing the block code, - the `Actor.fail` method is called. - """ - await self.init() - return self - - @ignore_docs - async def __aexit__( - self: Actor, - _exc_type: type[BaseException] | None, - exc_value: BaseException | None, - _exc_traceback: TracebackType | None, - ) -> None: - """Exit the Actor, handling any exceptions properly. - - When you exit the `async with` block, the `Actor.exit()` method is called, - and if any exception happens while executing the block code, - the `Actor.fail` method is called. 
- """ - if not self._is_exiting: - if exc_value: - await self.fail( - exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value, - exception=exc_value, - ) - else: - await self.exit() - - @classmethod - def _get_default_instance(cls: type[Actor]) -> Actor: - if not cls._default_instance: - cls._default_instance = cls(config=Configuration.get_global_configuration()) - - return cls._default_instance - - @dualproperty - def apify_client(self_or_cls: type[Actor] | Actor) -> ApifyClientAsync: # noqa: N805 - """The ApifyClientAsync instance the Actor instance uses.""" - if isinstance(self_or_cls, type): - return self_or_cls._get_default_instance()._apify_client - return self_or_cls._apify_client - - @dualproperty - def config(self_or_cls: type[Actor] | Actor) -> Configuration: # noqa: N805 - """The Configuration instance the Actor instance uses.""" - if isinstance(self_or_cls, type): - return self_or_cls._get_default_instance()._config - return self_or_cls._config - - @dualproperty - def event_manager(self_or_cls: type[Actor] | Actor) -> EventManager: # noqa: N805 - """The EventManager instance the Actor instance uses.""" - if isinstance(self_or_cls, type): - return self_or_cls._get_default_instance()._event_manager - - return self_or_cls._event_manager - - @dualproperty - def log(_self_or_cls: type[Actor] | Actor) -> logging.Logger: # noqa: N805 - """The logging.Logger instance the Actor uses.""" - return logger - - def _raise_if_not_initialized(self: Actor) -> None: - if not self._is_initialized: - raise RuntimeError('The actor was not initialized!') - - @classmethod - async def init(cls: type[Actor]) -> None: - """Initialize the actor instance. - - This initializes the Actor instance. - It configures the right storage client based on whether the actor is running locally or on the Apify platform, - it initializes the event manager for processing actor events, - and starts an interval for regularly sending `PERSIST_STATE` events, - so that the actor can regularly persist its state in response to these events. - - This method should be called immediately before performing any additional actor actions, - and it should be called only once. 
- """ - return await cls._get_default_instance().init() - - async def _init_internal(self: Actor) -> None: - if self._is_initialized: - raise RuntimeError('The actor was already initialized!') - - self._is_exiting = False - self._was_final_persist_state_emitted = False - - self.log.info('Initializing actor...') - self.log.info('System info', extra=get_system_info()) - - # TODO: Print outdated SDK version warning (we need a new env var for this) - # https://github.com/apify/apify-sdk-python/issues/146 - - StorageClientManager.set_config(self._config) - if self._config.token: - StorageClientManager.set_cloud_client(self._apify_client) - - await self._event_manager.init() - - self._send_persist_state_interval_task = asyncio.create_task( - run_func_at_interval_async( - lambda: self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': False}), - self._config.persist_state_interval_millis / 1000, - ), - ) - - if not self.is_at_home(): - self._send_system_info_interval_task = asyncio.create_task( - run_func_at_interval_async( - lambda: self._event_manager.emit(ActorEventTypes.SYSTEM_INFO, self.get_system_info()), - self._config.system_info_interval_millis / 1000, - ), - ) - - self._event_manager.on(ActorEventTypes.MIGRATING, self._respond_to_migrating_event) - - # The CPU usage is calculated as an average between two last calls to psutil - # We need to make a first, dummy call, so the next calls have something to compare itself agains - get_cpu_usage_percent() - - self._is_initialized = True - - def get_system_info(self: Actor) -> dict: - """Get the current system info.""" - cpu_usage_percent = get_cpu_usage_percent() - memory_usage_bytes = get_memory_usage_bytes() - # This is in camel case to be compatible with the events from the platform - result = { - 'createdAt': datetime.now(timezone.utc), - 'cpuCurrentUsage': cpu_usage_percent, - 'memCurrentBytes': memory_usage_bytes, - } - if self._config.max_used_cpu_ratio: - result['isCpuOverloaded'] = cpu_usage_percent > 100 * self._config.max_used_cpu_ratio - - return result - - async def _respond_to_migrating_event(self: Actor, _event_data: Any) -> None: - # Don't emit any more regular persist state events - if self._send_persist_state_interval_task and not self._send_persist_state_interval_task.cancelled(): - self._send_persist_state_interval_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await self._send_persist_state_interval_task - - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': True}) - self._was_final_persist_state_emitted = True - - async def _cancel_event_emitting_intervals(self: Actor) -> None: - if self._send_persist_state_interval_task and not self._send_persist_state_interval_task.cancelled(): - self._send_persist_state_interval_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await self._send_persist_state_interval_task - - if self._send_system_info_interval_task and not self._send_system_info_interval_task.cancelled(): - self._send_system_info_interval_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await self._send_system_info_interval_task - - @classmethod - async def exit( - cls: type[Actor], - *, - exit_code: int = 0, - event_listeners_timeout_secs: float | None = EVENT_LISTENERS_TIMEOUT_SECS, - status_message: str | None = None, - cleanup_timeout: timedelta = timedelta(seconds=30), - ) -> None: - """Exit the actor instance. - - This stops the Actor instance. 
- It cancels all the intervals for regularly sending `PERSIST_STATE` events, - sends a final `PERSIST_STATE` event, - waits for all the event listeners to finish, - and stops the event manager. - - Args: - exit_code (int, optional): The exit code with which the actor should fail (defaults to `0`). - event_listeners_timeout_secs (float, optional): How long should the actor wait for actor event listeners to finish before exiting. - status_message (str, optional): The final status message that the actor should display. - cleanup_timeout (timedelta, optional): How long we should wait for event listeners. - """ - return await cls._get_default_instance().exit( - exit_code=exit_code, - event_listeners_timeout_secs=event_listeners_timeout_secs, - status_message=status_message, - cleanup_timeout=cleanup_timeout, - ) - - async def _exit_internal( - self: Actor, - *, - exit_code: int = 0, - event_listeners_timeout_secs: float | None = EVENT_LISTENERS_TIMEOUT_SECS, - status_message: str | None = None, - cleanup_timeout: timedelta = timedelta(seconds=30), - ) -> None: - self._raise_if_not_initialized() - - self._is_exiting = True - - exit_code = maybe_extract_enum_member_value(exit_code) - - self.log.info('Exiting actor', extra={'exit_code': exit_code}) - - async def finalize() -> None: - await self._cancel_event_emitting_intervals() - - # Send final persist state event - if not self._was_final_persist_state_emitted: - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': False}) - self._was_final_persist_state_emitted = True - - if status_message is not None: - await self.set_status_message(status_message, is_terminal=True) - - # Sleep for a bit so that the listeners have a chance to trigger - await asyncio.sleep(0.1) - - await self._event_manager.close(event_listeners_timeout_secs=event_listeners_timeout_secs) - - await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) - self._is_initialized = False - - if is_running_in_ipython(): - self.log.debug(f'Not calling sys.exit({exit_code}) because actor is running in IPython') - elif os.getenv('PYTEST_CURRENT_TEST', default=False): # noqa: PLW1508 - self.log.debug(f'Not calling sys.exit({exit_code}) because actor is running in an unit test') - elif hasattr(asyncio, '_nest_patched'): - self.log.debug(f'Not calling sys.exit({exit_code}) because actor is running in a nested event loop') - else: - sys.exit(exit_code) - - @classmethod - async def fail( - cls: type[Actor], - *, - exit_code: int = 1, - exception: BaseException | None = None, - status_message: str | None = None, - ) -> None: - """Fail the actor instance. - - This performs all the same steps as Actor.exit(), - but it additionally sets the exit code to `1` (by default). - - Args: - exit_code (int, optional): The exit code with which the actor should fail (defaults to `1`). - exception (BaseException, optional): The exception with which the actor failed. - status_message (str, optional): The final status message that the actor should display. 
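For illustration, a hedged sketch of how the `exit()` and `fail()` parameters documented above might be used inside an initialized Actor; the condition, status messages, and the decision to return early are placeholders, not behavior mandated by the SDK.

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        work_is_possible = False  # placeholder condition for this sketch
        if not work_is_possible:
            # Finish successfully, but with a custom terminal status message.
            await Actor.exit(status_message='Nothing to process')
            return
        try:
            ...  # the real processing would happen here
        except Exception as exc:
            # Mark the run as failed (exit code 1 by default) and log the exception.
            await Actor.fail(exception=exc, status_message='Processing failed')
```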
- """ - return await cls._get_default_instance().fail( - exit_code=exit_code, - exception=exception, - status_message=status_message, - ) - - async def _fail_internal( - self: Actor, - *, - exit_code: int = 1, - exception: BaseException | None = None, - status_message: str | None = None, - ) -> None: - self._raise_if_not_initialized() - - # In IPython, we don't run `sys.exit()` during actor exits, - # so the exception traceback will be printed on its own - if exception and not is_running_in_ipython(): - self.log.exception('Actor failed with an exception', exc_info=exception) - - await self.exit(exit_code=exit_code, status_message=status_message) - - @classmethod - async def main(cls: type[Actor], main_actor_function: Callable[[], MainReturnType]) -> MainReturnType | None: - """Initialize the actor, run the passed function and finish the actor cleanly. - - **The `Actor.main()` function is optional** and is provided merely for your convenience. - It is mainly useful when you're running your code as an actor on the [Apify platform](https://apify.com/actors). - - The `Actor.main()` function performs the following actions: - - - When running on the Apify platform (i.e. `APIFY_IS_AT_HOME` environment variable is set), - it sets up a connection to listen for platform events. - For example, to get a notification about an imminent migration to another server. - - It invokes the user function passed as the `main_actor_function` parameter. - - If the user function was an async function, it awaits it. - - If the user function throws an exception or some other error is encountered, - it prints error details to console so that they are stored to the log, - and finishes the actor cleanly. - - Finally, it exits the Python process, with zero exit code on success and non-zero on errors. - - Args: - main_actor_function (Callable): The user function which should be run in the actor - """ - return await cls._get_default_instance().main( - main_actor_function=main_actor_function, - ) - - async def _main_internal(self: Actor, main_actor_function: Callable[[], MainReturnType]) -> MainReturnType | None: - if not inspect.isfunction(main_actor_function): - raise TypeError(f'First argument passed to Actor.main() must be a function, but instead it was {type(main_actor_function)}') - - await self.init() - try: - if inspect.iscoroutinefunction(main_actor_function): - res = await main_actor_function() - else: - res = main_actor_function() - await self.exit() - return cast(MainReturnType, res) - except Exception as exc: - await self.fail( - exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value, - exception=exc, - ) - return None - - @classmethod - def new_client( - cls: type[Actor], - *, - token: str | None = None, - api_url: str | None = None, - max_retries: int | None = None, - min_delay_between_retries_millis: int | None = None, - timeout_secs: int | None = None, - ) -> ApifyClientAsync: - """Return a new instance of the Apify API client. - - The `ApifyClientAsync` class is provided by the [apify-client](https://github.com/apify/apify-client-python) package, - and it is automatically configured using the `APIFY_API_BASE_URL` and `APIFY_TOKEN` environment variables. - - You can override the token via the available options. - That's useful if you want to use the client as a different Apify user than the SDK internals are using. - - Args: - token (str, optional): The Apify API token - api_url (str, optional): The URL of the Apify API server to which to connect to. 
Defaults to https://api.apify.com - max_retries (int, optional): How many times to retry a failed request at most - min_delay_between_retries_millis (int, optional): How long will the client wait between retrying requests - (increases exponentially from this value) - timeout_secs (int, optional): The socket timeout of the HTTP requests sent to the Apify API - """ - return cls._get_default_instance().new_client( - token=token, - api_url=api_url, - max_retries=max_retries, - min_delay_between_retries_millis=min_delay_between_retries_millis, - timeout_secs=timeout_secs, - ) - - def _new_client_internal( - self: Actor, - *, - token: str | None = None, - api_url: str | None = None, - max_retries: int | None = None, - min_delay_between_retries_millis: int | None = None, - timeout_secs: int | None = None, - ) -> ApifyClientAsync: - token = token or self._config.token - api_url = api_url or self._config.api_base_url - return ApifyClientAsync( - token=token, - api_url=api_url, - max_retries=max_retries, - min_delay_between_retries_millis=min_delay_between_retries_millis, - timeout_secs=timeout_secs, - ) - - def _get_storage_client(self: Actor, force_cloud: bool) -> ApifyClientAsync | None: # noqa: FBT001 - return self._apify_client if force_cloud else None - - @classmethod - async def open_dataset( - cls: type[Actor], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> Dataset: - """Open a dataset. - - Datasets are used to store structured data where each object stored has the same attributes, - such as online store products or real estate offers. - The actual data is stored either on the local filesystem or in the Apify cloud. - - Args: - id (str, optional): ID of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. - name (str, optional): Name of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. - force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. - This way it is possible to combine local and cloud storage. - - Returns: - Dataset: An instance of the `Dataset` class for the given ID or name. - - """ - return await cls._get_default_instance().open_dataset(id=id, name=name, force_cloud=force_cloud) - - async def _open_dataset_internal( - self: Actor, - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> Dataset: - self._raise_if_not_initialized() - - return await Dataset.open(id=id, name=name, force_cloud=force_cloud, config=self._config) - - @classmethod - async def open_key_value_store( - cls: type[Actor], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> KeyValueStore: - """Open a key-value store. - - Key-value stores are used to store records or files, along with their MIME content type. - The records are stored and retrieved using a unique key. - The actual data is stored either on a local filesystem or in the Apify cloud. - - Args: - id (str, optional): ID of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. - name (str, optional): Name of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. 
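A small sketch of the token override that the `new_client()` docstring above describes, useful when acting as a different Apify user than the one the SDK is configured with. The token value is a placeholder, and the `user('me')` lookup is shown only as one plausible use of the returned `ApifyClientAsync`.

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # A client authenticated as a different Apify user than the Actor itself.
        other_client = Actor.new_client(token='<other-user-api-token>', max_retries=3)
        user_info = await other_client.user('me').get()
        Actor.log.info('Acting on behalf of another account', extra={'user': user_info})
```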
- force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. - This way it is possible to combine local and cloud storage. - - Returns: - KeyValueStore: An instance of the `KeyValueStore` class for the given ID or name. - """ - return await cls._get_default_instance().open_key_value_store(id=id, name=name, force_cloud=force_cloud) - - async def _open_key_value_store_internal( - self: Actor, - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> KeyValueStore: - self._raise_if_not_initialized() - - return await KeyValueStore.open(id=id, name=name, force_cloud=force_cloud, config=self._config) - - @classmethod - async def open_request_queue( - cls: type[Actor], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> RequestQueue: - """Open a request queue. - - Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in the Apify cloud. - The queue is used for deep crawling of websites, where you start with several URLs and then - recursively follow links to other pages. The data structure supports both breadth-first - and depth-first crawling orders. - - Args: - id (str, optional): ID of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. - name (str, optional): Name of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. - force_cloud (bool, optional): If set to `True` then the Apify cloud storage is always used. - This way it is possible to combine local and cloud storage. - - Returns: - RequestQueue: An instance of the `RequestQueue` class for the given ID or name. - """ - return await cls._get_default_instance().open_request_queue(id=id, name=name, force_cloud=force_cloud) - - async def _open_request_queue_internal( - self: Actor, - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - ) -> RequestQueue: - self._raise_if_not_initialized() - - return await RequestQueue.open(id=id, name=name, force_cloud=force_cloud, config=self._config) - - @classmethod - async def push_data(cls: type[Actor], data: Any) -> None: - """Store an object or a list of objects to the default dataset of the current actor run. - - Args: - data (object or list of objects, optional): The data to push to the default dataset. 
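A usage sketch for the storage helpers documented above (`open_dataset`, `open_key_value_store`, `open_request_queue`, `push_data`); the storage name, record key, and URLs are illustrative placeholders.

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Push into the default dataset of this Actor run.
        await Actor.push_data([{'url': 'https://example.com', 'title': 'Example Domain'}])

        # Named storages can also be opened explicitly.
        dataset = await Actor.open_dataset(name='my-results')
        await dataset.push_data({'status': 'done'})

        store = await Actor.open_key_value_store()
        await store.set_value('OUTPUT', {'item_count': 1})

        queue = await Actor.open_request_queue()
        await queue.add_request({'url': 'https://example.com'})
```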
- """ - return await cls._get_default_instance().push_data(data=data) - - async def _push_data_internal(self: Actor, data: Any) -> None: - self._raise_if_not_initialized() - - if not data: - return - - dataset = await self.open_dataset() - await dataset.push_data(data) - - @classmethod - async def get_input(cls: type[Actor]) -> Any: - """Get the actor input value from the default key-value store associated with the current actor run.""" - return await cls._get_default_instance().get_input() - - async def _get_input_internal(self: Actor) -> Any: - self._raise_if_not_initialized() - - input_value = await self.get_value(self._config.input_key) - input_secrets_private_key = self._config.input_secrets_private_key_file - input_secrets_key_passphrase = self._config.input_secrets_private_key_passphrase - if input_secrets_private_key and input_secrets_key_passphrase: - private_key = load_private_key( - input_secrets_private_key, - input_secrets_key_passphrase, - ) - input_value = decrypt_input_secrets(private_key, input_value) - - return input_value - - @classmethod - async def get_value(cls: type[Actor], key: str, default_value: Any = None) -> Any: - """Get a value from the default key-value store associated with the current actor run. - - Args: - key (str): The key of the record which to retrieve. - default_value (Any, optional): Default value returned in case the record does not exist. - """ - return await cls._get_default_instance().get_value(key=key, default_value=default_value) - - async def _get_value_internal(self: Actor, key: str, default_value: Any = None) -> Any: - self._raise_if_not_initialized() - - key_value_store = await self.open_key_value_store() - return await key_value_store.get_value(key, default_value) - - @classmethod - async def set_value( - cls: type[Actor], - key: str, - value: Any, - *, - content_type: str | None = None, - ) -> None: - """Set or delete a value in the default key-value store associated with the current actor run. - - Args: - key (str): The key of the record which to set. - value (any): The value of the record which to set, or None, if the record should be deleted. - content_type (str, optional): The content type which should be set to the value. - """ - return await cls._get_default_instance().set_value( - key=key, - value=value, - content_type=content_type, - ) - - async def _set_value_internal( - self: Actor, - key: str, - value: Any, - *, - content_type: str | None = None, - ) -> None: - self._raise_if_not_initialized() - - key_value_store = await self.open_key_value_store() - return await key_value_store.set_value(key, value, content_type=content_type) - - @classmethod - def on(cls: type[Actor], event_name: ActorEventTypes, listener: Callable) -> Callable: - """Add an event listener to the actor's event manager. - - The following events can be emitted: - - `ActorEventTypes.SYSTEM_INFO`: - Emitted every minute, the event data contains info about the resource usage of the actor. - - `ActorEventTypes.MIGRATING`: - Emitted when the actor running on the Apify platform is going to be migrated to another worker server soon. - You can use it to persist the state of the actor and gracefully stop your in-progress tasks, - so that they are not interrupted by the migration.. - - `ActorEventTypes.PERSIST_STATE`: - Emitted in regular intervals (by default 60 seconds) to notify the actor that it should persist its state, - in order to avoid repeating all work when the actor restarts. 
- This event is automatically emitted together with the migrating event, - in which case the `isMigrating` flag in the event data is set to True, otherwise the flag is False. - Note that this event is provided merely for your convenience, - you can achieve the same effect using an interval and listening for the migrating event. - - `ActorEventTypes.ABORTING`: - When a user aborts an actor run on the Apify platform, - they can choose to abort it gracefully, to allow the actor some time before getting terminated. - This graceful abort emits the aborting event, which you can use to clean up the actor state. - - Args: - event_name (ActorEventTypes): The actor event for which to listen to. - listener (Callable): The function which is to be called when the event is emitted (can be async). - """ - return cls._get_default_instance().on(event_name, listener) - - def _on_internal(self: Actor, event_name: ActorEventTypes, listener: Callable) -> Callable: - self._raise_if_not_initialized() - - return self._event_manager.on(event_name, listener) - - @classmethod - def off(cls: type[Actor], event_name: ActorEventTypes, listener: Callable | None = None) -> None: - """Remove a listener, or all listeners, from an actor event. - - Args: - event_name (ActorEventTypes): The actor event for which to remove listeners. - listener (Callable, optional): The listener which is supposed to be removed. If not passed, all listeners of this event are removed. - """ - return cls._get_default_instance().off(event_name, listener) - - def _off_internal(self: Actor, event_name: ActorEventTypes, listener: Callable | None = None) -> None: - self._raise_if_not_initialized() - - return self._event_manager.off(event_name, listener) - - @classmethod - def is_at_home(cls: type[Actor]) -> bool: - """Return `True` when the actor is running on the Apify platform, and `False` otherwise (for example when running locally).""" - return cls._get_default_instance().is_at_home() - - def _is_at_home_internal(self: Actor) -> bool: - return self._config.is_at_home - - @classmethod - def get_env(cls: type[Actor]) -> dict: - """Return a dictionary with information parsed from all the `APIFY_XXX` environment variables. - - For a list of all the environment variables, - see the [Actor documentation](https://docs.apify.com/actors/development/environment-variables). - If some variables are not defined or are invalid, the corresponding value in the resulting dictionary will be None. - """ - return cls._get_default_instance().get_env() - - def _get_env_internal(self: Actor) -> dict: - self._raise_if_not_initialized() - - return {env_var.name.lower(): fetch_and_parse_env_var(env_var) for env_var in [*ActorEnvVars, *ApifyEnvVars]} - - @classmethod - async def start( - cls: type[Actor], - actor_id: str, - run_input: Any = None, - *, - token: str | None = None, - content_type: str | None = None, - build: str | None = None, - memory_mbytes: int | None = None, - timeout_secs: int | None = None, - wait_for_finish: int | None = None, - webhooks: list[dict] | None = None, - ) -> dict: - """Run an actor on the Apify platform. - - Unlike `Actor.call`, this method just starts the run without waiting for finish. - - Args: - actor_id (str): The ID of the actor to be run. - run_input (Any, optional): The input to pass to the actor run. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - content_type (str, optional): The content type of the input. 
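Returning to the event-listener API documented a little above (`Actor.on()` / `Actor.off()`), here is a brief sketch of registering and later removing a handler. It uses the `ActorEventTypes.ABORTING` member that the docstring lists; the handler body and its timing are placeholders.

```python
from apify import Actor
from apify_shared.consts import ActorEventTypes


async def main() -> None:
    async with Actor:
        async def on_aborting(event_data: dict) -> None:
            Actor.log.warning('Graceful abort requested, cleaning up...', extra=event_data)

        # Register a listener for the graceful-abort event...
        Actor.on(ActorEventTypes.ABORTING, on_aborting)
        ...  # the Actor's main work would go here
        # ...and remove it once it is no longer needed.
        Actor.off(ActorEventTypes.ABORTING, on_aborting)
```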
- build (str, optional): Specifies the actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the actor (typically latest). - memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the actor. - timeout_secs (int, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the actor. - wait_for_finish (int, optional): The maximum number of seconds the server waits for the run to finish. - By default, it is 0, the maximum value is 300. - webhooks (list of dict, optional): Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) - associated with the actor run which can be used to receive a notification, - e.g. when the actor finished or failed. - If you already have a webhook set up for the actor or task, you do not have to add it again here. - Each webhook is represented by a dictionary containing these items: - * ``event_types``: list of ``WebhookEventType`` values which trigger the webhook - * ``request_url``: URL to which to send the webhook HTTP request - * ``payload_template`` (optional): Optional template for the request payload - - Returns: - dict: Info about the started actor run - """ - return await cls._get_default_instance().start( - actor_id=actor_id, - run_input=run_input, - token=token, - content_type=content_type, - build=build, - memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, - wait_for_finish=wait_for_finish, - webhooks=webhooks, - ) - - async def _start_internal( - self: Actor, - actor_id: str, - run_input: Any = None, - *, - token: str | None = None, - content_type: str | None = None, - build: str | None = None, - memory_mbytes: int | None = None, - timeout_secs: int | None = None, - wait_for_finish: int | None = None, - webhooks: list[dict] | None = None, - ) -> dict: - self._raise_if_not_initialized() - - client = self.new_client(token=token) if token else self._apify_client - - return await client.actor(actor_id).start( - run_input=run_input, - content_type=content_type, - build=build, - memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, - wait_for_finish=wait_for_finish, - webhooks=webhooks, - ) - - @classmethod - async def abort( - cls: type[Actor], - run_id: str, - *, - token: str | None = None, - gracefully: bool | None = None, - ) -> dict: - """Abort given actor run on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable). - - Args: - run_id (str): The ID of the actor run to be aborted. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - gracefully (bool, optional): If True, the actor run will abort gracefully. - It will send ``aborting`` and ``persistStates`` events into the run and force-stop the run after 30 seconds. - It is helpful in cases where you plan to resurrect the run later. 
- - Returns: - dict: Info about the aborted actor run - """ - return await cls._get_default_instance().abort( - run_id=run_id, - token=token, - gracefully=gracefully, - ) - - async def _abort_internal( - self: Actor, - run_id: str, - *, - token: str | None = None, - status_message: str | None = None, - gracefully: bool | None = None, - ) -> dict: - self._raise_if_not_initialized() - - client = self.new_client(token=token) if token else self._apify_client - - if status_message: - await client.run(run_id).update(status_message=status_message) - - return await client.run(run_id).abort(gracefully=gracefully) - - @classmethod - async def call( - cls: type[Actor], - actor_id: str, - run_input: Any = None, - *, - token: str | None = None, - content_type: str | None = None, - build: str | None = None, - memory_mbytes: int | None = None, - timeout_secs: int | None = None, - webhooks: list[dict] | None = None, - wait_secs: int | None = None, - ) -> dict | None: - """Start an actor on the Apify Platform and wait for it to finish before returning. - - It waits indefinitely, unless the wait_secs argument is provided. - - Args: - actor_id (str): The ID of the actor to be run. - run_input (Any, optional): The input to pass to the actor run. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - content_type (str, optional): The content type of the input. - build (str, optional): Specifies the actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the actor (typically latest). - memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the actor. - timeout_secs (int, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the actor. - webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the actor run, - which can be used to receive a notification, e.g. when the actor finished or failed. - If you already have a webhook set up for the actor, you do not have to add it again here. - wait_secs (int, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. 
- - Returns: - dict: Info about the started actor run - """ - return await cls._get_default_instance().call( - actor_id=actor_id, - token=token, - run_input=run_input, - content_type=content_type, - build=build, - memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, - webhooks=webhooks, - wait_secs=wait_secs, - ) - - async def _call_internal( - self: Actor, - actor_id: str, - run_input: Any = None, - *, - token: str | None = None, - content_type: str | None = None, - build: str | None = None, - memory_mbytes: int | None = None, - timeout_secs: int | None = None, - webhooks: list[dict] | None = None, - wait_secs: int | None = None, - ) -> dict | None: - self._raise_if_not_initialized() - - client = self.new_client(token=token) if token else self._apify_client - - return await client.actor(actor_id).call( - run_input=run_input, - content_type=content_type, - build=build, - memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, - webhooks=webhooks, - wait_secs=wait_secs, - ) - - @classmethod - async def call_task( - cls: type[Actor], - task_id: str, - task_input: dict | None = None, - *, - build: str | None = None, - memory_mbytes: int | None = None, - timeout_secs: int | None = None, - webhooks: list[dict] | None = None, - wait_secs: int | None = None, - token: str | None = None, - ) -> dict | None: - """Start an actor task on the Apify Platform and wait for it to finish before returning. - - It waits indefinitely, unless the wait_secs argument is provided. - - Note that an actor task is a saved input configuration and options for an actor. - If you want to run an actor directly rather than an actor task, please use the `Actor.call` - - Args: - task_id (str): The ID of the actor to be run. - task_input (Any, optional): Overrides the input to pass to the actor run. - token (str, optional): The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). - content_type (str, optional): The content type of the input. - build (str, optional): Specifies the actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the actor (typically latest). - memory_mbytes (int, optional): Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the actor. - timeout_secs (int, optional): Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the actor. - webhooks (list, optional): Optional webhooks (https://docs.apify.com/webhooks) associated with the actor run, - which can be used to receive a notification, e.g. when the actor finished or failed. - If you already have a webhook set up for the actor, you do not have to add it again here. - wait_secs (int, optional): The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. 
- - Returns: - dict: Info about the started actor run - """ - return await cls._get_default_instance().call_task( - task_id=task_id, - task_input=task_input, - token=token, - build=build, - memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, - webhooks=webhooks, - wait_secs=wait_secs, - ) - - async def _call_task_internal( - self: Actor, - task_id: str, - task_input: dict | None = None, - *, - build: str | None = None, - memory_mbytes: int | None = None, - timeout_secs: int | None = None, - webhooks: list[dict] | None = None, - wait_secs: int | None = None, - token: str | None = None, - ) -> dict | None: - self._raise_if_not_initialized() - - client = self.new_client(token=token) if token else self._apify_client - - return await client.task(task_id).call( - task_input=task_input, - build=build, - memory_mbytes=memory_mbytes, - timeout_secs=timeout_secs, - webhooks=webhooks, - wait_secs=wait_secs, - ) - - @classmethod - async def metamorph( - cls: type[Actor], - target_actor_id: str, - run_input: Any = None, - *, - target_actor_build: str | None = None, - content_type: str | None = None, - custom_after_sleep_millis: int | None = None, - ) -> None: - """Transform this actor run to an actor run of a different actor. - - The platform stops the current actor container and starts a new container with the new actor instead. - All the default storages are preserved, - and the new input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. - - Args: - target_actor_id (str): ID of the target actor that the run should be transformed into - run_input (Any, optional): The input to pass to the new run. - target_actor_build (str, optional): The build of the target actor. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the target actor (typically the latest build). - content_type (str, optional): The content type of the input. - custom_after_sleep_millis (int, optional): How long to sleep for after the metamorph, to wait for the container to be stopped. - - Returns: - dict: The actor run data. - """ - return await cls._get_default_instance().metamorph( - target_actor_id=target_actor_id, - target_actor_build=target_actor_build, - run_input=run_input, - content_type=content_type, - custom_after_sleep_millis=custom_after_sleep_millis, - ) - - async def _metamorph_internal( - self: Actor, - target_actor_id: str, - run_input: Any = None, - *, - target_actor_build: str | None = None, - content_type: str | None = None, - custom_after_sleep_millis: int | None = None, - ) -> None: - self._raise_if_not_initialized() - - if not self.is_at_home(): - self.log.error('Actor.metamorph() is only supported when running on the Apify platform.') - return - - if not custom_after_sleep_millis: - custom_after_sleep_millis = self._config.metamorph_after_sleep_millis - - # If is_at_home() is True, config.actor_run_id is always set - assert self._config.actor_run_id is not None # noqa: S101 - - await self._apify_client.run(self._config.actor_run_id).metamorph( - target_actor_id=target_actor_id, - run_input=run_input, - target_actor_build=target_actor_build, - content_type=content_type, - ) - - if custom_after_sleep_millis: - await asyncio.sleep(custom_after_sleep_millis / 1000) - - @classmethod - async def reboot( - cls: type[Actor], - *, - event_listeners_timeout_secs: int | None = EVENT_LISTENERS_TIMEOUT_SECS, - custom_after_sleep_millis: int | None = None, - ) -> None: - """Internally reboot this actor. 
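A compact sketch contrasting the three run-triggering methods documented above (`Actor.start()`, `Actor.call()`, `Actor.call_task()`). The Actor and task identifiers, inputs, and the 300-second wait are placeholders chosen for the example.

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Start another Actor and return immediately with info about the run.
        started_run = await Actor.start('some-user/some-actor', run_input={'query': 'books'})
        Actor.log.info('Started run', extra={'run_id': started_run.get('id')})

        # Start another Actor and wait (here at most 300 s) for it to finish.
        finished_run = await Actor.call('some-user/some-actor', run_input={'query': 'books'}, wait_secs=300)
        Actor.log.info('Run finished', extra={'status': (finished_run or {}).get('status')})

        # Run a saved Actor task (a stored input configuration) instead.
        task_run = await Actor.call_task('some-task-id', task_input={'query': 'films'})
        Actor.log.info('Task run finished', extra={'status': (task_run or {}).get('status')})
```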
- - The system stops the current container and starts a new one, with the same run ID and default storages. - - Args: - event_listeners_timeout_secs (int, optional): How long should the actor wait for actor event listeners to finish before exiting - custom_after_sleep_millis (int, optional): How long to sleep for after the reboot, to wait for the container to be stopped. - """ - return await cls._get_default_instance().reboot( - event_listeners_timeout_secs=event_listeners_timeout_secs, - custom_after_sleep_millis=custom_after_sleep_millis, - ) - - async def _reboot_internal( - self: Actor, - *, - event_listeners_timeout_secs: int | None = EVENT_LISTENERS_TIMEOUT_SECS, - custom_after_sleep_millis: int | None = None, - ) -> None: - self._raise_if_not_initialized() - - if not self.is_at_home(): - self.log.error('Actor.reboot() is only supported when running on the Apify platform.') - return - - if not custom_after_sleep_millis: - custom_after_sleep_millis = self._config.metamorph_after_sleep_millis - - await self._cancel_event_emitting_intervals() - - self._event_manager.emit(ActorEventTypes.PERSIST_STATE, {'isMigrating': True}) - self._was_final_persist_state_emitted = True - - await self._event_manager.close(event_listeners_timeout_secs=event_listeners_timeout_secs) - - assert self._config.actor_run_id is not None # noqa: S101 - await self._apify_client.run(self._config.actor_run_id).reboot() - - if custom_after_sleep_millis: - await asyncio.sleep(custom_after_sleep_millis / 1000) - - @classmethod - async def add_webhook( - cls: type[Actor], - *, - event_types: list[WebhookEventType], - request_url: str, - payload_template: str | None = None, - ignore_ssl_errors: bool | None = None, - do_not_retry: bool | None = None, - idempotency_key: str | None = None, - ) -> dict: - """Create an ad-hoc webhook for the current actor run. - - This webhook lets you receive a notification when the actor run finished or failed. - - Note that webhooks are only supported for actors running on the Apify platform. - When running the actor locally, the function will print a warning and have no effect. - - For more information about Apify actor webhooks, please see the [documentation](https://docs.apify.com/webhooks). - - Args: - event_types (list of WebhookEventType): List of event types that should trigger the webhook. At least one is required. - request_url (str): URL that will be invoked once the webhook is triggered. - payload_template (str, optional): Specification of the payload that will be sent to request_url - ignore_ssl_errors (bool, optional): Whether the webhook should ignore SSL errors returned by request_url - do_not_retry (bool, optional): Whether the webhook should retry sending the payload to request_url upon - failure. - idempotency_key (str, optional): A unique identifier of a webhook. You can use it to ensure that you won't - create the same webhook multiple times. 
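A sketch of creating the ad-hoc webhook described above. It assumes the `WebhookEventType` enum imported at the top of this module exposes a member such as `ACTOR_RUN_SUCCEEDED`; the request URL and idempotency key are placeholders.

```python
from apify import Actor
from apify_shared.consts import WebhookEventType


async def main() -> None:
    async with Actor:
        # Only has an effect on the Apify platform; locally it just logs an error.
        await Actor.add_webhook(
            event_types=[WebhookEventType.ACTOR_RUN_SUCCEEDED],
            request_url='https://example.com/my-webhook-endpoint',
            idempotency_key='my-success-webhook',
        )
```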
- - Returns: - dict: The created webhook - """ - return await cls._get_default_instance().add_webhook( - event_types=event_types, - request_url=request_url, - payload_template=payload_template, - ignore_ssl_errors=ignore_ssl_errors, - do_not_retry=do_not_retry, - idempotency_key=idempotency_key, - ) - - async def _add_webhook_internal( - self: Actor, - *, - event_types: list[WebhookEventType], - request_url: str, - payload_template: str | None = None, - ignore_ssl_errors: bool | None = None, - do_not_retry: bool | None = None, - idempotency_key: str | None = None, - ) -> dict | None: - self._raise_if_not_initialized() - - if not self.is_at_home(): - self.log.error('Actor.add_webhook() is only supported when running on the Apify platform.') - return None - - # If is_at_home() is True, config.actor_run_id is always set - assert self._config.actor_run_id is not None # noqa: S101 - - return await self._apify_client.webhooks().create( - actor_run_id=self._config.actor_run_id, - event_types=event_types, - request_url=request_url, - payload_template=payload_template, - ignore_ssl_errors=ignore_ssl_errors, - do_not_retry=do_not_retry, - idempotency_key=idempotency_key, - ) - - @classmethod - async def set_status_message( - cls: type[Actor], - status_message: str, - *, - is_terminal: bool | None = None, - ) -> dict | None: - """Set the status message for the current actor run. - - Args: - status_message (str): The status message to set to the run. - is_terminal (bool, optional): Set this flag to True if this is the final status message of the Actor run. - - Returns: - dict: The updated actor run object - """ - return await cls._get_default_instance().set_status_message(status_message=status_message, is_terminal=is_terminal) - - async def _set_status_message_internal( - self: Actor, - status_message: str, - *, - is_terminal: bool | None = None, - ) -> dict | None: - self._raise_if_not_initialized() - - if not self.is_at_home(): - title = 'Terminal status message' if is_terminal else 'Status message' - self.log.info(f'[{title}]: {status_message}') - return None - - # If is_at_home() is True, config.actor_run_id is always set - assert self._config.actor_run_id is not None # noqa: S101 - - return await self._apify_client.run(self._config.actor_run_id).update(status_message=status_message, is_status_message_terminal=is_terminal) - - @classmethod - async def create_proxy_configuration( - cls: type[Actor], - *, - actor_proxy_input: dict | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here - password: str | None = None, - groups: list[str] | None = None, - country_code: str | None = None, - proxy_urls: list[str] | None = None, - new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None, - ) -> ProxyConfiguration | None: - """Create a ProxyConfiguration object with the passed proxy configuration. - - Configures connection to a proxy server with the provided options. - Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or blacklists. - - For more details and code examples, see the `ProxyConfiguration` class. - - Args: - actor_proxy_input (dict, optional): Proxy configuration field from the actor input, if actor has such input field. - If you pass this argument, all the other arguments will be inferred from it. - password (str, optional): Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. 
- groups (list of str, optional): Proxy groups which the Apify Proxy should use, if provided. - country_code (str, optional): Country which the Apify Proxy should use, if provided. - proxy_urls (list of str, optional): Custom proxy server URLs which should be rotated through. - new_url_function (Callable, optional): Function which returns a custom proxy URL to be used. - - Returns: - ProxyConfiguration, optional: ProxyConfiguration object with the passed configuration, - or None, if no proxy should be used based on the configuration. - """ - return await cls._get_default_instance().create_proxy_configuration( - password=password, - groups=groups, - country_code=country_code, - proxy_urls=proxy_urls, - new_url_function=new_url_function, - actor_proxy_input=actor_proxy_input, - ) - - async def _create_proxy_configuration_internal( - self: Actor, - *, - actor_proxy_input: dict | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here - password: str | None = None, - groups: list[str] | None = None, - country_code: str | None = None, - proxy_urls: list[str] | None = None, - new_url_function: Callable[[str | None], str] | Callable[[str | None], Awaitable[str]] | None = None, - ) -> ProxyConfiguration | None: - self._raise_if_not_initialized() - - if actor_proxy_input is not None: - if actor_proxy_input.get('useApifyProxy', False): - country_code = country_code or actor_proxy_input.get('apifyProxyCountry') - groups = groups or actor_proxy_input.get('apifyProxyGroups') - else: - proxy_urls = actor_proxy_input.get('proxyUrls', []) - if not proxy_urls: - return None - - proxy_configuration = ProxyConfiguration( - password=password, - groups=groups, - country_code=country_code, - proxy_urls=proxy_urls, - new_url_function=new_url_function, - _actor_config=self._config, - _apify_client=self._apify_client, - ) - - await proxy_configuration.initialize() - - return proxy_configuration diff --git a/src/apify/apify_storage_client/__init__.py b/src/apify/apify_storage_client/__init__.py new file mode 100644 index 00000000..8b6d517c --- /dev/null +++ b/src/apify/apify_storage_client/__init__.py @@ -0,0 +1,3 @@ +from apify.apify_storage_client._apify_storage_client import ApifyStorageClient + +__all__ = ['ApifyStorageClient'] diff --git a/src/apify/apify_storage_client/_apify_storage_client.py b/src/apify/apify_storage_client/_apify_storage_client.py new file mode 100644 index 00000000..1153e95d --- /dev/null +++ b/src/apify/apify_storage_client/_apify_storage_client.py @@ -0,0 +1,56 @@ +from typing_extensions import override + +from apify_client import ApifyClientAsync +from crawlee._utils.crypto import crypto_random_object_id +from crawlee.base_storage_client import BaseStorageClient + +from apify._configuration import Configuration +from apify.apify_storage_client._dataset_client import DatasetClient +from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient +from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient +from apify.apify_storage_client._key_value_store_collection_client import KeyValueStoreCollectionClient +from apify.apify_storage_client._request_queue_client import RequestQueueClient +from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient + + +class ApifyStorageClient(BaseStorageClient): + """A storage client implementation based on the Apify platform storage.""" + + def __init__(self, *, configuration: Configuration) -> None: + 
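To round off the `create_proxy_configuration()` helper documented just above, a hedged sketch of the two ways it can be driven; the proxy group name, country code, and the `proxyConfiguration` input field name are assumptions for the example, not values required by the SDK.

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        # Either build the configuration from explicit options...
        proxy_configuration = await Actor.create_proxy_configuration(
            groups=['SOME_PROXY_GROUP'],  # placeholder group name
            country_code='US',
        )

        # ...or pass the raw proxy field from the Actor input
        # (returns None if the input says no proxy should be used).
        actor_input = await Actor.get_input() or {}
        proxy_configuration = await Actor.create_proxy_configuration(
            actor_proxy_input=actor_input.get('proxyConfiguration'),
        )
```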
self._client_key = crypto_random_object_id() + self._apify_client = ApifyClientAsync( + token=configuration.token, + api_url=configuration.api_base_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + self._configuration = configuration + + @override + def dataset(self, id: str) -> DatasetClient: + return DatasetClient(self._apify_client.dataset(id)) + + @override + def datasets(self) -> DatasetCollectionClient: + return DatasetCollectionClient(self._apify_client.datasets()) + + @override + def key_value_store(self, id: str) -> KeyValueStoreClient: + return KeyValueStoreClient(self._apify_client.key_value_store(id), self._configuration.api_public_base_url) + + @override + def key_value_stores(self) -> KeyValueStoreCollectionClient: + return KeyValueStoreCollectionClient(self._apify_client.key_value_stores()) + + @override + def request_queue(self, id: str) -> RequestQueueClient: + return RequestQueueClient(self._apify_client.request_queue(id, client_key=self._client_key)) + + @override + def request_queues(self) -> RequestQueueCollectionClient: + return RequestQueueCollectionClient(self._apify_client.request_queues()) + + @override + async def purge_on_start(self) -> None: + pass diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py new file mode 100644 index 00000000..dd10ced8 --- /dev/null +++ b/src/apify/apify_storage_client/_dataset_client.py @@ -0,0 +1,186 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + from contextlib import AbstractAsyncContextManager + + from httpx import Response + + from apify_client.clients import DatasetClientAsync + from crawlee._types import JsonSerializable + + +class DatasetClient(BaseDatasetClient): + """Dataset resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_dataset_client: DatasetClientAsync) -> None: + self._client = apify_dataset_client + + @override + async def get(self) -> DatasetMetadata | None: + result = await self._client.get() + return DatasetMetadata.model_validate(result) if result else None + + @override + async def update( + self, + *, + name: str | None = None, + ) -> DatasetMetadata: + return DatasetMetadata.model_validate( + await self._client.update( + name=name, + ) + ) + + @override + async def delete(self) -> None: + await self._client.delete() + + @override + async def list_items( + self, + *, + offset: int | None = 0, + limit: int | None = BaseDatasetClient._LIST_ITEMS_LIMIT, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + flatten: list[str] | None = None, + view: str | None = None, + ) -> DatasetItemsListPage: + return DatasetItemsListPage.model_validate( + await self._client.list_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + flatten=flatten, + view=view, + ) + ) + + @override + async def iterate_items( + self, + *, + offset: int = 0, + limit: int | None = None, + clean: bool = False, + desc: bool = False, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: 
str | None = None, + skip_empty: bool = False, + skip_hidden: bool = False, + ) -> AsyncIterator[dict]: + return self._client.iterate_items( + offset=offset, + limit=limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + ) + + @override + async def get_items_as_bytes( + self, + *, + item_format: str = 'json', + offset: int | None = None, + limit: int | None = None, + desc: bool = False, + clean: bool = False, + bom: bool = False, + delimiter: str | None = None, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_header_row: bool = False, + skip_hidden: bool = False, + xml_root: str | None = None, + xml_row: str | None = None, + flatten: list[str] | None = None, + ) -> bytes: + return await self._client.get_items_as_bytes( + item_format=item_format, + offset=offset, + limit=limit, + desc=desc, + clean=clean, + bom=bom, + delimiter=delimiter, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_header_row=skip_header_row, + skip_hidden=skip_hidden, + xml_root=xml_root, + xml_row=xml_row, + flatten=flatten, + ) + + @override + async def stream_items( + self, + *, + item_format: str = 'json', + offset: int | None = None, + limit: int | None = None, + desc: bool = False, + clean: bool = False, + bom: bool = False, + delimiter: str | None = None, + fields: list[str] | None = None, + omit: list[str] | None = None, + unwind: str | None = None, + skip_empty: bool = False, + skip_header_row: bool = False, + skip_hidden: bool = False, + xml_root: str | None = None, + xml_row: str | None = None, + ) -> AbstractAsyncContextManager[Response | None]: + return self._client.stream_items( + item_format=item_format, + offset=offset, + limit=limit, + desc=desc, + clean=clean, + bom=bom, + delimiter=delimiter, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_header_row=skip_header_row, + skip_hidden=skip_hidden, + xml_root=xml_root, + xml_row=xml_row, + ) + + @override + async def push_items(self, items: JsonSerializable) -> None: + await self._client.push_items( + items=items, + ) diff --git a/src/apify/apify_storage_client/_dataset_collection_client.py b/src/apify/apify_storage_client/_dataset_collection_client.py new file mode 100644 index 00000000..1a6fb27a --- /dev/null +++ b/src/apify/apify_storage_client/_dataset_collection_client.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata + +if TYPE_CHECKING: + from apify_client.clients import DatasetCollectionClientAsync + + +class DatasetCollectionClient(BaseDatasetCollectionClient): + """Dataset collection resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_dataset_collection_client: DatasetCollectionClientAsync) -> None: + self._client = apify_dataset_collection_client + + @override + async def get_or_create( + self, + *, + id: str | None = None, + name: str | None = None, + schema: dict | None = None, + ) -> DatasetMetadata: + return DatasetMetadata.model_validate( + await self._client.get_or_create( + name=id if id is not None else name, + schema=schema, + ) + ) + + @override + async def list( + self, + *, + unnamed: bool = False, + limit: int | None = None, + offset: int | None = None, + desc: bool = False, + 
) -> DatasetListPage: + return DatasetListPage.model_validate( + await self._client.list( + unnamed=unnamed, + limit=limit, + offset=offset, + desc=desc, + ) + ) diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py new file mode 100644 index 00000000..d02d18cb --- /dev/null +++ b/src/apify/apify_storage_client/_key_value_store_client.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from contextlib import asynccontextmanager +from typing import TYPE_CHECKING, Any + +from typing_extensions import override + +from crawlee.base_storage_client import BaseKeyValueStoreClient, KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + from contextlib import AbstractAsyncContextManager + + from httpx import Response + + from apify_client.clients import KeyValueStoreClientAsync + + +class KeyValueStoreClient(BaseKeyValueStoreClient): + """Key-value store resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_key_value_store_client: KeyValueStoreClientAsync, api_public_base_url: str) -> None: + self._client = apify_key_value_store_client + self._api_public_base_url = api_public_base_url + + @override + async def get(self) -> KeyValueStoreMetadata | None: + result = await self._client.get() + return KeyValueStoreMetadata.model_validate(result) if result else None + + @override + async def update( + self, + *, + name: str | None = None, + ) -> KeyValueStoreMetadata: + return KeyValueStoreMetadata.model_validate(await self._client.update()) + + @override + async def delete(self) -> None: + await self._client.delete() + + @override + async def list_keys( + self, + *, + limit: int = 1000, + exclusive_start_key: str | None = None, + ) -> KeyValueStoreListKeysPage: + return KeyValueStoreListKeysPage.model_validate(await self._client.list_keys()) + + @override + async def get_record(self, key: str) -> KeyValueStoreRecord | None: + result = await self._client.get_record(key) + return KeyValueStoreRecord.model_validate(result) if result else None + + @override + async def get_record_as_bytes(self, key: str) -> KeyValueStoreRecord | None: + result = await self._client.get_record_as_bytes(key) + return KeyValueStoreRecord.model_validate(result) if result else None + + @override + async def stream_record(self, key: str) -> AbstractAsyncContextManager[KeyValueStoreRecord[Response] | None]: + return self._stream_record_internal(key) + + @asynccontextmanager + async def _stream_record_internal(self, key: str) -> AsyncIterator[KeyValueStoreRecord[Response] | None]: + async with self._client.stream_record(key) as response: + yield KeyValueStoreRecord.model_validate(response) + + @override + async def set_record(self, key: str, value: Any, content_type: str | None = None) -> None: + await self._client.set_record( + key=key, + value=value, + content_type=content_type, + ) + + @override + async def delete_record(self, key: str) -> None: + await self._client.delete_record( + key=key, + ) + + async def get_public_url(self, key: str) -> str: + """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. + + Args: + key: The key for which the URL should be generated. 
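To make the new storage client's role concrete, a small sketch of constructing `ApifyStorageClient` from a `Configuration` and building a public record URL with the `get_public_url()` helper above; the API token, store ID, and record key are placeholders, and the resulting URL shape follows the f-string in the method body.

```python
import asyncio

from apify._configuration import Configuration
from apify.apify_storage_client import ApifyStorageClient


async def main() -> None:
    configuration = Configuration(token='<apify-api-token>')
    storage_client = ApifyStorageClient(configuration=configuration)

    # Resource clients are created per storage ID.
    kvs_client = storage_client.key_value_store('<key-value-store-id>')

    # E.g. <api_public_base_url>/v2/key-value-stores/<key-value-store-id>/records/OUTPUT
    url = await kvs_client.get_public_url('OUTPUT')
    print(url)


asyncio.run(main())
```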
+ """ + public_api_url = self._api_public_base_url + + return f'{public_api_url}/v2/key-value-stores/{self._client.resource_id}/records/{key}' diff --git a/src/apify/apify_storage_client/_key_value_store_collection_client.py b/src/apify/apify_storage_client/_key_value_store_collection_client.py new file mode 100644 index 00000000..27f76f37 --- /dev/null +++ b/src/apify/apify_storage_client/_key_value_store_collection_client.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata + +if TYPE_CHECKING: + from apify_client.clients import KeyValueStoreCollectionClientAsync + + +class KeyValueStoreCollectionClient(BaseKeyValueStoreCollectionClient): + """Key-value store collection resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_dataset_collection_client: KeyValueStoreCollectionClientAsync) -> None: + self._client = apify_dataset_collection_client + + @override + async def get_or_create( + self, + *, + id: str | None = None, + name: str | None = None, + schema: dict | None = None, + ) -> KeyValueStoreMetadata: + return KeyValueStoreMetadata.model_validate( + await self._client.get_or_create( + name=id if id is not None else name, + schema=schema, + ) + ) + + @override + async def list( + self, + *, + unnamed: bool = False, + limit: int | None = None, + offset: int | None = None, + desc: bool = False, + ) -> KeyValueStoreListPage: + return KeyValueStoreListPage.model_validate(await self._client.list(unnamed=unnamed, limit=limit, offset=offset, desc=desc)) diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py new file mode 100644 index 00000000..2cdbe58d --- /dev/null +++ b/src/apify/apify_storage_client/_request_queue_client.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee import Request +from crawlee.base_storage_client import ( + BaseRequestQueueClient, + BatchRequestsOperationResponse, + ProcessedRequest, + ProlongRequestLockResponse, + RequestQueueHead, + RequestQueueHeadWithLocks, + RequestQueueMetadata, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from apify_client.clients import RequestQueueClientAsync + + +class RequestQueueClient(BaseRequestQueueClient): + """Request queue resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_request_queue_client: RequestQueueClientAsync) -> None: + self._client = apify_request_queue_client + + @override + async def get(self) -> RequestQueueMetadata | None: + result = await self._client.get() + return RequestQueueMetadata.model_validate({'resourceDirectory': ''} | result) if result else None + + @override + async def update( + self, + *, + name: str | None = None, + ) -> RequestQueueMetadata: + return RequestQueueMetadata.model_validate( + {'resourceDirectory': ''} + | await self._client.update( + name=name, + ) + ) + + @override + async def delete(self) -> None: + await self._client.delete() + + @override + async def list_head(self, *, limit: int | None = None) -> RequestQueueHead: + return RequestQueueHead.model_validate( + await self._client.list_head( + limit=limit, + ), + ) + + @override + async def list_and_lock_head(self, *, lock_secs: int, limit: int | None = 
None) -> RequestQueueHeadWithLocks: + return RequestQueueHeadWithLocks.model_validate( + await self._client.list_and_lock_head( + lock_secs=lock_secs, + limit=limit, + ) + ) + + @override + async def add_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + return ProcessedRequest.model_validate( + {'id': request.id, 'uniqueKey': request.unique_key} + | await self._client.add_request( + request=request.model_dump( + by_alias=True, + exclude={ + 'id', + 'json_', + 'order_no', + 'query_params', + 'data', + }, + ), + forefront=forefront, + ) + ) + + @override + async def get_request(self, request_id: str) -> Request | None: + result = await self._client.get_request(request_id) + return Request.model_validate(result) if result else None + + @override + async def update_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + return ProcessedRequest.model_validate( + {'id': request.id, 'uniqueKey': request.unique_key} + | await self._client.update_request( + request=request.model_dump( + by_alias=True, + exclude={ + 'json_', + 'order_no', + 'query_params', + 'data', + }, + ), + forefront=forefront, + ) + ) + + @override + async def delete_request(self, request_id: str) -> None: + await self._client.delete_request(request_id) + + @override + async def prolong_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + lock_secs: int, + ) -> ProlongRequestLockResponse: + return ProlongRequestLockResponse.model_validate( + await self._client.prolong_request_lock( + request_id=request_id, + forefront=forefront, + lock_secs=lock_secs, + ) + ) + + @override + async def delete_request_lock( + self, + request_id: str, + *, + forefront: bool = False, + ) -> None: + await self._client.delete_request_lock( + request_id=request_id, + forefront=forefront, + ) + + @override + async def batch_add_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> BatchRequestsOperationResponse: + return BatchRequestsOperationResponse.model_validate( + await self._client.batch_add_requests( + requests=[ + r.model_dump( + by_alias=True, + exclude={ + 'id', + 'json_', + 'order_no', + 'query_params', + 'data', + }, + ) + for r in requests + ], + forefront=forefront, + ) + ) + + @override + async def batch_delete_requests(self, requests: list[Request]) -> BatchRequestsOperationResponse: + return BatchRequestsOperationResponse.model_validate( + await self._client.batch_delete_requests( + requests=[ + r.model_dump( + by_alias=True, + exclude={ + 'json_', + 'order_no', + 'query_params', + 'data', + }, + ) + for r in requests + ], + ) + ) diff --git a/src/apify/apify_storage_client/_request_queue_collection_client.py b/src/apify/apify_storage_client/_request_queue_collection_client.py new file mode 100644 index 00000000..50aad1aa --- /dev/null +++ b/src/apify/apify_storage_client/_request_queue_collection_client.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata + +if TYPE_CHECKING: + from apify_client.clients import RequestQueueCollectionClientAsync + + +class RequestQueueCollectionClient(BaseRequestQueueCollectionClient): + """Request queue collection resource client implementation based on the Apify platform storage.""" + + def __init__(self, apify_request_queue_collection_client: 
RequestQueueCollectionClientAsync) -> None: + self._client = apify_request_queue_collection_client + + @override + async def get_or_create( + self, + *, + id: str | None = None, + name: str | None = None, + schema: dict | None = None, + ) -> RequestQueueMetadata: + return RequestQueueMetadata.model_validate( + {'resourceDirectory': ''} + | await self._client.get_or_create( + name=id if id is not None else name, + ) + ) + + @override + async def list( + self, + *, + unnamed: bool = False, + limit: int | None = None, + offset: int | None = None, + desc: bool = False, + ) -> RequestQueueListPage: + return RequestQueueListPage.model_validate( + await self._client.list( + unnamed=unnamed, + limit=limit, + offset=offset, + desc=desc, + ) + ) diff --git a/src/apify/config.py b/src/apify/config.py deleted file mode 100644 index a94e2152..00000000 --- a/src/apify/config.py +++ /dev/null @@ -1,130 +0,0 @@ -from __future__ import annotations - -from apify_shared.consts import ActorEnvVars, ApifyEnvVars - -from apify._utils import fetch_and_parse_env_var - - -class Configuration: - """A class for specifying the configuration of an actor. - - Can be used either globally via `Configuration.get_global_configuration()`, - or it can be specific to each `Actor` instance on the `actor.config` property. - """ - - _default_instance: Configuration | None = None - - def __init__( - self: Configuration, - *, - api_base_url: str | None = None, - api_public_base_url: str | None = None, - container_port: int | None = None, - container_url: str | None = None, - default_dataset_id: str | None = None, - default_key_value_store_id: str | None = None, - default_request_queue_id: str | None = None, - input_key: str | None = None, - max_used_cpu_ratio: float | None = None, - metamorph_after_sleep_millis: int | None = None, - persist_state_interval_millis: int | None = None, - persist_storage: bool | None = None, - proxy_hostname: str | None = None, - proxy_password: str | None = None, - proxy_port: int | None = None, - proxy_status_url: str | None = None, - purge_on_start: bool | None = None, - token: str | None = None, - standby_port: int | None = None, - system_info_interval_millis: int | None = None, - ) -> None: - """Create a `Configuration` instance. - - All the parameters are loaded by default from environment variables when running on the Apify platform. - You can override them here in the Configuration constructor, which might be useful for local testing of your actors. - - Args: - api_base_url (str, optional): The URL of the Apify API. - This is the URL actually used for connecting to the API, so it can contain an IP address when running in a container on the platform. - api_public_base_url (str, optional): The public URL of the Apify API. - This will always contain the public URL of the API, even when running in a container on the platform. - Useful for generating shareable URLs to key-value store records or datasets. - container_port (int, optional): The port on which the container can listen for HTTP requests. - container_url (str, optional): The URL on which the container can listen for HTTP requests. - default_dataset_id (str, optional): The ID of the default dataset for the actor. - default_key_value_store_id (str, optional): The ID of the default key-value store for the actor. - default_request_queue_id (str, optional): The ID of the default request queue for the actor. 
- input_key (str, optional): The key of the input record in the actor's default key-value store - max_used_cpu_ratio (float, optional): The CPU usage above which the SYSTEM_INFO event will report the CPU is overloaded. - metamorph_after_sleep_millis (int, optional): How long should the actor sleep after calling metamorph. - persist_state_interval_millis (int, optional): How often should the actor emit the PERSIST_STATE event. - persist_storage (bool, optional): Whether the actor should persist its used storages to the filesystem when running locally. - proxy_hostname (str, optional): The hostname of Apify Proxy. - proxy_password (str, optional): The password for Apify Proxy. - proxy_port (str, optional): The port of Apify Proxy. - proxy_status_url (str, optional): The URL on which the Apify Proxy status page is available. - purge_on_start (str, optional): Whether the actor should purge its default storages on startup, when running locally. - token (str, optional): The API token for the Apify API this actor should use. - system_info_interval_millis (str, optional): How often should the actor emit the SYSTEM_INFO event when running locally. - standby_port (int, optional): The port on which the container can listen for Actor Standby HTTP requests. - """ - # TODO: Document all these members - # https://github.com/apify/apify-sdk-python/issues/147 - self.actor_build_id = fetch_and_parse_env_var(ActorEnvVars.BUILD_ID) - self.actor_build_number = fetch_and_parse_env_var(ActorEnvVars.BUILD_NUMBER) - self.actor_events_ws_url = fetch_and_parse_env_var(ActorEnvVars.EVENTS_WEBSOCKET_URL) - self.actor_id = fetch_and_parse_env_var(ActorEnvVars.ID) - self.actor_run_id = fetch_and_parse_env_var(ActorEnvVars.RUN_ID) - self.actor_task_id = fetch_and_parse_env_var(ActorEnvVars.TASK_ID) - self.api_base_url = api_base_url or fetch_and_parse_env_var(ApifyEnvVars.API_BASE_URL, 'https://api.apify.com') - self.api_public_base_url = api_public_base_url or fetch_and_parse_env_var(ApifyEnvVars.API_PUBLIC_BASE_URL, 'https://api.apify.com') - self.chrome_executable_path = fetch_and_parse_env_var(ApifyEnvVars.CHROME_EXECUTABLE_PATH) - self.container_port = container_port or fetch_and_parse_env_var(ActorEnvVars.WEB_SERVER_PORT, 4321) - self.container_url = container_url or fetch_and_parse_env_var(ActorEnvVars.WEB_SERVER_URL, 'http://localhost:4321') - self.dedicated_cpus = fetch_and_parse_env_var(ApifyEnvVars.DEDICATED_CPUS) - self.default_browser_path = fetch_and_parse_env_var(ApifyEnvVars.DEFAULT_BROWSER_PATH) - self.default_dataset_id = default_dataset_id or fetch_and_parse_env_var(ActorEnvVars.DEFAULT_DATASET_ID, 'default') - self.default_key_value_store_id = default_key_value_store_id or fetch_and_parse_env_var(ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID, 'default') - self.default_request_queue_id = default_request_queue_id or fetch_and_parse_env_var(ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID, 'default') - self.disable_browser_sandbox = fetch_and_parse_env_var(ApifyEnvVars.DISABLE_BROWSER_SANDBOX, default=False) - self.headless = fetch_and_parse_env_var(ApifyEnvVars.HEADLESS, default=True) - self.input_key = input_key or fetch_and_parse_env_var(ActorEnvVars.INPUT_KEY, 'INPUT') - self.input_secrets_private_key_file = fetch_and_parse_env_var(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_FILE) - self.input_secrets_private_key_passphrase = fetch_and_parse_env_var(ApifyEnvVars.INPUT_SECRETS_PRIVATE_KEY_PASSPHRASE) - self.is_at_home = fetch_and_parse_env_var(ApifyEnvVars.IS_AT_HOME, default=False) - self.max_used_cpu_ratio = 
max_used_cpu_ratio or fetch_and_parse_env_var(ApifyEnvVars.MAX_USED_CPU_RATIO, 0.95) - self.memory_mbytes = fetch_and_parse_env_var(ActorEnvVars.MEMORY_MBYTES) - self.meta_origin = fetch_and_parse_env_var(ApifyEnvVars.META_ORIGIN) - self.metamorph_after_sleep_millis = metamorph_after_sleep_millis or fetch_and_parse_env_var(ApifyEnvVars.METAMORPH_AFTER_SLEEP_MILLIS, 300000) - self.persist_state_interval_millis = persist_state_interval_millis or fetch_and_parse_env_var( - ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS, 60000 - ) - self.persist_storage = persist_storage or fetch_and_parse_env_var(ApifyEnvVars.PERSIST_STORAGE, default=True) - self.proxy_hostname = proxy_hostname or fetch_and_parse_env_var(ApifyEnvVars.PROXY_HOSTNAME, 'proxy.apify.com') - self.proxy_password = proxy_password or fetch_and_parse_env_var(ApifyEnvVars.PROXY_PASSWORD) - self.proxy_port = proxy_port or fetch_and_parse_env_var(ApifyEnvVars.PROXY_PORT, 8000) - self.proxy_status_url = proxy_status_url or fetch_and_parse_env_var(ApifyEnvVars.PROXY_STATUS_URL, 'http://proxy.apify.com') - self.purge_on_start = purge_on_start or fetch_and_parse_env_var(ApifyEnvVars.PURGE_ON_START, default=False) - self.standby_port = standby_port or fetch_and_parse_env_var(ActorEnvVars.STANDBY_PORT, 4322) - self.started_at = fetch_and_parse_env_var(ActorEnvVars.STARTED_AT) - self.timeout_at = fetch_and_parse_env_var(ActorEnvVars.TIMEOUT_AT) - self.token = token or fetch_and_parse_env_var(ApifyEnvVars.TOKEN) - self.user_id = fetch_and_parse_env_var(ApifyEnvVars.USER_ID) - self.xvfb = fetch_and_parse_env_var(ApifyEnvVars.XVFB, default=False) - self.system_info_interval_millis = system_info_interval_millis or fetch_and_parse_env_var(ApifyEnvVars.SYSTEM_INFO_INTERVAL_MILLIS, 60000) - - @classmethod - def _get_default_instance(cls: type[Configuration]) -> Configuration: - if cls._default_instance is None: - cls._default_instance = cls() - - return cls._default_instance - - @classmethod - def get_global_configuration(cls: type[Configuration]) -> Configuration: - """Retrive the global configuration. - - The global configuration applies when you call actor methods via their static versions, e.g. `Actor.init()`. - Also accessible via `Actor.config`. - """ - return cls._get_default_instance() diff --git a/src/apify/consts.py b/src/apify/consts.py deleted file mode 100644 index 47d2ca7b..00000000 --- a/src/apify/consts.py +++ /dev/null @@ -1,67 +0,0 @@ -from __future__ import annotations - -import re -import warnings -from enum import Enum -from typing import Any - -from apify_shared.consts import BOOL_ENV_VARS as _BOOL_ENV_VARS # noqa: F401 -from apify_shared.consts import DATETIME_ENV_VARS as _DATETIME_ENV_VARS # noqa: F401 -from apify_shared.consts import FLOAT_ENV_VARS as _FLOAT_ENV_VARS # noqa: F401 -from apify_shared.consts import INTEGER_ENV_VARS as _INTEGER_ENV_VARS # noqa: F401 -from apify_shared.consts import STRING_ENV_VARS as _STRING_ENV_VARS # noqa: F401 -from apify_shared.consts import ActorEventTypes as _ActorEventTypes # noqa: F401 -from apify_shared.consts import ActorExitCodes as _ActorExitCodes # noqa: F401 -from apify_shared.consts import ApifyEnvVars as _ApifyEnvVars # noqa: F401 - -DEPRECATED_NAMES = [ - 'BOOL_ENV_VARS', - 'DATETIME_ENV_VARS', - 'FLOAT_ENV_VARS', - 'INTEGER_ENV_VARS', - 'STRING_ENV_VARS', - 'ActorEventTypes', - 'ActorExitCodes', - 'ApifyEnvVars', -] - - -# The following piece of code is highly inspired by the example in https://peps.python.org/pep-0562. -# The else branch is missing intentionally! 
Check the following discussion for details: -# https://github.com/apify/apify-client-python/pull/132#discussion_r1277294315. -def __getattr__(name: str) -> Any: - if name in DEPRECATED_NAMES: - warnings.warn( - ( - f'Importing "{name}" from "apify_client.consts" is deprecated and will be removed in the future. ' - 'Please use "apify_shared" library instead.' - ), - category=DeprecationWarning, - stacklevel=2, - ) - return globals()[f'_{name}'] - raise AttributeError(f'module {__name__!r} has no attribute {name!r}') - - -class StorageTypes(str, Enum): - """Possible Apify storage types.""" - - DATASET = 'Dataset' - KEY_VALUE_STORE = 'Key-value store' - REQUEST_QUEUE = 'Request queue' - - -DEFAULT_API_PARAM_LIMIT = 1000 - -REQUEST_ID_LENGTH = 15 - -REQUEST_QUEUE_HEAD_MAX_LIMIT = 1000 - -EVENT_LISTENERS_TIMEOUT_SECS = 5 - -BASE64_REGEXP = '[-A-Za-z0-9+/]*={0,3}' -ENCRYPTED_INPUT_VALUE_PREFIX = 'ENCRYPTED_VALUE' -ENCRYPTED_INPUT_VALUE_REGEXP = re.compile(f'^{ENCRYPTED_INPUT_VALUE_PREFIX}:({BASE64_REGEXP}):({BASE64_REGEXP})$') - -# 9MB -MAX_PAYLOAD_SIZE_BYTES = 9437184 diff --git a/src/apify/event_manager.py b/src/apify/event_manager.py deleted file mode 100644 index edb2595f..00000000 --- a/src/apify/event_manager.py +++ /dev/null @@ -1,236 +0,0 @@ -from __future__ import annotations - -import asyncio -import contextlib -import inspect -import json -from collections import defaultdict -from typing import TYPE_CHECKING, Any, Callable, Coroutine, Union - -import websockets.client -from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value, parse_date_fields -from pyee.asyncio import AsyncIOEventEmitter - -from apify.log import logger - -if TYPE_CHECKING: - from apify_shared.consts import ActorEventTypes - - from apify.config import Configuration - -ListenerType = Union[Callable[[], None], Callable[[Any], None], Callable[[], Coroutine[Any, Any, None]], Callable[[Any], Coroutine[Any, Any, None]]] - - -@ignore_docs -class EventManager: - """A class for managing actor events. - - You shouldn't use this class directly, - but instead use it via the `Actor.on()` and `Actor.off()` methods. - """ - - _platform_events_websocket: websockets.client.WebSocketClientProtocol | None = None - _process_platform_messages_task: asyncio.Task | None = None - _send_persist_state_interval_task: asyncio.Task | None = None - _send_system_info_interval_task: asyncio.Task | None = None - _listener_tasks: set[asyncio.Task] - _listeners_to_wrappers: dict[ActorEventTypes, dict[Callable, list[Callable]]] - _connected_to_platform_websocket: asyncio.Future | None = None - - def __init__(self: EventManager, config: Configuration) -> None: - """Create an instance of the EventManager. - - Args: - config (Configuration): The actor configuration to be used in this event manager. - """ - self._config = config - self._event_emitter = AsyncIOEventEmitter() - self._initialized = False - self._listener_tasks = set() - self._listeners_to_wrappers = defaultdict(lambda: defaultdict(list)) - - async def init(self: EventManager) -> None: - """Initialize the event manager. - - When running this on the Apify Platform, this will start processing events - send by the platform to the events websocket and emitting them as events - that can be listened to by the `Actor.on()` method. 
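As a usage illustration of the event handling described above, here is a hedged sketch of registering listeners with the pre-2.0 `Actor.on()` API that this `EventManager` backs (and which this diff removes). Listeners may be sync or async and may take zero arguments or a single event-data argument; the listener bodies below are made up.

```python
# A sketch of the listener contract described above, using the 1.x-style
# Actor.on() / ActorEventTypes API backed by this (now removed) EventManager.
import asyncio

from apify import Actor
from apify_shared.consts import ActorEventTypes


async def main() -> None:
    async with Actor:
        # Async listener receiving the event payload (one argument).
        async def on_persist_state(event_data: dict) -> None:
            Actor.log.info('Persisting state', extra=event_data)

        # Sync listener ignoring the payload (zero arguments) is accepted too.
        def on_migrating() -> None:
            Actor.log.info('Migration is about to start')

        Actor.on(ActorEventTypes.PERSIST_STATE, on_persist_state)
        Actor.on(ActorEventTypes.MIGRATING, on_migrating)

        # ... the Actor's real work would go here ...

        Actor.off(ActorEventTypes.MIGRATING, on_migrating)  # listeners can be removed again


if __name__ == '__main__':
    asyncio.run(main())
```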
- """ - if self._initialized: - raise RuntimeError('EventManager was already initialized!') - - # Run tasks but don't await them - if self._config.actor_events_ws_url: - self._connected_to_platform_websocket = asyncio.Future() - self._process_platform_messages_task = asyncio.create_task(self._process_platform_messages()) - is_connected = await self._connected_to_platform_websocket - if not is_connected: - raise RuntimeError('Error connecting to platform events websocket!') - else: - logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.') - - self._initialized = True - - async def close(self: EventManager, event_listeners_timeout_secs: float | None = None) -> None: - """Initialize the event manager. - - This will stop listening for the platform events, - and it will wait for all the event listeners to finish. - - Args: - event_listeners_timeout_secs (float, optional): Optional timeout after which the pending event listeners are canceled. - """ - if not self._initialized: - raise RuntimeError('EventManager was not initialized!') - - if self._platform_events_websocket: - await self._platform_events_websocket.close() - - if self._process_platform_messages_task: - await self._process_platform_messages_task - - await self.wait_for_all_listeners_to_complete(timeout_secs=event_listeners_timeout_secs) - - self._event_emitter.remove_all_listeners() - - self._initialized = False - - def on(self: EventManager, event_name: ActorEventTypes, listener: ListenerType) -> Callable: - """Add an event listener to the event manager. - - Args: - event_name (ActorEventTypes): The actor event for which to listen to. - listener (Callable): The function which is to be called when the event is emitted (can be async). - Must accept either zero or one arguments (the first argument will be the event data). - """ - if not self._initialized: - raise RuntimeError('EventManager was not initialized!') - - # Detect whether the listener will accept the event_data argument - try: - signature = inspect.signature(listener) - except (ValueError, TypeError): - # If we can't determine the listener argument count (e.g. 
for the built-in `print` function), - # let's assume the listener will accept the argument - listener_argument_count = 1 - else: - try: - dummy_event_data: dict = {} - signature.bind(dummy_event_data) - listener_argument_count = 1 - except TypeError: - try: - signature.bind() - listener_argument_count = 0 - except TypeError as err: - raise ValueError('The "listener" argument must be a callable which accepts 0 or 1 arguments!') from err - - event_name = maybe_extract_enum_member_value(event_name) - - async def inner_wrapper(event_data: Any) -> None: - if inspect.iscoroutinefunction(listener): - if listener_argument_count == 0: - await listener() - else: - await listener(event_data) - elif listener_argument_count == 0: - listener() # type: ignore[call-arg] - else: - listener(event_data) # type: ignore[call-arg] - - async def outer_wrapper(event_data: Any) -> None: - listener_task = asyncio.create_task(inner_wrapper(event_data)) - self._listener_tasks.add(listener_task) - try: - await listener_task - except asyncio.CancelledError: - raise - except Exception: - # We need to swallow the exception and just log it here, since it could break the event emitter otherwise - logger.exception('Exception in event listener', extra={'event_name': event_name, 'listener_name': listener.__name__}) - finally: - self._listener_tasks.remove(listener_task) - - self._listeners_to_wrappers[event_name][listener].append(outer_wrapper) - - return self._event_emitter.add_listener(event_name, outer_wrapper) - - def off(self: EventManager, event_name: ActorEventTypes, listener: Callable | None = None) -> None: - """Remove a listener, or all listeners, from an actor event. - - Args: - event_name (ActorEventTypes): The actor event for which to remove listeners. - listener (Callable, optional): The listener which is supposed to be removed. If not passed, all listeners of this event are removed. - """ - if not self._initialized: - raise RuntimeError('EventManager was not initialized!') - - event_name = maybe_extract_enum_member_value(event_name) - - if listener: - for listener_wrapper in self._listeners_to_wrappers[event_name][listener]: - self._event_emitter.remove_listener(event_name, listener_wrapper) - self._listeners_to_wrappers[event_name][listener] = [] - else: - self._listeners_to_wrappers[event_name] = defaultdict(list) - self._event_emitter.remove_all_listeners(event_name) - - def emit(self: EventManager, event_name: ActorEventTypes, data: Any) -> None: - """Emit an actor event manually. - - Args: - event_name (ActorEventTypes): The actor event which should be emitted. - data (Any): The data that should be emitted with the event. - """ - event_name = maybe_extract_enum_member_value(event_name) - - self._event_emitter.emit(event_name, data) - - async def wait_for_all_listeners_to_complete(self: EventManager, *, timeout_secs: float | None = None) -> None: - """Wait for all event listeners which are currently being executed to complete. - - Args: - timeout_secs (float, optional): Timeout for the wait. If the event listeners don't finish until the timeout, they will be canceled. 
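The timeout behaviour documented here is implemented just below with `asyncio.wait()` plus cancellation of whatever is still pending. The following is a self-contained sketch of that pattern; the task durations and the 0.5 s timeout are made up for illustration.

```python
# A self-contained sketch of the "wait for pending tasks, cancel on timeout"
# pattern used by wait_for_all_listeners_to_complete() below.
from __future__ import annotations

import asyncio
import contextlib


async def wait_for_tasks(tasks: set[asyncio.Task], timeout_secs: float | None = None) -> None:
    async def _wait() -> None:
        # return_exceptions=True so one failing task does not hide the others.
        await asyncio.gather(*tasks, return_exceptions=True)

    if timeout_secs is None:
        await _wait()
        return

    _, pending = await asyncio.wait([asyncio.create_task(_wait())], timeout=timeout_secs)
    for task in pending:
        # Cancelling the waiting wrapper propagates into the gather and cancels the slow tasks.
        task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await task


async def main() -> None:
    tasks = {asyncio.create_task(asyncio.sleep(d)) for d in (0.1, 0.2, 5.0)}
    await wait_for_tasks(tasks, timeout_secs=0.5)  # the 5 s sleep gets cancelled


if __name__ == '__main__':
    asyncio.run(main())
```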
- """ - - async def _wait_for_listeners() -> None: - results = await asyncio.gather(*self._listener_tasks, return_exceptions=True) - for result in results: - if result is Exception: - logger.exception('Event manager encountered an exception in one of the event listeners', exc_info=result) - - if timeout_secs: - _, pending = await asyncio.wait([asyncio.create_task(_wait_for_listeners())], timeout=timeout_secs) - if pending: - logger.warning('Timed out waiting for event listeners to complete, unfinished event listeners will be canceled') - for pending_task in pending: - pending_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await pending_task - else: - await _wait_for_listeners() - - async def _process_platform_messages(self: EventManager) -> None: - # This should be called only on the platform, where we have the ACTOR_EVENTS_WS_URL configured - assert self._config.actor_events_ws_url is not None # noqa: S101 - assert self._connected_to_platform_websocket is not None # noqa: S101 - - try: - async with websockets.client.connect(self._config.actor_events_ws_url) as websocket: - self._platform_events_websocket = websocket - self._connected_to_platform_websocket.set_result(True) - async for message in websocket: - try: - parsed_message = json.loads(message) - assert isinstance(parsed_message, dict) # noqa: S101 - parsed_message = parse_date_fields(parsed_message) - event_name = parsed_message['name'] - event_data = parsed_message.get('data') # 'data' can be missing - - self._event_emitter.emit(event_name, event_data) - - except Exception: - logger.exception('Cannot parse actor event', extra={'message': message}) - except Exception: - logger.exception('Error in websocket connection') - self._connected_to_platform_websocket.set_result(False) diff --git a/src/apify/log.py b/src/apify/log.py deleted file mode 100644 index 180ab826..00000000 --- a/src/apify/log.py +++ /dev/null @@ -1,124 +0,0 @@ -from __future__ import annotations - -import json -import logging -import textwrap -import traceback -from typing import Any - -from apify_shared.utils import ignore_docs -from colorama import Fore, Style, just_fix_windows_console - -just_fix_windows_console() - - -# Name of the logger used throughout the library (resolves to 'apify') -logger_name = __name__.split('.')[0] - -# Logger used throughout the library -logger = logging.getLogger(logger_name) - -_LOG_NAME_COLOR = Fore.LIGHTBLACK_EX - -_LOG_LEVEL_COLOR = { - logging.DEBUG: Fore.BLUE, - logging.INFO: Fore.GREEN, - logging.WARNING: Fore.YELLOW, - logging.ERROR: Fore.RED, - logging.CRITICAL: Fore.RED, -} - -_LOG_LEVEL_SHORT_ALIAS = { - logging.DEBUG: 'DEBUG', - logging.INFO: 'INFO ', - logging.WARNING: 'WARN ', - logging.ERROR: 'ERROR', -} - -# So that all the log messages have the same alignment -_LOG_MESSAGE_INDENT = ' ' * 6 - - -class ActorLogFormatter(logging.Formatter): - """Log formatter that prints out the log message nicely formatted, with colored level and stringified extra fields. - - It formats the log records so that they: - - start with the level (colorized, and padded to 5 chars so that it is nicely aligned) - - then have the actual log message, if it's multiline then it's nicely indented - - then have the stringified extra log fields - - then, if an exception is a part of the log record, prints the formatted exception. 
- """ - - # The fields that are added to the log record with `logger.log(..., extra={...})` - # are just merged in the log record with the other log record properties, and you can't get them in some nice, isolated way. - # So, to get the extra fields, we just compare all the properties present in the log record - # with properties present in an empty log record, - # and extract all the extra ones not present in the empty log record - empty_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None) - - def __init__( - self: ActorLogFormatter, - include_logger_name: bool = False, # noqa: FBT001, FBT002 - *args: Any, - **kwargs: Any, - ) -> None: - """Create an instance of the ActorLogFormatter. - - Args: - include_logger_name: Include logger name at the beginning of the log line. Defaults to False. - args: Arguments passed to the parent class. - kwargs: Keyword arguments passed to the parent class. - """ - super().__init__(*args, **kwargs) - self.include_logger_name = include_logger_name - - def _get_extra_fields(self: ActorLogFormatter, record: logging.LogRecord) -> dict[str, Any]: - extra_fields: dict[str, Any] = {} - for key, value in record.__dict__.items(): - if key not in self.empty_record.__dict__: - extra_fields[key] = value # noqa: PERF403 - - return extra_fields - - @ignore_docs - def format(self: ActorLogFormatter, record: logging.LogRecord) -> str: - """Format the log record nicely. - - This formats the log record so that it: - - starts with the level (colorized, and padded to 5 chars so that it is nicely aligned) - - then has the actual log message, if it's multiline then it's nicely indented - - then has the stringified extra log fields - - then, if an exception is a part of the log record, prints the formatted exception. - """ - logger_name_string = f'{_LOG_NAME_COLOR}[{record.name}]{Style.RESET_ALL} ' - - # Colorize the log level, and shorten it to 6 chars tops - level_color_code = _LOG_LEVEL_COLOR.get(record.levelno, '') - level_short_alias = _LOG_LEVEL_SHORT_ALIAS.get(record.levelno, record.levelname) - level_string = f'{level_color_code}{level_short_alias}{Style.RESET_ALL} ' - - # Format the exception, if there is some - # Basically just print the traceback and indent it a bit - exception_string = '' - if record.exc_info: - exc_info = record.exc_info - record.exc_info = None - exception_string = ''.join(traceback.format_exception(*exc_info)).rstrip() - exception_string = '\n' + textwrap.indent(exception_string, _LOG_MESSAGE_INDENT) - - # Format the extra log record fields, if there were some - # Just stringify them to JSON and color them gray - extra_string = '' - extra = self._get_extra_fields(record) - if extra: - extra_string = f' {Fore.LIGHTBLACK_EX}({json.dumps(extra, ensure_ascii=False, default=str)}){Style.RESET_ALL}' - - # Format the actual log message, and indent everything but the first line - log_string = super().format(record) - log_string = textwrap.indent(log_string, _LOG_MESSAGE_INDENT).lstrip() - - if self.include_logger_name: - # Include logger name at the beginning of the log line - return f'{logger_name_string}{level_string}{log_string}{extra_string}{exception_string}' - - return f'{level_string}{log_string}{extra_string}{exception_string}' diff --git a/src/apify/scrapy/__init__.py b/src/apify/scrapy/__init__.py index 70ee1cfb..a1d065c2 100644 --- a/src/apify/scrapy/__init__.py +++ b/src/apify/scrapy/__init__.py @@ -1,3 +1,11 @@ -from .requests import to_apify_request, to_scrapy_request -from .scheduler import ApifyScheduler -from .utils 
import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client +from apify.scrapy.requests import to_apify_request, to_scrapy_request +from apify.scrapy.scheduler import ApifyScheduler +from apify.scrapy.utils import get_basic_auth_header, get_running_event_loop_id + +__all__ = [ + 'to_apify_request', + 'to_scrapy_request', + 'ApifyScheduler', + 'get_basic_auth_header', + 'get_running_event_loop_id', +] diff --git a/src/apify/scrapy/middlewares/__init__.py b/src/apify/scrapy/middlewares/__init__.py index 257252d5..c1d82a7e 100644 --- a/src/apify/scrapy/middlewares/__init__.py +++ b/src/apify/scrapy/middlewares/__init__.py @@ -1 +1,3 @@ -from .apify_proxy import ApifyHttpProxyMiddleware +from apify.scrapy.middlewares.apify_proxy import ApifyHttpProxyMiddleware + +__all__ = ['ApifyHttpProxyMiddleware'] diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index c2aeca65..3120f972 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -12,8 +12,7 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify.actor import Actor -from apify.proxy_configuration import ProxyConfiguration +from apify import Actor, ProxyConfiguration from apify.scrapy.utils import get_basic_auth_header @@ -43,11 +42,9 @@ def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> Apify """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler. Args: - cls: Class type. crawler: Scrapy Crawler object. - Returns: - ApifyHttpProxyMiddleware: Instance of the class. + Returns: Instance of the class. """ proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS') @@ -74,9 +71,6 @@ async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spid Raises: ValueError: If username and password are not provided in the proxy URL. - - Returns: - None: The request is processed and middleware pipeline can continue. """ Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}') url = await self._get_new_proxy_url() @@ -123,8 +117,7 @@ async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult: Raises: NotConfigured: If creation of the proxy configuration fails. - Returns: - ParseResult: New proxy URL. + Returns: New proxy URL. """ # Get proxy configuration, creating it if necessary proxy_cfg = ( @@ -144,4 +137,4 @@ async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult: # Get a new proxy URL and return it new_url = await proxy_cfg.new_url() - return urlparse(new_url) + return urlparse(str(new_url)) diff --git a/src/apify/scrapy/pipelines/__init__.py b/src/apify/scrapy/pipelines/__init__.py index fa2c95eb..7a94b771 100644 --- a/src/apify/scrapy/pipelines/__init__.py +++ b/src/apify/scrapy/pipelines/__init__.py @@ -1 +1,3 @@ -from .actor_dataset_push import ActorDatasetPushPipeline +from apify.scrapy.pipelines.actor_dataset_push import ActorDatasetPushPipeline + +__all__ = ['ActorDatasetPushPipeline'] diff --git a/src/apify/scrapy/pipelines/actor_dataset_push.py b/src/apify/scrapy/pipelines/actor_dataset_push.py index e75262da..8f371788 100644 --- a/src/apify/scrapy/pipelines/actor_dataset_push.py +++ b/src/apify/scrapy/pipelines/actor_dataset_push.py @@ -9,7 +9,7 @@ 'To use this module, you need to install the "scrapy" extra. 
Run "pip install apify[scrapy]".', ) from exc -from apify.actor import Actor +from apify import Actor class ActorDatasetPushPipeline: diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 688c2bc4..6d2fd348 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -2,6 +2,7 @@ import codecs import pickle +from typing import Any, cast try: from scrapy import Request, Spider @@ -12,9 +13,11 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify._crypto import crypto_random_object_id -from apify._utils import compute_unique_key -from apify.actor import Actor +from crawlee import Request as CrawleeRequest +from crawlee._utils.crypto import crypto_random_object_id +from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id + +from apify import Actor def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: @@ -25,7 +28,7 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool: return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times')) -def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: +def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest | None: """Convert a Scrapy request to an Apify request. Args: @@ -35,7 +38,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: Returns: The converted Apify request if the conversion was successful, otherwise None. """ - if not isinstance(scrapy_request, Request): + if not isinstance(cast(Any, scrapy_request), Request): Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.') return None @@ -43,39 +46,39 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...') try: - apify_request = { - 'url': scrapy_request.url, - 'method': scrapy_request.method, - 'payload': scrapy_request.body, - 'userData': scrapy_request.meta.get('userData', {}), - } - - # Convert Scrapy's headers to a dictionary and store them in the apify_request - if isinstance(scrapy_request.headers, Headers): - apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict()) - else: - Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}') - - # If the request was produced by the middleware (e.g. 
retry or redirect), we must compute the unique key here if _is_request_produced_by_middleware(scrapy_request): - apify_request['uniqueKey'] = compute_unique_key( + unique_key = compute_unique_key( url=scrapy_request.url, method=scrapy_request.method, payload=scrapy_request.body, use_extended_unique_key=True, ) - # Othwerwise, we can use the unique key (also the id) from the meta + elif scrapy_request.dont_filter: + unique_key = crypto_random_object_id(8) + elif scrapy_request.meta.get('apify_request_unique_key'): + unique_key = scrapy_request.meta['apify_request_unique_key'] else: - if scrapy_request.meta.get('apify_request_id'): - apify_request['id'] = scrapy_request.meta['apify_request_id'] + unique_key = crypto_random_object_id(8) - if scrapy_request.meta.get('apify_request_unique_key'): - apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key'] + if scrapy_request.meta.get('apify_request_id'): # noqa: SIM108 + request_id = scrapy_request.meta['apify_request_id'] + else: + request_id = unique_key_to_request_id(unique_key) + + apify_request = CrawleeRequest( + url=scrapy_request.url, + method=scrapy_request.method, + payload=scrapy_request.body, + user_data=scrapy_request.meta.get('userData', {}), + unique_key=unique_key, + id=request_id, + ) - # If the request's dont_filter field is set, we must generate a random `uniqueKey` to avoid deduplication - # of the request in the Request Queue. - if scrapy_request.dont_filter: - apify_request['uniqueKey'] = crypto_random_object_id(8) + # Convert Scrapy's headers to a dictionary and store them in the apify_request + if isinstance(scrapy_request.headers, Headers): + apify_request.headers = dict(scrapy_request.headers.to_unicode_dict()) + else: + Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}') # Serialize the Scrapy Request and store it in the apify_request. # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64, @@ -83,7 +86,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/. scrapy_request_dict = scrapy_request.to_dict(spider=spider) scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode() - apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded + apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded except Exception as exc: Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}') @@ -93,7 +96,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None: return apify_request -def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: +def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: """Convert an Apify request to a Scrapy request. Args: @@ -101,32 +104,26 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: spider: The Scrapy spider that the request is associated with. Raises: - TypeError: If the apify_request is not a dictionary. + TypeError: If the apify_request is not a crawlee request. ValueError: If the apify_request does not contain the required keys. Returns: The converted Scrapy request. 
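As a usage illustration of the two converters, here is a hedged round-trip sketch, assuming the `scrapy` extra (and crawlee) are installed. `DummySpider`, the example URL, and the `userData` payload are made up, and the exact round-trip details depend on library internals, so treat this as a sketch rather than a guaranteed recipe.

```python
# A hedged usage sketch of the conversion helpers: Scrapy request -> crawlee/Apify
# request -> Scrapy request. The spider and data below are dummies.
from scrapy import Request, Spider

from apify.scrapy import to_apify_request, to_scrapy_request


class DummySpider(Spider):
    name = 'dummy'


def main() -> None:
    spider = DummySpider()
    scrapy_request = Request(url='https://example.com', meta={'userData': {'label': 'DETAIL'}})

    # Scrapy -> Apify: the original request is pickled into user_data['scrapy_request'].
    apify_request = to_apify_request(scrapy_request, spider=spider)
    assert apify_request is not None

    # Apify -> Scrapy: the pickled request is restored, including its meta.
    restored = to_scrapy_request(apify_request, spider=spider)
    assert restored.url == scrapy_request.url


if __name__ == '__main__':
    main()
```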
""" - if not isinstance(apify_request, dict): - raise TypeError('apify_request must be a dictionary') - - required_keys = ['url', 'method', 'id', 'uniqueKey'] - missing_keys = [key for key in required_keys if key not in apify_request] - - if missing_keys: - raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)') + if not isinstance(cast(Any, apify_request), CrawleeRequest): + raise TypeError('apify_request must be a crawlee.Request instance') call_id = crypto_random_object_id(8) Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...') # If the apify_request comes from the Scrapy - if 'userData' in apify_request and 'scrapy_request' in apify_request['userData']: + if 'scrapy_request' in apify_request.user_data: # Deserialize the Scrapy Request from the apify_request. # - This process involves decoding the base64-encoded request data and reconstructing # the Scrapy Request object from its dictionary representation. Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...') - scrapy_request_dict_encoded = apify_request['userData']['scrapy_request'] + scrapy_request_dict_encoded = apify_request.user_data['scrapy_request'] if not isinstance(scrapy_request_dict_encoded, str): raise TypeError('scrapy_request_dict_encoded must be a string') @@ -142,7 +139,7 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: # Update the meta field with the meta field from the apify_request meta = scrapy_request.meta or {} - meta.update({'apify_request_id': apify_request['id'], 'apify_request_unique_key': apify_request['uniqueKey']}) + meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key}) scrapy_request._meta = meta # scrapy_request.meta is a property, so we have to set it like this # If the apify_request comes directly from the Request Queue, typically start URLs @@ -150,26 +147,26 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request: Actor.log.debug(f'[{call_id}]: gonna create a new Scrapy Request (cannot be restored)') scrapy_request = Request( - url=apify_request['url'], - method=apify_request['method'], + url=apify_request.url, + method=apify_request.method, meta={ - 'apify_request_id': apify_request['id'], - 'apify_request_unique_key': apify_request['uniqueKey'], + 'apify_request_id': apify_request.id, + 'apify_request_unique_key': apify_request.unique_key, }, ) # Add optional 'headers' field - if 'headers' in apify_request: - if isinstance(apify_request['headers'], dict): - scrapy_request.headers = Headers(apify_request['headers']) + if apify_request.headers: + if isinstance(cast(Any, apify_request.headers), dict): + scrapy_request.headers = Headers(apify_request.headers) else: Actor.log.warning( - f'apify_request[headers] is not an instance of the dict class, apify_request[headers] = {apify_request["headers"]}', + f'apify_request[headers] is not an instance of the dict class, apify_request[headers] = {apify_request.headers}', ) # Add optional 'userData' field - if 'userData' in apify_request: - scrapy_request.meta['userData'] = apify_request['userData'] + if apify_request.user_data: + scrapy_request.meta['userData'] = apify_request.user_data Actor.log.debug(f'[{call_id}]: an apify_request was converted to the scrapy_request={scrapy_request}') return scrapy_request diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 4b280d8a..db8f6ad0 100644 --- a/src/apify/scrapy/scheduler.py +++ 
b/src/apify/scrapy/scheduler.py @@ -2,6 +2,9 @@ import traceback +from apify._configuration import Configuration +from apify.apify_storage_client import ApifyStorageClient + try: from scrapy import Spider from scrapy.core.scheduler import BaseScheduler @@ -12,10 +15,11 @@ 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', ) from exc -from apify._crypto import crypto_random_object_id -from apify.actor import Actor +from crawlee._utils.crypto import crypto_random_object_id + +from apify import Actor from apify.scrapy.requests import to_apify_request, to_scrapy_request -from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client +from apify.scrapy.utils import nested_event_loop from apify.storages import RequestQueue @@ -44,8 +48,12 @@ def open(self: ApifyScheduler, spider: Spider) -> None: # this has to be named """ self.spider = spider + async def open_queue() -> RequestQueue: + custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration()) + return await RequestQueue.open(storage_client=custom_loop_apify_client) + try: - self._rq = nested_event_loop.run_until_complete(open_queue_with_custom_client()) + self._rq = nested_event_loop.run_until_complete(open_queue()) except BaseException: traceback.print_exc() raise @@ -95,18 +103,13 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool: raise TypeError('self._rq must be an instance of the RequestQueue class') try: - result = nested_event_loop.run_until_complete( - self._rq.add_request( - apify_request, - use_extended_unique_key=True, - ) - ) + result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request)) except BaseException: traceback.print_exc() raise Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...') - return bool(result['wasAlreadyPresent']) + return bool(result.was_already_present) def next_request(self: ApifyScheduler) -> Request | None: """Fetch the next request from the scheduler. diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index 405e59a3..dbd43a2b 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -10,11 +10,9 @@ from scrapy.utils.python import to_bytes except ImportError as exc: raise ImportError( - 'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', + 'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run "pip install apify[scrapy]".', ) from exc -from apify.actor import Actor -from apify.storages import RequestQueue, StorageClientManager nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop() @@ -71,31 +69,3 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict settings['APIFY_PROXY_SETTINGS'] = proxy_config return settings - - -async def open_queue_with_custom_client() -> RequestQueue: - """Open a Request Queue with custom Apify Client. 
- - TODO: add support for custom client to Actor.open_request_queue(), so that - we don't have to do this hacky workaround - """ - # Create a new Apify Client with its httpx client in the custom event loop - custom_loop_apify_client = Actor.new_client() - - # Set the new Apify Client as the default client, back up the old client - old_client = Actor.apify_client - StorageClientManager.set_cloud_client(custom_loop_apify_client) - - # Create a new Request Queue in the custom event loop, - # replace its Apify client with the custom loop's Apify client - rq = await Actor.open_request_queue() - - if Actor.config.is_at_home: - rq._request_queue_client = custom_loop_apify_client.request_queue( - rq._id, - client_key=rq._client_key, - ) - - # Restore the old Apify Client as the default client - StorageClientManager.set_cloud_client(old_client) - return rq diff --git a/src/apify/storages/__init__.py b/src/apify/storages/__init__.py index e954ef20..2ed85e84 100644 --- a/src/apify/storages/__init__.py +++ b/src/apify/storages/__init__.py @@ -1,11 +1,3 @@ -from .dataset import Dataset -from .key_value_store import KeyValueStore -from .request_queue import RequestQueue -from .storage_client_manager import StorageClientManager +from crawlee.storages import Dataset, KeyValueStore, RequestQueue -__all__ = [ - 'Dataset', - 'KeyValueStore', - 'RequestQueue', - 'StorageClientManager', -] +__all__ = ['Dataset', 'KeyValueStore', 'RequestQueue'] diff --git a/src/apify/storages/base_storage.py b/src/apify/storages/base_storage.py deleted file mode 100644 index 54697511..00000000 --- a/src/apify/storages/base_storage.py +++ /dev/null @@ -1,181 +0,0 @@ -from __future__ import annotations - -import asyncio -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar, cast - -from apify_shared.utils import ignore_docs - -from apify._memory_storage import MemoryStorageClient -from apify._memory_storage.resource_clients import BaseResourceClient, BaseResourceCollectionClient -from apify.config import Configuration -from apify.storages.storage_client_manager import StorageClientManager - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - -BaseResourceClientType = TypeVar('BaseResourceClientType', bound=BaseResourceClient) -BaseResourceCollectionClientType = TypeVar('BaseResourceCollectionClientType', bound=BaseResourceCollectionClient) - - -@ignore_docs -class BaseStorage(ABC, Generic[BaseResourceClientType, BaseResourceCollectionClientType]): - """A class for managing storages.""" - - _id: str - _name: str | None - _storage_client: ApifyClientAsync | MemoryStorageClient - _config: Configuration - - _cache_by_id: dict | None = None - _cache_by_name: dict | None = None - _storage_creating_lock: asyncio.Lock | None = None - - def __init__( - self: BaseStorage, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Initialize the storage. - - Do not use this method directly, but use `Actor.open_()` instead. 
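Since `apify.storages` now simply re-exports the crawlee storage classes, day-to-day code keeps opening them through the `Actor` helpers rather than constructing them directly, which is also what the removed `BaseStorage` docstring above advises. A hedged sketch follows; the storage names and the example data are placeholders, and `Request.from_url` is the crawlee constructor assumed to be available here.

```python
# A sketch of opening storages via the Actor helpers; names and data are placeholders.
import asyncio

from crawlee import Request

from apify import Actor


async def main() -> None:
    async with Actor:
        # Named storages are created on first use; omitting the name opens the run's defaults.
        results = await Actor.open_dataset(name='my-results')
        kvs = await Actor.open_key_value_store()
        queue = await Actor.open_request_queue(name='my-queue')

        await results.push_data({'url': 'https://example.com', 'title': 'Example'})
        await kvs.set_value('LAST_RUN', {'ok': True})
        await queue.add_request(Request.from_url('https://example.com'))


if __name__ == '__main__':
    asyncio.run(main())
```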
- - Args: - id (str): The storage id - name (str, optional): The storage name - client (ApifyClientAsync or MemoryStorageClient): The storage client - config (Configuration): The configuration - """ - self._id = id - self._name = name - self._storage_client = client - self._config = config - - @classmethod - @abstractmethod - def _get_human_friendly_label(cls: type[BaseStorage]) -> str: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_default_id(cls: type[BaseStorage], config: Configuration) -> str: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_single_storage_client( - cls: type[BaseStorage], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> BaseResourceClientType: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - @abstractmethod - def _get_storage_collection_client( - cls: type[BaseStorage], - client: ApifyClientAsync | MemoryStorageClient, - ) -> BaseResourceCollectionClientType: - raise NotImplementedError('You must override this method in the subclass!') - - @classmethod - def _ensure_class_initialized(cls: type[BaseStorage]) -> None: - if cls._cache_by_id is None: - cls._cache_by_id = {} - if cls._cache_by_name is None: - cls._cache_by_name = {} - if cls._storage_creating_lock is None: - cls._storage_creating_lock = asyncio.Lock() - - @classmethod - @abstractmethod - async def open( - cls: type[BaseStorage], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> BaseStorage: - """Open a storage, or return a cached storage object if it was opened before. - - Opens a storage with the given ID or name. - Returns the cached storage object if the storage was opened before. - - Args: - id (str, optional): ID of the storage to be opened. - If neither `id` nor `name` are provided, the method returns the default storage associated with the actor run. - If the storage with the given ID does not exist, it raises an error. - name (str, optional): Name of the storage to be opened. - If neither `id` nor `name` are provided, the method returns the default storage associated with the actor run. - If the storage with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a storage on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - An instance of the storage. 
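The `open()` contract documented here boils down to a cache-then-create pattern: return a cached instance when one exists, otherwise create the storage under a lock and cache it. Below is a deliberately simplified, standalone sketch of that idea; the `Storage` stand-in and the double-check inside the lock are additions for illustration, not part of the removed implementation.

```python
# A simplified sketch of the open-or-create-with-cache pattern: cached instances
# are reused, and a lock guards concurrent creation. Storage is a stand-in class.
from __future__ import annotations

import asyncio
from dataclasses import dataclass


@dataclass
class Storage:
    id: str
    name: str | None


_cache_by_name: dict[str, Storage] = {}
_creation_lock = asyncio.Lock()


async def open_storage(name: str) -> Storage:
    # Return the cached instance if this storage was opened before.
    if name in _cache_by_name:
        return _cache_by_name[name]

    async with _creation_lock:
        # Re-check inside the lock in case another coroutine created it meanwhile.
        if name in _cache_by_name:
            return _cache_by_name[name]
        storage = Storage(id=f'id-of-{name}', name=name)  # stand-in for a get_or_create API call
        _cache_by_name[name] = storage
        return storage


async def main() -> None:
    first, second = await asyncio.gather(open_storage('results'), open_storage('results'))
    assert first is second  # both callers get the same cached instance


if __name__ == '__main__':
    asyncio.run(main())
```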
- """ - cls._ensure_class_initialized() - assert cls._cache_by_id is not None # noqa: S101 - assert cls._cache_by_name is not None # noqa: S101 - assert not (id and name) # noqa: S101 - - used_config = config or Configuration.get_global_configuration() - used_client = StorageClientManager.get_storage_client(force_cloud=force_cloud) - - is_default_storage_on_local = False - # Fetch default ID if no ID or name was passed - if not id and not name: - if isinstance(used_client, MemoryStorageClient): - is_default_storage_on_local = True - id = cls._get_default_id(used_config) # noqa: A001 - - # Try to get the storage instance from cache - cached_storage = None - if id: - cached_storage = cls._cache_by_id.get(id) - elif name: - cached_storage = cls._cache_by_name.get(name) - - if cached_storage is not None: - # This cast is needed since MyPy doesn't understand very well that Self and Storage are the same - return cast(BaseStorage, cached_storage) - - # Purge default storages if configured - if used_config.purge_on_start and isinstance(used_client, MemoryStorageClient): - await used_client._purge_on_start() - - assert cls._storage_creating_lock is not None # noqa: S101 - async with cls._storage_creating_lock: - # Create the storage - if id and not is_default_storage_on_local: - single_storage_client = cls._get_single_storage_client(id, used_client) - storage_info = await single_storage_client.get() - if not storage_info: - storage_label = cls._get_human_friendly_label() - raise RuntimeError(f'{storage_label} with id "{id}" does not exist!') - elif is_default_storage_on_local: - storage_collection_client = cls._get_storage_collection_client(used_client) - storage_info = await storage_collection_client.get_or_create(name=name, _id=id) - else: - storage_collection_client = cls._get_storage_collection_client(used_client) - storage_info = await storage_collection_client.get_or_create(name=name) - - storage = cls(storage_info['id'], storage_info.get('name'), used_client, used_config) - - # Cache by id and name - cls._cache_by_id[storage._id] = storage - if storage._name is not None: - cls._cache_by_name[storage._name] = storage - - return storage - - def _remove_from_cache(self: BaseStorage) -> None: - if self.__class__._cache_by_id is not None: - del self.__class__._cache_by_id[self._id] - - if self._name and self.__class__._cache_by_name is not None: - del self.__class__._cache_by_name[self._name] diff --git a/src/apify/storages/dataset.py b/src/apify/storages/dataset.py deleted file mode 100644 index ce4429c7..00000000 --- a/src/apify/storages/dataset.py +++ /dev/null @@ -1,494 +0,0 @@ -from __future__ import annotations - -import csv -import io -import math -from typing import TYPE_CHECKING, AsyncIterator, Iterable, Iterator - -from apify_shared.utils import ignore_docs, json_dumps - -from apify._utils import wrap_internal -from apify.consts import MAX_PAYLOAD_SIZE_BYTES -from apify.storages.base_storage import BaseStorage -from apify.storages.key_value_store import KeyValueStore - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - from apify_client.clients import DatasetClientAsync, DatasetCollectionClientAsync - from apify_shared.models import ListPage - from apify_shared.types import JSONSerializable - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import DatasetClient, DatasetCollectionClient - from apify.config import Configuration - -# 0.01% -SAFETY_BUFFER_PERCENT = 0.01 / 100 -EFFECTIVE_LIMIT_BYTES = MAX_PAYLOAD_SIZE_BYTES - 
math.ceil(MAX_PAYLOAD_SIZE_BYTES * SAFETY_BUFFER_PERCENT) - - -def _check_and_serialize(item: JSONSerializable, index: int | None = None) -> str: - """Accept a JSON serializable object as an input, validate its serializability and its serialized size against `EFFECTIVE_LIMIT_BYTES`.""" - s = ' ' if index is None else f' at index {index} ' - - try: - payload = json_dumps(item) - except Exception as exc: - raise ValueError(f'Data item{s}is not serializable to JSON.') from exc - - length_bytes = len(payload.encode('utf-8')) - if length_bytes > EFFECTIVE_LIMIT_BYTES: - raise ValueError(f'Data item{s}is too large (size: {length_bytes} bytes, limit: {EFFECTIVE_LIMIT_BYTES} bytes)') - - return payload - - -def _chunk_by_size(items: Iterable[str]) -> Iterator[str]: - """Take an array of JSONs, produce iterator of chunked JSON arrays respecting `EFFECTIVE_LIMIT_BYTES`. - - Takes an array of JSONs (payloads) as input and produces an iterator of JSON strings - where each string is a JSON array of payloads with a maximum size of `EFFECTIVE_LIMIT_BYTES` per one - JSON array. Fits as many payloads as possible into a single JSON array and then moves - on to the next, preserving item order. - - The function assumes that none of the items is larger than `EFFECTIVE_LIMIT_BYTES` and does not validate. - """ - last_chunk_bytes = 2 # Add 2 bytes for [] wrapper. - current_chunk = [] - - for payload in items: - length_bytes = len(payload.encode('utf-8')) - - if last_chunk_bytes + length_bytes <= EFFECTIVE_LIMIT_BYTES: - current_chunk.append(payload) - last_chunk_bytes += length_bytes + 1 # Add 1 byte for ',' separator. - else: - yield f'[{",".join(current_chunk)}]' - current_chunk = [payload] - last_chunk_bytes = length_bytes + 2 # Add 2 bytes for [] wrapper. - - yield f'[{",".join(current_chunk)}]' - - -class Dataset(BaseStorage): - """The `Dataset` class represents a store for structured data where each object stored has the same attributes. - - You can imagine it as a table, where each object is a row and its attributes are columns. - Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records. - Typically it is used to store crawling results. - - Do not instantiate this class directly, use the `Actor.open_dataset()` function instead. - - `Dataset` stores its data either on local disk or in the Apify cloud, - depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. - - If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in - the local directory in the following files: - ``` - {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json - ``` - Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`, - unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable. - Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset. - - If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the - [Apify Dataset](https://docs.apify.com/storage/dataset) cloud storage. - """ - - _id: str - _name: str | None - _dataset_client: DatasetClientAsync | DatasetClient - - @ignore_docs - def __init__( - self: Dataset, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Create a `Dataset` instance. 
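For illustration, here is a minimal, parameterized sketch of the size-based chunking that the removed `_chunk_by_size` helper implements; `limit_bytes` is an illustrative argument standing in for the module-level `EFFECTIVE_LIMIT_BYTES` constant, and the example payloads are made up.

```python
from typing import Iterable, Iterator


def chunk_by_size(payloads: Iterable[str], limit_bytes: int) -> Iterator[str]:
    """Yield JSON array strings, each at most `limit_bytes` bytes, preserving payload order."""
    chunk: list[str] = []
    chunk_bytes = 2  # account for the surrounding '[' and ']'

    for payload in payloads:
        payload_bytes = len(payload.encode('utf-8'))
        if chunk_bytes + payload_bytes <= limit_bytes:
            chunk.append(payload)
            chunk_bytes += payload_bytes + 1  # +1 for the ',' separator
        else:
            yield f'[{",".join(chunk)}]'
            chunk = [payload]
            chunk_bytes = payload_bytes + 2

    yield f'[{",".join(chunk)}]'


# chunk_by_size(['{"a":1}', '{"b":2}', '{"c":3}'], limit_bytes=20)
# yields '[{"a":1},{"b":2}]' followed by '[{"c":3}]'
```

Like the original, this sketch assumes no single payload exceeds the limit; oversized items are rejected separately before chunking.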
- - Do not use the constructor directly, use the `Actor.open_dataset()` function instead. - - Args: - id (str): ID of the dataset. - name (str, optional): Name of the dataset. - client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used. - config (Configuration): The configuration which should be used. - """ - super().__init__(id=id, name=name, client=client, config=config) - - self.get_data = wrap_internal(self._get_data_internal, self.get_data) # type: ignore - self.push_data = wrap_internal(self._push_data_internal, self.push_data) # type: ignore - self.export_to_json = wrap_internal(self._export_to_json_internal, self.export_to_json) # type: ignore - self.export_to_csv = wrap_internal(self._export_to_csv_internal, self.export_to_csv) # type: ignore - - self._dataset_client = client.dataset(self._id) - - @classmethod - def _get_human_friendly_label(cls: type[Dataset]) -> str: - return 'Dataset' - - @classmethod - def _get_default_id(cls: type[Dataset], config: Configuration) -> str: - return config.default_dataset_id - - @classmethod - def _get_single_storage_client( - cls: type[Dataset], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> DatasetClientAsync | DatasetClient: - return client.dataset(id) - - @classmethod - def _get_storage_collection_client( - cls: type[Dataset], - client: ApifyClientAsync | MemoryStorageClient, - ) -> DatasetCollectionClientAsync | DatasetCollectionClient: - return client.datasets() - - @classmethod - async def push_data(cls: type[Dataset], data: JSONSerializable) -> None: - """Store an object or an array of objects to the dataset. - - The size of the data is limited by the receiving API and therefore `push_data()` will only - allow objects whose JSON representation is smaller than 9MB. When an array is passed, - none of the included objects may be larger than 9MB, but the array itself may be of any size. - - Args: - data (JSONSerializable): dict or array of dicts containing data to be stored in the default dataset. - The JSON representation of each item must be smaller than 9MB. - """ - dataset = await cls.open() - return await dataset.push_data(data) - - async def _push_data_internal(self: Dataset, data: JSONSerializable) -> None: - # Handle singular items - if not isinstance(data, list): - payload = _check_and_serialize(data) - return await self._dataset_client.push_items(payload) - - # Handle lists - payloads_generator = (_check_and_serialize(item, index) for index, item in enumerate(data)) - - # Invoke client in series to preserve the order of data - for chunk in _chunk_by_size(payloads_generator): - await self._dataset_client.push_items(chunk) - return None - - @classmethod - async def get_data( - cls: type[Dataset], - *, - offset: int | None = None, - limit: int | None = None, - clean: bool | None = None, - desc: bool | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool | None = None, - skip_hidden: bool | None = None, - flatten: list[str] | None = None, - view: str | None = None, - ) -> ListPage: - """Get items from the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. 
- clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. - Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. - Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - flatten (list of str, optional): A list of fields that should be flattened - view (str, optional): Name of the dataset view to be used - - Returns: - ListPage: A page of the list of dataset items according to the specified filters. - """ - dataset = await cls.open() - return await dataset.get_data( - offset=offset, - limit=limit, - desc=desc, - clean=clean, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - flatten=flatten, - view=view, - ) - - async def _get_data_internal( - self: Dataset, - *, - offset: int | None = None, - limit: int | None = None, - clean: bool | None = None, - desc: bool | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool | None = None, - skip_hidden: bool | None = None, - flatten: list[str] | None = None, - view: str | None = None, - ) -> ListPage: - # TODO: Improve error handling here - # https://github.com/apify/apify-sdk-python/issues/140 - return await self._dataset_client.list_items( - offset=offset, - limit=limit, - desc=desc, - clean=clean, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - flatten=flatten, - view=view, - ) - - async def export_to( - self: Dataset, - key: str, - *, - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - content_type: str | None = None, - ) -> None: - """Save the entirety of the dataset's contents into one file within a key-value store. - - Args: - key (str): The key to save the data under. - to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved. - to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved. - You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments. - If you omit both, it uses the default key-value store. 
- content_type (str, optional): Either 'text/csv' or 'application/json'. Defaults to JSON. - """ - key_value_store = await KeyValueStore.open(id=to_key_value_store_id, name=to_key_value_store_name) - items: list[dict] = [] - limit = 1000 - offset = 0 - while True: - list_items = await self._dataset_client.list_items(limit=limit, offset=offset) - items.extend(list_items.items) - if list_items.total <= offset + list_items.count: - break - offset += list_items.count - - if len(items) == 0: - raise ValueError('Cannot export an empty dataset') - - if content_type == 'text/csv': - output = io.StringIO() - writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL) - writer.writerows([items[0].keys(), *[item.values() for item in items]]) - value = output.getvalue() - return await key_value_store.set_value(key, value, content_type) - - if content_type == 'application/json': - return await key_value_store.set_value(key, items) - - raise ValueError(f'Unsupported content type: {content_type}') - - @classmethod - async def export_to_json( - cls: type[Dataset], - key: str, - *, - from_dataset_id: str | None = None, - from_dataset_name: str | None = None, - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - """Save the entirety of the dataset's contents into one JSON file within a key-value store. - - Args: - key (str): The key to save the data under. - from_dataset_id (str, optional): The ID of the dataset in case of calling the class method. Uses default dataset if omitted. - from_dataset_name (str, optional): The name of the dataset in case of calling the class method. Uses default dataset if omitted. - You must specify only one of `from_dataset_id` and `from_dataset_name` arguments. - If you omit both, it uses the default dataset. - to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved. - to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved. - You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments. - If you omit both, it uses the default key-value store. - """ - dataset = await cls.open(id=from_dataset_id, name=from_dataset_name) - await dataset.export_to_json(key, to_key_value_store_id=to_key_value_store_id, to_key_value_store_name=to_key_value_store_name) - - async def _export_to_json_internal( - self: Dataset, - key: str, - *, - from_dataset_id: str | None = None, # noqa: ARG002 - from_dataset_name: str | None = None, # noqa: ARG002 - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - await self.export_to( - key, - to_key_value_store_id=to_key_value_store_id, - to_key_value_store_name=to_key_value_store_name, - content_type='application/json', - ) - - @classmethod - async def export_to_csv( - cls: type[Dataset], - key: str, - *, - from_dataset_id: str | None = None, - from_dataset_name: str | None = None, - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - """Save the entirety of the dataset's contents into one CSV file within a key-value store. - - Args: - key (str): The key to save the data under. - from_dataset_id (str, optional): The ID of the dataset in case of calling the class method. Uses default dataset if omitted. - from_dataset_name (str, optional): The name of the dataset in case of calling the class method. Uses default dataset if omitted. 
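As a usage illustration of the export helpers above (this is the v1 interface being removed in this diff; the pushed item and the store/key names are placeholders):

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        dataset = await Actor.open_dataset()
        await dataset.push_data([{'url': 'https://example.com', 'title': 'Example'}])

        # Each export writes one file into a key-value store; the default store is used
        # unless a target store is specified.
        await dataset.export_to_json('OUTPUT.json')
        await dataset.export_to_csv('OUTPUT.csv', to_key_value_store_name='my-exports')
```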
- You must specify only one of `from_dataset_id` and `from_dataset_name` arguments. - If you omit both, it uses the default dataset. - to_key_value_store_id (str, optional): The id of the key-value store in which the result will be saved. - to_key_value_store_name (str, optional): The name of the key-value store in which the result will be saved. - You must specify only one of `to_key_value_store_id` and `to_key_value_store_name` arguments. - If you omit both, it uses the default key-value store. - """ - dataset = await cls.open(id=from_dataset_id, name=from_dataset_name) - await dataset.export_to_csv(key, to_key_value_store_id=to_key_value_store_id, to_key_value_store_name=to_key_value_store_name) - - async def _export_to_csv_internal( - self: Dataset, - key: str, - *, - from_dataset_id: str | None = None, # noqa: ARG002 - from_dataset_name: str | None = None, # noqa: ARG002 - to_key_value_store_id: str | None = None, - to_key_value_store_name: str | None = None, - ) -> None: - await self.export_to( - key, - to_key_value_store_id=to_key_value_store_id, - to_key_value_store_name=to_key_value_store_name, - content_type='text/csv', - ) - - async def get_info(self: Dataset) -> dict | None: - """Get an object containing general information about the dataset. - - Returns: - dict: Object returned by calling the GET dataset API endpoint. - """ - return await self._dataset_client.get() - - def iterate_items( - self: Dataset, - *, - offset: int = 0, - limit: int | None = None, - clean: bool | None = None, - desc: bool | None = None, - fields: list[str] | None = None, - omit: list[str] | None = None, - unwind: str | None = None, - skip_empty: bool | None = None, - skip_hidden: bool | None = None, - ) -> AsyncIterator[dict]: - """Iterate over the items in the dataset. - - Args: - offset (int, optional): Number of items that should be skipped at the start. The default value is 0 - limit (int, optional): Maximum number of items to return. By default there is no limit. - desc (bool, optional): By default, results are returned in the same order as they were stored. - To reverse the order, set this parameter to True. - clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). - The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. - Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. - fields (list of str, optional): A list of fields which should be picked from the items, - only these fields will remain in the resulting record objects. - Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. - You can use this feature to effectively fix the output format. - omit (list of str, optional): A list of fields which should be omitted from the items. - unwind (str, optional): Name of a field which should be unwound. - If the field is an array then every element of the array will become a separate record and merged with parent object. - If the unwound field is an object then it is merged with the parent object. - If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, - then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. - skip_empty (bool, optional): If True, then empty items are skipped from the output. 
- Note that if used, the results might contain less items than the limit value. - skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. - - Yields: - dict: An item from the dataset - """ - return self._dataset_client.iterate_items( - offset=offset, - limit=limit, - clean=clean, - desc=desc, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, - ) - - async def drop(self: Dataset) -> None: - """Remove the dataset either from the Apify cloud storage or from the local directory.""" - await self._dataset_client.delete() - self._remove_from_cache() - - @classmethod - async def open( - cls: type[Dataset], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> Dataset: - """Open a dataset. - - Datasets are used to store structured data where each object stored has the same attributes, - such as online store products or real estate offers. - The actual data is stored either on the local filesystem or in the Apify cloud. - - Args: - id (str, optional): ID of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. - If the dataset with the given ID does not exist, it raises an error. - name (str, optional): Name of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the actor run. - If the dataset with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a dataset on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - Dataset: An instance of the `Dataset` class for the given ID or name. - """ - return await super().open(id=id, name=name, force_cloud=force_cloud, config=config) # type: ignore diff --git a/src/apify/storages/key_value_store.py b/src/apify/storages/key_value_store.py deleted file mode 100644 index 71d843ae..00000000 --- a/src/apify/storages/key_value_store.py +++ /dev/null @@ -1,257 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, AsyncIterator, NamedTuple, TypedDict, TypeVar, overload - -from apify_client.clients import KeyValueStoreClientAsync, KeyValueStoreCollectionClientAsync -from apify_shared.utils import ignore_docs - -from apify._utils import wrap_internal -from apify.storages.base_storage import BaseStorage - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import KeyValueStoreClient, KeyValueStoreCollectionClient - from apify.config import Configuration - - -T = TypeVar('T') - - -class IterateKeysInfo(TypedDict): - """Contains information about a key-value store record.""" - - size: int - - -class IterateKeysTuple(NamedTuple): - """A tuple representing a key-value store record.""" - - key: str - info: IterateKeysInfo - - -class KeyValueStore(BaseStorage): - """The `KeyValueStore` class represents a key-value store. - - You can imagine it as a simple data storage that is used - for saving and reading data records or files. Each data record is - represented by a unique key and associated with a MIME content type. 
- - Do not instantiate this class directly, use the `Actor.open_key_value_store()` function instead. - - Each crawler run is associated with a default key-value store, which is created exclusively - for the run. By convention, the crawler input and output are stored into the - default key-value store under the `INPUT` and `OUTPUT` key, respectively. - Typically, input and output are JSON files, although it can be any other format. - To access the default key-value store directly, you can use the - `KeyValueStore.get_value` and `KeyValueStore.set_value` convenience functions. - - `KeyValueStore` stores its data either on local disk or in the Apify cloud, - depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. - - If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in - the local directory in the following files: - ``` - {APIFY_LOCAL_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT} - ``` - Note that `{STORE_ID}` is the name or ID of the key-value store. The default key-value store has ID: `default`, - unless you override it by setting the `APIFY_DEFAULT_KEY_VALUE_STORE_ID` environment variable. - The `{KEY}` is the key of the record and `{EXT}` corresponds to the MIME content type of the data value. - - If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the - [Apify Key-value store](https://docs.apify.com/storage/key-value-store) cloud storage. - """ - - _id: str - _name: str | None - _key_value_store_client: KeyValueStoreClientAsync | KeyValueStoreClient - - @ignore_docs - def __init__( - self: KeyValueStore, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Create a `KeyValueStore` instance. - - Do not use the constructor directly, use the `Actor.open_key_value_store()` function instead. - - Args: - id (str): ID of the key-value store. - name (str, optional): Name of the key-value store. - client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used. - config (Configuration): The configuration which should be used. - """ - super().__init__(id=id, name=name, client=client, config=config) - - self.get_value = wrap_internal(self._get_value_internal, self.get_value) # type: ignore - self.set_value = wrap_internal(self._set_value_internal, self.set_value) # type: ignore - self.get_public_url = wrap_internal(self._get_public_url_internal, self.get_public_url) # type: ignore - self._id = id - self._name = name - self._key_value_store_client = client.key_value_store(self._id) - - @classmethod - def _get_human_friendly_label(cls: type[KeyValueStore]) -> str: - return 'Key-value store' - - @classmethod - def _get_default_id(cls: type[KeyValueStore], config: Configuration) -> str: - return config.default_key_value_store_id - - @classmethod - def _get_single_storage_client( - cls: type[KeyValueStore], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> KeyValueStoreClientAsync | KeyValueStoreClient: - return client.key_value_store(id) - - @classmethod - def _get_storage_collection_client( - cls: type[KeyValueStore], - client: ApifyClientAsync | MemoryStorageClient, - ) -> KeyValueStoreCollectionClientAsync | KeyValueStoreCollectionClient: - return client.key_value_stores() - - @overload - @classmethod - async def get_value(cls: type[KeyValueStore], key: str) -> Any: - ... 
- - @overload - @classmethod - async def get_value(cls: type[KeyValueStore], key: str, default_value: T) -> T: - ... - - @overload - @classmethod - async def get_value(cls: type[KeyValueStore], key: str, default_value: T | None = None) -> T | None: - ... - - @classmethod - async def get_value(cls: type[KeyValueStore], key: str, default_value: T | None = None) -> T | None: - """Get a value from the key-value store. - - Args: - key (str): Key of the record to retrieve. - default_value (Any, optional): Default value returned in case the record does not exist. - - Returns: - Any: The value associated with the given key. `default_value` is used in case the record does not exist. - """ - store = await cls.open() - return await store.get_value(key, default_value) - - async def _get_value_internal(self: KeyValueStore, key: str, default_value: T | None = None) -> T | None: - record = await self._key_value_store_client.get_record(key) - return record['value'] if record else default_value - - async def iterate_keys( - self: KeyValueStore, - exclusive_start_key: str | None = None, - ) -> AsyncIterator[IterateKeysTuple]: - """Iterate over the keys in the key-value store. - - Args: - exclusive_start_key (str, optional): All keys up to this one (including) are skipped from the result. - - Yields: - IterateKeysTuple: A tuple `(key, info)`, - where `key` is the record key, and `info` is an object that contains a single property `size` - indicating size of the record in bytes. - """ - while True: - list_keys = await self._key_value_store_client.list_keys(exclusive_start_key=exclusive_start_key) - for item in list_keys['items']: - yield IterateKeysTuple(item['key'], {'size': item['size']}) - - if not list_keys['isTruncated']: - break - exclusive_start_key = list_keys['nextExclusiveStartKey'] - - @classmethod - async def set_value( - cls: type[KeyValueStore], - key: str, - value: Any, - content_type: str | None = None, - ) -> None: - """Set or delete a value in the key-value store. - - Args: - key (str): The key under which the value should be saved. - value (Any): The value to save. If the value is `None`, the corresponding key-value pair will be deleted. - content_type (str, optional): The content type of the saved value. - """ - store = await cls.open() - return await store.set_value(key, value, content_type) - - async def _set_value_internal( - self: KeyValueStore, - key: str, - value: Any, - content_type: str | None = None, - ) -> None: - if value is None: - return await self._key_value_store_client.delete_record(key) - - return await self._key_value_store_client.set_record(key, value, content_type) - - @classmethod - async def get_public_url(cls: type[KeyValueStore], key: str) -> str: - """Get a URL for the given key that may be used to publicly access the value in the remote key-value store. - - Args: - key (str): The key for which the URL should be generated. 
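For orientation, a short usage sketch of the key-value-store interface documented above (again the v1 API that this diff removes); the key name and stored value are placeholders:

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        store = await Actor.open_key_value_store()

        await store.set_value('OUTPUT', {'status': 'done'})        # JSON-serialized by default
        value = await store.get_value('OUTPUT', default_value={})  # default is returned if the key is missing
        Actor.log.info(f'Stored value: {value}')

        async for key, info in store.iterate_keys():
            Actor.log.info(f'Record {key} is {info["size"]} bytes')

        # get_public_url() only works for stores on the Apify Platform.
        # url = await store.get_public_url('OUTPUT')
```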
- """ - store = await cls.open() - return await store.get_public_url(key) - - async def _get_public_url_internal(self: KeyValueStore, key: str) -> str: - if not isinstance(self._key_value_store_client, KeyValueStoreClientAsync): - raise RuntimeError('Cannot generate a public URL for this key-value store as it is not on the Apify Platform!') # noqa: TRY004 - - public_api_url = self._config.api_public_base_url - - return f'{public_api_url}/v2/key-value-stores/{self._id}/records/{key}' - - async def drop(self: KeyValueStore) -> None: - """Remove the key-value store either from the Apify cloud storage or from the local directory.""" - await self._key_value_store_client.delete() - self._remove_from_cache() - - @classmethod - async def open( - cls: type[KeyValueStore], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> KeyValueStore: - """Open a key-value store. - - Key-value stores are used to store records or files, along with their MIME content type. - The records are stored and retrieved using a unique key. - The actual data is stored either on a local filesystem or in the Apify cloud. - - Args: - id (str, optional): ID of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. - If the key-value store with the given ID does not exist, it raises an error. - name (str, optional): Name of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the actor run. - If the key-value store with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a key-value store on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - KeyValueStore: An instance of the `KeyValueStore` class for the given ID or name. - """ - return await super().open(id=id, name=name, force_cloud=force_cloud, config=config) # type: ignore diff --git a/src/apify/storages/request_queue.py b/src/apify/storages/request_queue.py deleted file mode 100644 index 79d64b5e..00000000 --- a/src/apify/storages/request_queue.py +++ /dev/null @@ -1,602 +0,0 @@ -from __future__ import annotations - -import asyncio -from collections import OrderedDict -from datetime import datetime, timezone -from typing import TYPE_CHECKING -from typing import OrderedDict as OrderedDictType - -from apify_shared.utils import ignore_docs - -from apify._crypto import crypto_random_object_id -from apify._utils import LRUCache, budget_ow, compute_unique_key, unique_key_to_request_id -from apify.consts import REQUEST_QUEUE_HEAD_MAX_LIMIT -from apify.log import logger -from apify.storages.base_storage import BaseStorage - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - from apify_client.clients import RequestQueueClientAsync, RequestQueueCollectionClientAsync - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import RequestQueueClient, RequestQueueCollectionClient - from apify.config import Configuration - - -MAX_CACHED_REQUESTS = 1_000_000 - -# When requesting queue head we always fetch requestsInProgressCount * QUERY_HEAD_BUFFER number of requests. 
-QUERY_HEAD_MIN_LENGTH = 100 - -QUERY_HEAD_BUFFER = 3 - -# If queue was modified (request added/updated/deleted) before more than API_PROCESSED_REQUESTS_DELAY_MILLIS -# then we assume the get head operation to be consistent. -API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000 - -# How many times we try to get queue head with queueModifiedAt older than API_PROCESSED_REQUESTS_DELAY_MILLIS. -MAX_QUERIES_FOR_CONSISTENCY = 6 - -# This number must be large enough so that processing of all these requests cannot be done in -# a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory. -RECENTLY_HANDLED_CACHE_SIZE = 1000 - -# Indicates how long it usually takes for the underlying storage to propagate all writes -# to be available to subsequent reads. -STORAGE_CONSISTENCY_DELAY_MILLIS = 3000 - - -class RequestQueue(BaseStorage): - """Represents a queue of URLs to crawl. - - Can be used for deep crawling of websites where you start with several URLs and then recursively - follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders. - - Each URL is represented using an instance of the {@apilink Request} class. - The queue can only contain unique URLs. More precisely, it can only contain request dictionaries - with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden. - To add a single URL multiple times to the queue, - corresponding request dictionary will need to have different `uniqueKey` properties. - - Do not instantiate this class directly, use the `Actor.open_request_queue()` function instead. - - `RequestQueue` stores its data either on local disk or in the Apify cloud, - depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. - - If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in - the local directory in the following files: - ``` - {APIFY_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json - ``` - Note that `{QUEUE_ID}` is the name or ID of the request queue. The default request queue has ID: `default`, - unless you override it by setting the `APIFY_DEFAULT_REQUEST_QUEUE_ID` environment variable. - The `{REQUEST_ID}` is the id of the request. - - If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the - [Apify Request Queue](https://docs.apify.com/storage/request-queue) - cloud storage. - """ - - _request_queue_client: RequestQueueClientAsync | RequestQueueClient - _client_key = crypto_random_object_id() - _queue_head_dict: OrderedDictType[str, str] - _query_queue_head_task: asyncio.Task | None - _in_progress: set[str] - _last_activity: datetime - _internal_timeout_seconds = 5 * 60 - _recently_handled: LRUCache[bool] - _assumed_total_count = 0 - _assumed_handled_count = 0 - _requests_cache: LRUCache[dict] - - @ignore_docs - def __init__( - self: RequestQueue, - id: str, # noqa: A002 - name: str | None, - client: ApifyClientAsync | MemoryStorageClient, - config: Configuration, - ) -> None: - """Create a `RequestQueue` instance. - - Do not use the constructor directly, use the `Actor.open_request_queue()` function instead. - - Args: - id (str): ID of the request queue. - name (str, optional): Name of the request queue. - client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used. - config (Configuration): The configuration which should be used. 
- """ - super().__init__(id=id, name=name, client=client, config=config) - - self._request_queue_client = client.request_queue(self._id, client_key=self._client_key) - self._queue_head_dict = OrderedDict() - self._query_queue_head_task = None - self._in_progress = set() - self._last_activity = datetime.now(timezone.utc) - self._recently_handled = LRUCache[bool](max_length=RECENTLY_HANDLED_CACHE_SIZE) - self._requests_cache = LRUCache(max_length=MAX_CACHED_REQUESTS) - - @classmethod - def _get_human_friendly_label(cls: type[RequestQueue]) -> str: - return 'Request queue' - - @classmethod - def _get_default_id(cls: type[RequestQueue], config: Configuration) -> str: - return config.default_request_queue_id - - @classmethod - def _get_single_storage_client( - cls: type[RequestQueue], - id: str, # noqa: A002 - client: ApifyClientAsync | MemoryStorageClient, - ) -> RequestQueueClientAsync | RequestQueueClient: - return client.request_queue(id) - - @classmethod - def _get_storage_collection_client( - cls: type[RequestQueue], - client: ApifyClientAsync | MemoryStorageClient, - ) -> RequestQueueCollectionClientAsync | RequestQueueCollectionClient: - return client.request_queues() - - async def add_request( - self: RequestQueue, - request: dict, - *, - forefront: bool = False, - keep_url_fragment: bool = False, - use_extended_unique_key: bool = False, - ) -> dict: - """Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue. - - The deduplication of requests relies on the `uniqueKey` field within the request dictionary. If `uniqueKey` - exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`, - and `payload` fields. The generation of `uniqueKey` can be influenced by the `keep_url_fragment` and - `use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method - and payload, respectively, in its computation. - - The request can be added to the forefront (beginning) or the back of the queue based on the `forefront` - parameter. Information about the request's addition to the queue, including whether it was already present or - handled, is returned in an output dictionary. - - Args: - request: The request object to be added to the queue. Must include at least the `url` key. - Optionaly it can include the `method`, `payload` and `uniqueKey` keys. - - forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end. - - keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained - in the unique key computation. - - use_extended_unique_key: Determines whether to use an extended unique key, incorporating the request's - method and payload into the unique key computation. - - Returns: A dictionary containing information about the operation, including: - - `requestId` (str): The ID of the request. - - `uniqueKey` (str): The unique key associated with the request. - - `wasAlreadyPresent` (bool): Indicates whether the request was already in the queue. - - `wasAlreadyHandled` (bool): Indicates whether the request was already processed. 
- """ - budget_ow( - request, - { - 'url': (str, True), - }, - ) - self._last_activity = datetime.now(timezone.utc) - - if request.get('uniqueKey') is None: - request['uniqueKey'] = compute_unique_key( - url=request['url'], - method=request.get('method', 'GET'), - payload=request.get('payload'), - keep_url_fragment=keep_url_fragment, - use_extended_unique_key=use_extended_unique_key, - ) - - cache_key = unique_key_to_request_id(request['uniqueKey']) - cached_info = self._requests_cache.get(cache_key) - - if cached_info: - request['id'] = cached_info['id'] - return { - 'wasAlreadyPresent': True, - # We may assume that if request is in local cache then also the information if the - # request was already handled is there because just one client should be using one queue. - 'wasAlreadyHandled': cached_info['isHandled'], - 'requestId': cached_info['id'], - 'uniqueKey': cached_info['uniqueKey'], - } - - queue_operation_info = await self._request_queue_client.add_request(request, forefront=forefront) - queue_operation_info['uniqueKey'] = request['uniqueKey'] - - self._cache_request(cache_key, queue_operation_info) - - request_id, was_already_present = queue_operation_info['requestId'], queue_operation_info['wasAlreadyPresent'] - is_handled = request.get('handledAt') is not None - if not is_handled and not was_already_present and request_id not in self._in_progress and self._recently_handled.get(request_id) is None: - self._assumed_total_count += 1 - - self._maybe_add_request_to_queue_head(request_id, forefront) - - return queue_operation_info - - async def get_request(self: RequestQueue, request_id: str) -> dict | None: - """Retrieve a request from the queue. - - Args: - request_id (str): ID of the request to retrieve. - - Returns: - dict, optional: The retrieved request, or `None`, if it does not exist. - """ - budget_ow(request_id, (str, True), 'request_id') - return await self._request_queue_client.get_request(request_id) - - async def fetch_next_request(self: RequestQueue) -> dict | None: - """Return the next request in the queue to be processed. - - Once you successfully finish processing of the request, you need to call - `RequestQueue.mark_request_as_handled` to mark the request as handled in the queue. - If there was some error in processing the request, call `RequestQueue.reclaim_request` instead, - so that the queue will give the request to some other consumer in another call to the `fetch_next_request` method. - - Note that the `None` return value does not mean the queue processing finished, it means there are currently no pending requests. - To check whether all requests in queue were finished, use `RequestQueue.is_finished` instead. - - Returns: - dict, optional: The request or `None` if there are no more pending requests. - """ - await self._ensure_head_is_non_empty() - - # We are likely done at this point. - if len(self._queue_head_dict) == 0: - return None - - next_request_id, _ = self._queue_head_dict.popitem(last=False) # ~removeFirst() - - # This should never happen, but... 
- if next_request_id in self._in_progress or self._recently_handled.get(next_request_id): - logger.warning( - 'Queue head returned a request that is already in progress?!', - extra={ - 'nextRequestId': next_request_id, - 'inProgress': next_request_id in self._in_progress, - 'recentlyHandled': next_request_id in self._recently_handled, - }, - ) - return None - self._in_progress.add(next_request_id) - self._last_activity = datetime.now(timezone.utc) - - try: - request = await self.get_request(next_request_id) - except Exception: - # On error, remove the request from in progress, otherwise it would be there forever - self._in_progress.remove(next_request_id) - raise - - # NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations: - - """ 1) Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null). - In this case, keep the request marked as in progress for a short while, - so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request - into the queueHeadDict straight again. After the interval expires, fetchNextRequest() - will try to fetch this request again, until it eventually appears in the main table. - """ - if request is None: - logger.debug('Cannot find a request from the beginning of queue, will be retried later', extra={'nextRequestId': next_request_id}) - asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, lambda: self._in_progress.remove(next_request_id)) - return None - - """ 2) Queue head index is behind the main table and the underlying request was already handled - (by some other client, since we keep the track of handled requests in recentlyHandled dictionary). - We just add the request to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty() - will not put the request again to queueHeadDict. - """ - if request.get('handledAt') is not None: - logger.debug('Request fetched from the beginning of queue was already handled', extra={'nextRequestId': next_request_id}) - self._recently_handled[next_request_id] = True - return None - - return request - - async def mark_request_as_handled(self: RequestQueue, request: dict) -> dict | None: - """Mark a request as handled after successful processing. - - Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method. - - Args: - request (dict): The request to mark as handled. - - Returns: - dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`. - `None` if the given request was not in progress. 
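The fetch/mark/reclaim workflow described in the docstrings above can be sketched as a minimal consumption loop (the processing step is omitted):

```python
from apify import Actor


async def main() -> None:
    async with Actor:
        queue = await Actor.open_request_queue()

        while request := await queue.fetch_next_request():
            try:
                Actor.log.info(f'Processing {request["url"]}')
                # ... actual processing of the request would happen here ...
                await queue.mark_request_as_handled(request)
            except Exception:
                # Hand the request back so it can be retried later or by another consumer.
                await queue.reclaim_request(request)
```

Note that, as the docstring above points out, a `None` from `fetch_next_request` only means no request is available right now; a robust loop would also consult `is_finished()` before terminating.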
- """ - budget_ow( - request, - { - 'id': (str, True), - 'uniqueKey': (str, True), - 'handledAt': (datetime, False), - }, - ) - self._last_activity = datetime.now(timezone.utc) - if request['id'] not in self._in_progress: - logger.debug('Cannot mark request as handled, because it is not in progress!', extra={'requestId': request['id']}) - return None - - request['handledAt'] = request.get('handledAt', datetime.now(timezone.utc)) - queue_operation_info = await self._request_queue_client.update_request({**request}) - queue_operation_info['uniqueKey'] = request['uniqueKey'] - - self._in_progress.remove(request['id']) - self._recently_handled[request['id']] = True - - if not queue_operation_info['wasAlreadyHandled']: - self._assumed_handled_count += 1 - - self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info) - - return queue_operation_info - - async def reclaim_request( - self: RequestQueue, - request: dict, - forefront: bool = False, # noqa: FBT001, FBT002 - ) -> dict | None: - """Reclaim a failed request back to the queue. - - The request will be returned for processing later again - by another call to `RequestQueue.fetchNextRequest`. - - Args: - request (dict): The request to return to the queue. - forefront (bool, optional): Whether to add the request to the head or the end of the queue - Returns: - dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`. - `None` if the given request was not in progress. - """ - budget_ow( - request, - { - 'id': (str, True), - 'uniqueKey': (str, True), - }, - ) - self._last_activity = datetime.now(timezone.utc) - - if request['id'] not in self._in_progress: - logger.debug('Cannot reclaim request, because it is not in progress!', extra={'requestId': request['id']}) - return None - - # TODO: If request hasn't been changed since the last getRequest(), we don't need to call updateRequest() - # and thus improve performance. - # https://github.com/apify/apify-sdk-python/issues/143 - queue_operation_info = await self._request_queue_client.update_request(request, forefront=forefront) - queue_operation_info['uniqueKey'] = request['uniqueKey'] - self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info) - - # Wait a little to increase a chance that the next call to fetchNextRequest() will return the request with updated data. - # This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads. - def callback() -> None: - if request['id'] not in self._in_progress: - logger.debug('The request is no longer marked as in progress in the queue?!', {'requestId': request['id']}) - return - - self._in_progress.remove(request['id']) - - # Performance optimization: add request straight to head if possible - self._maybe_add_request_to_queue_head(request['id'], forefront) - - asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, callback) - - return queue_operation_info - - def _in_progress_count(self: RequestQueue) -> int: - return len(self._in_progress) - - async def is_empty(self: RequestQueue) -> bool: - """Check whether the queue is empty. - - Returns: - bool: `True` if the next call to `RequestQueue.fetchNextRequest` would return `None`, otherwise `False`. - """ - await self._ensure_head_is_non_empty() - return len(self._queue_head_dict) == 0 - - async def is_finished(self: RequestQueue) -> bool: - """Check whether the queue is finished. 
- - Due to the nature of distributed storage used by the queue, - the function might occasionally return a false negative, - but it will never return a false positive. - - Returns: - bool: `True` if all requests were already handled and there are no more left. `False` otherwise. - """ - seconds_since_last_activity = (datetime.now(timezone.utc) - self._last_activity).seconds - if self._in_progress_count() > 0 and seconds_since_last_activity > self._internal_timeout_seconds: - message = f'The request queue seems to be stuck for {self._internal_timeout_seconds}s, resetting internal state.' - logger.warning(message) - self._reset() - - if len(self._queue_head_dict) > 0 or self._in_progress_count() > 0: - return False - - is_head_consistent = await self._ensure_head_is_non_empty(ensure_consistency=True) - return is_head_consistent and len(self._queue_head_dict) == 0 and self._in_progress_count() == 0 - - def _reset(self: RequestQueue) -> None: - self._queue_head_dict.clear() - self._query_queue_head_task = None - self._in_progress.clear() - self._recently_handled.clear() - self._assumed_total_count = 0 - self._assumed_handled_count = 0 - self._requests_cache.clear() - self._last_activity = datetime.now(timezone.utc) - - def _cache_request(self: RequestQueue, cache_key: str, queue_operation_info: dict) -> None: - self._requests_cache[cache_key] = { - 'id': queue_operation_info['requestId'], - 'isHandled': queue_operation_info['wasAlreadyHandled'], - 'uniqueKey': queue_operation_info['uniqueKey'], - 'wasAlreadyHandled': queue_operation_info['wasAlreadyHandled'], - } - - async def _queue_query_head(self: RequestQueue, limit: int) -> dict: - query_started_at = datetime.now(timezone.utc) - - list_head = await self._request_queue_client.list_head(limit=limit) - for request in list_head['items']: - # Queue head index might be behind the main table, so ensure we don't recycle requests - if not request['id'] or not request['uniqueKey'] or request['id'] in self._in_progress or self._recently_handled.get(request['id']): - continue - self._queue_head_dict[request['id']] = request['id'] - self._cache_request( - unique_key_to_request_id(request['uniqueKey']), - { - 'requestId': request['id'], - 'wasAlreadyHandled': False, - 'wasAlreadyPresent': True, - 'uniqueKey': request['uniqueKey'], - }, - ) - - # This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again. - self._query_queue_head_task = None - - return { - 'wasLimitReached': len(list_head['items']) >= limit, - 'prevLimit': limit, - 'queueModifiedAt': list_head['queueModifiedAt'], - 'queryStartedAt': query_started_at, - 'hadMultipleClients': list_head['hadMultipleClients'], - } - - async def _ensure_head_is_non_empty( - self: RequestQueue, - ensure_consistency: bool = False, # noqa: FBT001, FBT002 - limit: int | None = None, - iteration: int = 0, - ) -> bool: - # If is nonempty resolve immediately. - if len(self._queue_head_dict) > 0: - return True - - if limit is None: - limit = max(self._in_progress_count() * QUERY_HEAD_BUFFER, QUERY_HEAD_MIN_LENGTH) - - if self._query_queue_head_task is None: - self._query_queue_head_task = asyncio.Task(self._queue_query_head(limit)) - - queue_head = await self._query_queue_head_task - - # TODO: I feel this code below can be greatly simplified... 
(comes from TS implementation *wink*) - # https://github.com/apify/apify-sdk-python/issues/142 - - # If queue is still empty then one of the following holds: - # - the other calls waiting for this task already consumed all the returned requests - # - the limit was too low and contained only requests in progress - # - the writes from other clients were not propagated yet - # - the whole queue was processed and we are done - - # If limit was not reached in the call then there are no more requests to be returned. - if queue_head['prevLimit'] >= REQUEST_QUEUE_HEAD_MAX_LIMIT: - logger.warning('Reached the maximum number of requests in progress', extra={'limit': REQUEST_QUEUE_HEAD_MAX_LIMIT}) - - should_repeat_with_higher_limit = ( - len(self._queue_head_dict) == 0 and queue_head['wasLimitReached'] and queue_head['prevLimit'] < REQUEST_QUEUE_HEAD_MAX_LIMIT - ) - - # If ensureConsistency=true then we must ensure that either: - # - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS - # - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount - is_database_consistent = (queue_head['queryStartedAt'] - queue_head['queueModifiedAt'].replace(tzinfo=timezone.utc)).seconds >= ( - API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000 - ) - is_locally_consistent = not queue_head['hadMultipleClients'] and self._assumed_total_count <= self._assumed_handled_count - # Consistent information from one source is enough to consider request queue finished. - should_repeat_for_consistency = ensure_consistency and not is_database_consistent and not is_locally_consistent - - # If both are false then head is consistent and we may exit. - if not should_repeat_with_higher_limit and not should_repeat_for_consistency: - return True - - # If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY. - # If this is reached then we return false so that empty() and finished() returns possibly false negative. - if not should_repeat_with_higher_limit and iteration > MAX_QUERIES_FOR_CONSISTENCY: - return False - - next_limit = round(queue_head['prevLimit'] * 1.5) if should_repeat_with_higher_limit else queue_head['prevLimit'] - - # If we are repeating for consistency then wait required time. - if should_repeat_for_consistency: - delay_seconds = (API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000) - (datetime.now(timezone.utc) - queue_head['queueModifiedAt']).seconds - logger.info(f'Waiting for {delay_seconds}s before considering the queue as finished to ensure that the data is consistent.') - await asyncio.sleep(delay_seconds) - - return await self._ensure_head_is_non_empty(ensure_consistency, next_limit, iteration + 1) - - def _maybe_add_request_to_queue_head( - self: RequestQueue, - request_id: str, - forefront: bool, # noqa: FBT001 - ) -> None: - if forefront: - self._queue_head_dict[request_id] = request_id - # Move to start, i.e. forefront of the queue - self._queue_head_dict.move_to_end(request_id, last=False) - elif self._assumed_total_count < QUERY_HEAD_MIN_LENGTH: - # OrderedDict puts the item to the end of the queue by default - self._queue_head_dict[request_id] = request_id - - async def drop(self: RequestQueue) -> None: - """Remove the request queue either from the Apify cloud storage or from the local directory.""" - await self._request_queue_client.delete() - self._remove_from_cache() - - async def get_info(self: RequestQueue) -> dict | None: - """Get an object containing general information about the request queue. 
- - Returns: - dict: Object returned by calling the GET request queue API endpoint. - """ - return await self._request_queue_client.get() - - @classmethod - async def open( - cls: type[RequestQueue], - *, - id: str | None = None, # noqa: A002 - name: str | None = None, - force_cloud: bool = False, - config: Configuration | None = None, - ) -> RequestQueue: - """Open a request queue. - - Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in the Apify cloud. - The queue is used for deep crawling of websites, where you start with several URLs and then - recursively follow links to other pages. The data structure supports both breadth-first - and depth-first crawling orders. - - Args: - id (str, optional): ID of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. - If the request queue with the given ID does not exist, it raises an error. - name (str, optional): Name of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run. - If the request queue with the given name does not exist, it is created. - force_cloud (bool, optional): If set to True, it will open a request queue on the Apify Platform even when running the actor locally. - Defaults to False. - config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. - - Returns: - RequestQueue: An instance of the `RequestQueue` class for the given ID or name. - """ - queue = await super().open(id=id, name=name, force_cloud=force_cloud, config=config) - await queue._ensure_head_is_non_empty() # type: ignore - return queue # type: ignore diff --git a/src/apify/storages/storage_client_manager.py b/src/apify/storages/storage_client_manager.py deleted file mode 100644 index 52207248..00000000 --- a/src/apify/storages/storage_client_manager.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from apify_shared.utils import ignore_docs - -from apify._memory_storage import MemoryStorageClient -from apify.config import Configuration - -if TYPE_CHECKING: - from apify_client import ApifyClientAsync - - -@ignore_docs -class StorageClientManager: - """A class for managing storage clients.""" - - _config: Configuration - - _local_client: MemoryStorageClient | None = None - _cloud_client: ApifyClientAsync | None = None - - _default_instance: StorageClientManager | None = None - - def __init__(self: StorageClientManager) -> None: - """Create a `StorageClientManager` instance.""" - self._config = Configuration.get_global_configuration() - - @classmethod - def set_config(cls: type[StorageClientManager], config: Configuration) -> None: - """Set the config for the StorageClientManager. - - Args: - config (Configuration): The configuration this StorageClientManager should use. - """ - cls._get_default_instance()._config = config - - @classmethod - def get_storage_client( - cls: type[StorageClientManager], - force_cloud: bool = False, # noqa: FBT001, FBT002 - ) -> ApifyClientAsync | MemoryStorageClient: - """Get the current storage client instance. - - Returns: - ApifyClientAsync or MemoryStorageClient: The current storage client instance. 
- """ - default_instance = cls._get_default_instance() - if not default_instance._local_client: - default_instance._local_client = MemoryStorageClient(persist_storage=default_instance._config.persist_storage, write_metadata=True) - - if default_instance._config.is_at_home or force_cloud: - assert default_instance._cloud_client is not None # noqa: S101 - return default_instance._cloud_client - - return default_instance._local_client - - @classmethod - def set_cloud_client(cls: type[StorageClientManager], client: ApifyClientAsync) -> None: - """Set the storage client. - - Args: - client (ApifyClientAsync or MemoryStorageClient): The instance of a storage client. - """ - cls._get_default_instance()._cloud_client = client - - @classmethod - def _get_default_instance(cls: type[StorageClientManager]) -> StorageClientManager: - if cls._default_instance is None: - cls._default_instance = cls() - - return cls._default_instance diff --git a/tests/integration/README.md b/tests/integration/README.md index fb86f84b..331acad1 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -1,7 +1,7 @@ Integration tests ================= -We have integration tests which build and run actors using the Python SDK on the Apify Platform. +We have integration tests which build and run Actors using the Python SDK on the Apify Platform. To run these tests, you need to set the `APIFY_TEST_USER_API_TOKEN` environment variable to the API token of the Apify user you want to use for the tests, and then start them with `make integration-tests`. @@ -25,20 +25,20 @@ async def test_something(apify_client_async: ApifyClientAsync) -> None: ### `make_actor` -This fixture returns a factory function for creating actors on the Apify Platform. +This fixture returns a factory function for creating Actors on the Apify Platform. -For the actor source, the fixture takes the files from `tests/integration/actor_source_base`, +For the Actor source, the fixture takes the files from `tests/integration/actor_source_base`, builds the Apify SDK wheel from the current codebase, -and adds the actor source you passed to the fixture as an argument. +and adds the Actor source you passed to the fixture as an argument. You have to pass exactly one of the `main_func`, `main_py` and `source_files` arguments. -The created actor will be uploaded to the platform, built there, and after the test finishes, it will be automatically deleted. -If the actor build fails, it will not be deleted, so that you can check why the build failed. +The created Actor will be uploaded to the platform, built there, and after the test finishes, it will be automatically deleted. +If the Actor build fails, it will not be deleted, so that you can check why the build failed. -### Creating test actor straight from a Python function +### Creating test Actor straight from a Python function -You can create actors straight from a Python function. -This is great because you can have the test actor source code checked with the linter. +You can create Actors straight from a Python function. +This is great because you can have the test Actor source code checked with the linter. 
```python async def test_something(self, make_actor: ActorFactory) -> None: @@ -54,7 +54,7 @@ async def test_something(self, make_actor: ActorFactory) -> None: assert run_result['status'] == 'SUCCEEDED' ``` -These actors will have the `src/main.py` file set to the `main` function definition, +These Actors will have the `src/main.py` file set to the `main` function definition, prepended with `import asyncio` and `from apify import Actor`, for your convenience. You can also pass extra imports directly to the main function: @@ -65,7 +65,7 @@ async def test_something(self, make_actor: ActorFactory) -> None: import os from apify_shared.consts import ActorEventTypes, ActorEnvVars async with Actor: - print('The actor is running with ' + os.getenv(ActorEnvVars.MEMORY_MBYTES) + 'MB of memory') + print('The Actor is running with ' + os.getenv(ActorEnvVars.MEMORY_MBYTES) + 'MB of memory') await Actor.on(ActorEventTypes.SYSTEM_INFO, lambda event_data: print(event_data)) actor = await make_actor('something', main_func=main) @@ -76,10 +76,10 @@ async def test_something(self, make_actor: ActorFactory) -> None: assert run_result['status'] == 'SUCCEEDED' ``` -### Creating actor from source files +### Creating Actor from source files You can also pass the source files directly if you need something more complex -(e.g. pass some fixed value to the actor source code or use multiple source files). +(e.g. pass some fixed value to the Actor source code or use multiple source files). To pass the source code of the `src/main.py` file directly, use the `main_py` argument to `make_actor`: diff --git a/tests/integration/_utils.py b/tests/integration/_utils.py index b69d6d58..cbea845d 100644 --- a/tests/integration/_utils.py +++ b/tests/integration/_utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from apify._crypto import crypto_random_object_id +from crawlee._utils.crypto import crypto_random_object_id def generate_unique_resource_name(label: str) -> str: diff --git a/tests/integration/actor_source_base/src/__main__.py b/tests/integration/actor_source_base/src/__main__.py index 0d1d65af..f6228448 100644 --- a/tests/integration/actor_source_base/src/__main__.py +++ b/tests/integration/actor_source_base/src/__main__.py @@ -4,7 +4,7 @@ import logging from .main import main -from apify.log import ActorLogFormatter +from apify._log import ActorLogFormatter handler = logging.StreamHandler() handler.setFormatter(ActorLogFormatter()) diff --git a/tests/integration/actor_source_base/src/main.py b/tests/integration/actor_source_base/src/main.py index 78c03a48..678334bd 100644 --- a/tests/integration/actor_source_base/src/main.py +++ b/tests/integration/actor_source_base/src/main.py @@ -4,5 +4,5 @@ async def main() -> None: - async with Actor(): + async with Actor: raise RuntimeError('You need to override the `main.py` file in the integration test!') diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c68d441a..040bc71a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -7,19 +7,20 @@ import sys import textwrap from pathlib import Path -from typing import TYPE_CHECKING, AsyncIterator, Awaitable, Callable, Mapping, Protocol +from typing import TYPE_CHECKING, Callable, Protocol, cast import pytest +from filelock import FileLock + from apify_client import ApifyClientAsync from apify_shared.consts import ActorJobStatus, ActorSourceType -from filelock import FileLock +import apify._actor from ._utils import generate_unique_resource_name -from apify import Actor 
-from apify.config import Configuration -from apify.storages import Dataset, KeyValueStore, RequestQueue, StorageClientManager if TYPE_CHECKING: + from collections.abc import AsyncIterator, Awaitable, Mapping + from apify_client.clients.resource_clients import ActorClientAsync TOKEN_ENV_VAR = 'APIFY_TEST_USER_API_TOKEN' @@ -30,16 +31,13 @@ # To isolate the tests, we need to reset the used singletons before each test case # We also patch the default storage client with a tmp_path @pytest.fixture(autouse=True) -def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(Actor, '_default_instance', None) - monkeypatch.setattr(Configuration, '_default_instance', None) - monkeypatch.setattr(Dataset, '_cache_by_id', None) - monkeypatch.setattr(Dataset, '_cache_by_name', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_id', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_name', None) - monkeypatch.setattr(RequestQueue, '_cache_by_id', None) - monkeypatch.setattr(RequestQueue, '_cache_by_name', None) - monkeypatch.setattr(StorageClientManager, '_default_instance', None) +def _reset_and_patch_default_instances() -> None: + from crawlee import service_container + + cast(dict, service_container._services).clear() + delattr(apify._actor.Actor, '__wrapped__') + + # TODO: StorageClientManager local storage client purge # noqa: TD003 # This fixture can't be session-scoped, @@ -91,7 +89,7 @@ def sdk_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) - @pytest.fixture(scope='session') def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]: - """Create a dictionary of the base source files for a testing actor. + """Create a dictionary of the base source files for a testing Actor. It takes the files from `tests/integration/actor_source_base`, builds the Apify SDK wheel from the current codebase, @@ -133,13 +131,12 @@ def __call__( main_func: Callable | None = None, main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, - ) -> Awaitable[ActorClientAsync]: - ... + ) -> Awaitable[ActorClientAsync]: ... @pytest.fixture() async def make_actor(actor_base_source_files: dict[str, str | bytes], apify_client_async: ApifyClientAsync) -> AsyncIterator[ActorFactory]: - """A fixture for returning a temporary actor factory.""" + """A fixture for returning a temporary Actor factory.""" actor_clients_for_cleanup: list[ActorClientAsync] = [] async def _make_actor( @@ -149,20 +146,19 @@ async def _make_actor( main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, ) -> ActorClientAsync: - """Create a temporary actor from the given main function or source file(s). + """Create a temporary Actor from the given main function or source file(s). - The actor will be uploaded to the Apify Platform, built there, and after the test finishes, it will be automatically deleted. + The Actor will be uploaded to the Apify Platform, built there, and after the test finishes, it will be automatically deleted. You have to pass exactly one of the `main_func`, `main_py` and `source_files` arguments. Args: - actor_label (str): The label which will be a part of the generated actor name - main_func (Callable, optional): The main function of the actor. - main_py (str, optional): The `src/main.py` file of the actor. - source_files (dict, optional): A dictionary of the source files of the actor. 
+ actor_label: The label which will be a part of the generated Actor name + main_func: The main function of the Actor. + main_py: The `src/main.py` file of the Actor. + source_files: A dictionary of the source files of the Actor. - Returns: - ActorClientAsync: A resource client for the created actor. + Returns: A resource client for the created Actor. """ if not (main_func or main_py or source_files): raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified') @@ -176,7 +172,17 @@ async def _make_actor( if main_func: func_source = textwrap.dedent(inspect.getsource(main_func)) func_source = func_source.replace(f'def {main_func.__name__}(', 'def main(') - main_py = f'import asyncio\n\nfrom apify import Actor\n\n\n{func_source}' + main_py = '\n'.join( # noqa: FLY002 + [ + 'import asyncio', + '', + 'from apify import Actor', + '', + '', + '', + func_source, + ] + ) if main_py: source_files = {'src/main.py': main_py} @@ -206,7 +212,7 @@ async def _make_actor( } ) - print(f'Creating actor {actor_name}...') + print(f'Creating Actor {actor_name}...') created_actor = await apify_client_async.actors().create( name=actor_name, default_run_build='latest', @@ -224,7 +230,7 @@ async def _make_actor( actor_client = apify_client_async.actor(created_actor['id']) - print(f'Building actor {actor_name}...') + print(f'Building Actor {actor_name}...') build = await actor_client.build(version_number='0.0', wait_for_finish=300) assert build['status'] == ActorJobStatus.SUCCEEDED diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index a31b9e2e..589528fa 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -4,9 +4,10 @@ import json from typing import TYPE_CHECKING +from crawlee._utils.crypto import crypto_random_object_id + from ._utils import generate_unique_resource_name from apify import Actor -from apify._crypto import crypto_random_object_id if TYPE_CHECKING: from apify_client import ApifyClientAsync @@ -307,7 +308,7 @@ async def main_outer() -> None: # This should not be called await Actor.set_value('RECORD_AFTER_METAMORPH_CALL', 'dummy') - raise AssertionError('The actor should have been metamorphed by now') + raise AssertionError('The Actor should have been metamorphed by now') inner_actor = await make_actor('metamorph-inner', main_func=main_inner) outer_actor = await make_actor('metamorph-outer', main_func=main_outer) @@ -328,7 +329,7 @@ async def main_outer() -> None: assert await outer_run_key_value_store.get_record('RECORD_AFTER_METAMORPH_CALL') is None - # After metamorph, the run still belongs to the original actor, so the inner one should have no runs + # After metamorph, the run still belongs to the original Actor, so the inner one should have no runs assert await inner_actor.last_run().get() is None diff --git a/tests/integration/test_actor_dataset.py b/tests/integration/test_actor_dataset.py index 81a4f938..e61446db 100644 --- a/tests/integration/test_actor_dataset.py +++ b/tests/integration/test_actor_dataset.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: import pytest + from apify_client import ApifyClientAsync from .conftest import ActorFactory diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index b4436b7e..f1a89ace 100644 --- a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -19,11 +19,12 @@ async def main() -> None: from typing import Any, Callable from apify_shared.consts 
import ActorEventTypes, ApifyEnvVars + from crawlee.events._types import Event, EventSystemInfoData os.environ[ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS] = '900' was_system_info_emitted = False - system_infos = [] + system_infos = list[EventSystemInfoData]() def on_event(event_type: ActorEventTypes) -> Callable: async def log_event(data: Any) -> None: @@ -38,8 +39,8 @@ async def log_event(data: Any) -> None: return log_event async with Actor: - Actor.on(ActorEventTypes.SYSTEM_INFO, on_event(ActorEventTypes.SYSTEM_INFO)) - Actor.on(ActorEventTypes.PERSIST_STATE, on_event(ActorEventTypes.PERSIST_STATE)) + Actor.on(Event.SYSTEM_INFO, on_event(ActorEventTypes.SYSTEM_INFO)) + Actor.on(Event.PERSIST_STATE, on_event(ActorEventTypes.PERSIST_STATE)) await asyncio.sleep(3) # The SYSTEM_INFO event sometimes takes a while to appear, let's wait for it for a while longer @@ -50,7 +51,7 @@ async def log_event(data: Any) -> None: # Check that parsing datetimes works correctly # Check `createdAt` is a datetime (so it's the same locally and on platform) - assert isinstance(system_infos[0]['createdAt'], datetime) + assert isinstance(system_infos[0].cpu_info.created_at, datetime) actor = await make_actor('actor-interval-events', main_func=main) @@ -68,7 +69,8 @@ async def test_off_event(self: TestActorEvents, make_actor: ActorFactory) -> Non async def main() -> None: import os - from apify_shared.consts import ActorEventTypes, ApifyEnvVars + from apify_shared.consts import ApifyEnvVars + from crawlee.events._types import Event os.environ[ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS] = '100' @@ -80,11 +82,11 @@ def count_event(data): # type: ignore # noqa: ANN202, ANN001 counter += 1 async with Actor: - Actor.on(ActorEventTypes.PERSIST_STATE, count_event) + Actor.on(Event.PERSIST_STATE, count_event) await asyncio.sleep(0.5) assert counter > 1 last_count = counter - Actor.off(ActorEventTypes.PERSIST_STATE, count_event) + Actor.off(Event.PERSIST_STATE, count_event) await asyncio.sleep(0.5) assert counter == last_count diff --git a/tests/integration/test_actor_key_value_store.py b/tests/integration/test_actor_key_value_store.py index 3bd6df62..6a27ef40 100644 --- a/tests/integration/test_actor_key_value_store.py +++ b/tests/integration/test_actor_key_value_store.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: import pytest + from apify_client import ApifyClientAsync from .conftest import ActorFactory @@ -186,12 +187,16 @@ async def main(): class TestGetPublicUrl: async def test_get_public_url(self: TestGetPublicUrl, make_actor: ActorFactory) -> None: async def main() -> None: + from typing import cast + + from apify.apify_storage_client._key_value_store_client import KeyValueStoreClient + async with Actor: public_api_url = Actor.config.api_public_base_url default_store_id = Actor.config.default_key_value_store_id store = await Actor.open_key_value_store() - record_url = await store.get_public_url('dummy') + record_url = await cast(KeyValueStoreClient, store._resource_client).get_public_url('dummy') print(record_url) assert record_url == f'{public_api_url}/v2/key-value-stores/{default_store_id}/records/dummy' diff --git a/tests/integration/test_actor_lifecycle.py b/tests/integration/test_actor_lifecycle.py index ae517f90..84d1b22f 100644 --- a/tests/integration/test_actor_lifecycle.py +++ b/tests/integration/test_actor_lifecycle.py @@ -11,7 +11,7 @@ class TestActorInit: async def test_actor_init(self: TestActorInit, make_actor: ActorFactory) -> None: async def main() -> None: - my_actor = Actor() + my_actor = Actor await 
my_actor.init() assert my_actor._is_initialized is True double_init = False @@ -19,14 +19,14 @@ async def main() -> None: await my_actor.init() double_init = True except RuntimeError as err: - assert str(err) == 'The actor was already initialized!' # noqa: PT017 + assert str(err) == 'The Actor was already initialized!' # noqa: PT017 except Exception: raise try: await Actor.init() double_init = True except RuntimeError as err: - assert str(err) == 'The actor was already initialized!' # noqa: PT017 + assert str(err) == 'The Actor was already initialized!' # noqa: PT017 except Exception: raise await my_actor.exit() @@ -42,9 +42,11 @@ async def main() -> None: async def test_async_with_actor_properly_initialize(self: TestActorInit, make_actor: ActorFactory) -> None: async def main() -> None: + import apify._actor + async with Actor: - assert Actor._get_default_instance()._is_initialized - assert Actor._get_default_instance()._is_initialized is False + assert apify._actor.Actor._is_initialized + assert apify._actor.Actor._is_initialized is False actor = await make_actor('with-actor-init', main_func=main) @@ -106,47 +108,3 @@ async def main() -> None: assert run_result is not None assert run_result['exitCode'] == 91 assert run_result['status'] == 'FAILED' - - -class TestActorMain: - async def test_actor_main(self: TestActorMain, make_actor: ActorFactory) -> None: - async def main() -> None: - async def actor_function() -> None: - input = await Actor.get_input() # noqa: A001 - if input.get('raise_exception'): - raise Exception(input.get('raise_exception')) # noqa: TRY002 - if input.get('exit_code'): - await Actor.exit(exit_code=input.get('exit_code')) - elif input.get('fail'): - await Actor.fail() - elif input.get('set_output'): - await Actor.set_value('OUTPUT', input.get('set_output')) - print('Main function called') - - await Actor.main(actor_function) - - actor = await make_actor('actor-main', main_func=main) - - exception_run = await actor.call(run_input={'raise_exception': 'This is a test exception'}) - assert exception_run is not None - assert exception_run['status'] == 'FAILED' - assert exception_run['exitCode'] == 91 - - exit_code = 10 - exited_run = await actor.call(run_input={'exit_code': exit_code}) - assert exited_run is not None - assert exited_run['status'] == 'FAILED' - assert exited_run['exitCode'] == exit_code - - failed_run = await actor.call(run_input={'fail': True}) - assert failed_run is not None - assert failed_run['status'] == 'FAILED' - assert failed_run['exitCode'] == 1 - - test_output = {'test': 'output'} - run_with_output = await actor.call(run_input={'set_output': test_output}) - assert run_with_output is not None - assert run_with_output['status'] == 'SUCCEEDED' - output = await actor.last_run().key_value_store().get_record('OUTPUT') - assert output is not None - assert output['value'] == test_output diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index f07598e6..dcfe8d8d 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -13,7 +13,7 @@ async def test_actor_log(self: TestActorLog, make_actor: ActorFactory) -> None: async def main() -> None: import logging - from apify.log import ActorLogFormatter, logger + from apify._log import ActorLogFormatter, logger # Clear any other log handlers, so they don't mess with this test client_logger = logging.getLogger('apify_client') @@ -72,24 +72,24 @@ async def main() -> None: assert run_log_lines.pop(0).startswith('ACTOR: Pulling Docker image') 
assert run_log_lines.pop(0) == 'ACTOR: Creating Docker container.' assert run_log_lines.pop(0) == 'ACTOR: Starting Docker container.' - assert run_log_lines.pop(0) == 'INFO Initializing actor...' - assert run_log_lines.pop(0).startswith(f'INFO System info ({{"apify_sdk_version": "{__version__}", "apify_client_version": "') - assert run_log_lines.pop(0) == 'DEBUG Debug message' - assert run_log_lines.pop(0) == 'INFO Info message' - assert run_log_lines.pop(0) == 'WARN Warning message' - assert run_log_lines.pop(0) == 'ERROR Error message' - assert run_log_lines.pop(0) == 'ERROR Exception message' + assert run_log_lines.pop(0) == '[apify] INFO Initializing Actor...' + assert run_log_lines.pop(0).startswith(f'[apify] INFO System info ({{"apify_sdk_version": "{__version__}", "apify_client_version": "') + assert run_log_lines.pop(0) == '[apify] DEBUG Debug message' + assert run_log_lines.pop(0) == '[apify] INFO Info message' + assert run_log_lines.pop(0) == '[apify] WARN Warning message' + assert run_log_lines.pop(0) == '[apify] ERROR Error message' + assert run_log_lines.pop(0) == '[apify] ERROR Exception message' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' - assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 34, in main' + assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 35, in main' assert run_log_lines.pop(0) == " raise ValueError('Dummy ValueError')" assert run_log_lines.pop(0) == ' ValueError: Dummy ValueError' - assert run_log_lines.pop(0) == 'INFO Multi' - assert run_log_lines.pop(0) == ' line' - assert run_log_lines.pop(0) == ' log' - assert run_log_lines.pop(0) == ' message' - assert run_log_lines.pop(0) == 'ERROR Actor failed with an exception' + assert run_log_lines.pop(0) == '[apify] INFO Multi' + assert run_log_lines.pop(0) == 'line' + assert run_log_lines.pop(0) == 'log' + assert run_log_lines.pop(0) == 'message' + assert run_log_lines.pop(0) == '[apify] ERROR Actor failed with an exception' assert run_log_lines.pop(0) == ' Traceback (most recent call last):' - assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 42, in main' + assert run_log_lines.pop(0) == ' File "/usr/src/app/src/main.py", line 43, in main' assert run_log_lines.pop(0) == " raise RuntimeError('Dummy RuntimeError')" assert run_log_lines.pop(0) == ' RuntimeError: Dummy RuntimeError' - assert run_log_lines.pop(0) == 'INFO Exiting actor ({"exit_code": 91})' + assert run_log_lines.pop(0) == '[apify] INFO Exiting Actor ({"exit_code": 91})' diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 076115e3..adc5784a 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -3,12 +3,14 @@ from typing import TYPE_CHECKING from apify_shared.consts import ApifyEnvVars +from crawlee import Request from ._utils import generate_unique_resource_name from apify import Actor if TYPE_CHECKING: import pytest + from apify_client import ApifyClientAsync from .conftest import ActorFactory @@ -66,7 +68,7 @@ async def test_force_cloud( request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) request_queue_id = request_queue._id - request_info = await request_queue.add_request({'url': 'http://example.com'}) + request_info = await request_queue.add_request(Request.from_url('http://example.com')) request_queue_client = apify_client_async.request_queue(request_queue_id) @@ -75,7 +77,7 @@ async def 
test_force_cloud( assert request_queue_details is not None assert request_queue_details.get('name') == request_queue_name - request_queue_request = await request_queue_client.get_request(request_info['requestId']) + request_queue_request = await request_queue_client.get_request(request_info.id) assert request_queue_request is not None assert request_queue_request['url'] == 'http://example.com' finally: diff --git a/tests/integration/test_fixtures.py b/tests/integration/test_fixtures.py index c5c67a4d..93ff5588 100644 --- a/tests/integration/test_fixtures.py +++ b/tests/integration/test_fixtures.py @@ -3,8 +3,9 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING +from crawlee._utils.crypto import crypto_random_object_id + from apify import Actor -from apify._crypto import crypto_random_object_id if TYPE_CHECKING: from apify_client import ApifyClientAsync diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index 9e81aa43..46afa2ab 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -19,14 +19,14 @@ async def main() -> None: # Add some requests for i in range(desired_request_count): print(f'Adding request {i}...') - await rq.add_request({'url': f'https://example.com/{i}'}) + await rq.add_request(f'https://example.com/{i}') handled_request_count = 0 while next_request := await rq.fetch_next_request(): print('Fetching next request...') queue_operation_info = await rq.mark_request_as_handled(next_request) assert queue_operation_info is not None - assert queue_operation_info['wasAlreadyHandled'] is False + assert queue_operation_info.was_already_handled is False handled_request_count += 1 assert handled_request_count == desired_request_count diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index 593ee080..8dd4db95 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -4,6 +4,7 @@ import httpx import pytest + from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index d9ba9c66..10400069 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -3,12 +3,13 @@ from typing import TYPE_CHECKING import pytest + from apify_shared.consts import ActorEnvVars from apify import Actor if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient + from crawlee.memory_storage_client import MemoryStorageClient # NOTE: We only test the dataset methods available on Actor class/instance. # Actual tests for the implementations are in storages/. 
@@ -52,7 +53,7 @@ async def test_open_datatset_based_env_var( class TestActorPushData: async def test_push_data(self: TestActorPushData) -> None: - async with Actor() as my_actor: + async with Actor as my_actor: dataset = await my_actor.open_dataset() desired_item_count = 100 await dataset.push_data([{'id': i} for i in range(desired_item_count)]) @@ -61,5 +62,4 @@ async def test_push_data(self: TestActorPushData) -> None: assert dataset_info is not None list_page = await dataset.get_data(limit=desired_item_count) - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 + assert {item['id'] for item in list_page.items} == set(range(desired_item_count)) diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index a6e3d6fd..36a5268f 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -2,9 +2,11 @@ import random import string -from datetime import datetime, timezone +from datetime import datetime, timedelta from typing import TYPE_CHECKING, Any +from pydantic_core import TzInfo + from apify_shared.consts import BOOL_ENV_VARS, DATETIME_ENV_VARS, FLOAT_ENV_VARS, INTEGER_ENV_VARS, STRING_ENV_VARS, ActorEnvVars, ApifyEnvVars from apify import Actor @@ -14,12 +16,13 @@ class TestIsAtHome: - async def test_is_at_home_local(self: TestIsAtHome) -> None: + async def test_is_at_home_local(self) -> None: async with Actor as actor: is_at_home = actor.is_at_home() assert is_at_home is False - async def test_is_at_home_on_apify(self: TestIsAtHome, monkeypatch: pytest.MonkeyPatch) -> None: + async def test_is_at_home_on_apify(self, monkeypatch: pytest.MonkeyPatch) -> None: + print('setenv') monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true') async with Actor as actor: is_at_home = actor.is_at_home() @@ -27,39 +30,92 @@ async def test_is_at_home_on_apify(self: TestIsAtHome, monkeypatch: pytest.Monke class TestGetEnv: - async def test_get_env_use_env_vars(self: TestGetEnv, monkeypatch: pytest.MonkeyPatch) -> None: + async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> None: # noqa: PLR0912 + ignored_env_vars = { + ApifyEnvVars.INPUT_KEY, + ApifyEnvVars.MEMORY_MBYTES, + ApifyEnvVars.STARTED_AT, + ApifyEnvVars.TIMEOUT_AT, + ApifyEnvVars.DEFAULT_DATASET_ID, + ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID, + ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID, + ApifyEnvVars.SDK_LATEST_VERSION, + ApifyEnvVars.LOG_FORMAT, + } + + legacy_env_vars = { + ApifyEnvVars.ACT_ID: ActorEnvVars.ID, + ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID, + ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID, + ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID, + ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER, + ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID, + ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID, + ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL, + ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT, + } + # Set up random env vars expected_get_env: dict[str, Any] = {} for int_env_var in INTEGER_ENV_VARS: + if int_env_var in ignored_env_vars: + continue + int_get_env_var = int_env_var.name.lower() expected_get_env[int_get_env_var] = random.randint(1, 99999) monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}') for float_env_var in FLOAT_ENV_VARS: + if float_env_var in ignored_env_vars: + continue + float_get_env_var = float_env_var.name.lower() expected_get_env[float_get_env_var] = random.random() monkeypatch.setenv(float_env_var, 
f'{expected_get_env[float_get_env_var]}') for bool_env_var in BOOL_ENV_VARS: + if bool_env_var in ignored_env_vars: + continue + bool_get_env_var = bool_env_var.name.lower() expected_get_env[bool_get_env_var] = random.choice([True, False]) monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}') for datetime_env_var in DATETIME_ENV_VARS: + if datetime_env_var in ignored_env_vars: + continue + datetime_get_env_var = datetime_env_var.name.lower() - expected_get_env[datetime_get_env_var] = datetime.now(timezone.utc) + expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0)) # type: ignore monkeypatch.setenv(datetime_env_var, expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ')) for string_env_var in STRING_ENV_VARS: + if string_env_var in ignored_env_vars: + continue + string_get_env_var = string_env_var.name.lower() expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10)) monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var]) # We need this override so that the actor doesn't fail when connecting to the platform events websocket monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL) + monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL) expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None + expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None + + # Adjust expectations for timedelta fields + for env_name, env_value in expected_get_env.items(): + if env_name.endswith('_millis'): + expected_get_env[env_name] = timedelta(milliseconds=env_value) + + # Convert dedicated_cpus to float + expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]) + + # Update expectations for legacy configuration + for old_name, new_name in legacy_env_vars.items(): + expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()] await Actor.init() - assert expected_get_env == Actor.get_env() + assert Actor.get_env() == expected_get_env await Actor.exit() diff --git a/tests/unit/actor/test_actor_helpers.py b/tests/unit/actor/test_actor_helpers.py index 92d0716d..0d6a08d0 100644 --- a/tests/unit/actor/test_actor_helpers.py +++ b/tests/unit/actor/test_actor_helpers.py @@ -6,6 +6,7 @@ from apify_shared.consts import ApifyEnvVars, WebhookEventType from apify import Actor +from apify._actor import _ActorType if TYPE_CHECKING: import pytest @@ -17,7 +18,7 @@ class TestActorNewClient: async def test_actor_new_client_config(self: TestActorNewClient, monkeypatch: pytest.MonkeyPatch) -> None: token = 'my-token' monkeypatch.setenv(ApifyEnvVars.TOKEN, token) - my_actor = Actor() + my_actor = _ActorType() await my_actor.init() client = my_actor.new_client() @@ -39,71 +40,64 @@ async def test_actor_call( ) -> None: apify_client_async_patcher.patch('actor', 'call', return_value=None) actor_id = 'some-actor-id' - my_actor = Actor() - await my_actor.init() - await my_actor.call(actor_id) + async with Actor: + await Actor.call(actor_id) + assert len(apify_client_async_patcher.calls['actor']['call']) == 1 # The first argument is ActorClientAsync, which was called, let's check its id. 
assert apify_client_async_patcher.calls['actor']['call'][0][0][0].resource_id == actor_id - await my_actor.exit() - async def test_actor_call_task( self: TestActorCallStartAbortActor, apify_client_async_patcher: ApifyClientAsyncPatcher, ) -> None: apify_client_async_patcher.patch('task', 'call', return_value=None) task_id = 'some-task-id' - my_actor = Actor() - await my_actor.init() - await my_actor.call_task(task_id) + async with Actor: + await Actor.call_task(task_id) + assert len(apify_client_async_patcher.calls['task']['call']) == 1 assert apify_client_async_patcher.calls['task']['call'][0][0][0].resource_id == task_id - await my_actor.exit() - async def test_actor_start( self: TestActorCallStartAbortActor, apify_client_async_patcher: ApifyClientAsyncPatcher, ) -> None: apify_client_async_patcher.patch('actor', 'start', return_value=None) actor_id = 'some-id' - my_actor = Actor() - await my_actor.init() - await my_actor.start(actor_id) + async with Actor: + await Actor.start(actor_id) + assert len(apify_client_async_patcher.calls['actor']['start']) == 1 assert apify_client_async_patcher.calls['actor']['start'][0][0][0].resource_id == actor_id - await my_actor.exit() - async def test_actor_abort( self: TestActorCallStartAbortActor, apify_client_async_patcher: ApifyClientAsyncPatcher, ) -> None: apify_client_async_patcher.patch('run', 'abort', return_value=None) run_id = 'some-run-id' - my_actor = Actor() - await my_actor.init() - await my_actor.abort(run_id) + async with Actor: + await Actor.abort(run_id) + assert len(apify_client_async_patcher.calls['run']['abort']) == 1 assert apify_client_async_patcher.calls['run']['abort'][0][0][0].resource_id == run_id - await my_actor.exit() - class TestActorMethodsWorksOnlyOnPlatform: - # NOTE: These medhods will be tested properly using integrations tests. + # NOTE: These methods will be tested properly using integration tests. async def test_actor_metamorpth_not_work_locally( self: TestActorMethodsWorksOnlyOnPlatform, caplog: pytest.LogCaptureFixture, ) -> None: - async with Actor() as my_actor: - await my_actor.metamorph('random-id') + async with Actor: + await Actor.metamorph('random-id') + assert len(caplog.records) == 1 assert caplog.records[0].levelname == 'ERROR' assert 'Actor.metamorph() is only supported when running on the Apify platform.' in caplog.records[0].message @@ -112,8 +106,9 @@ async def test_actor_reboot_not_work_locally( self: TestActorMethodsWorksOnlyOnPlatform, caplog: pytest.LogCaptureFixture, ) -> None: - async with Actor() as my_actor: - await my_actor.reboot() + async with Actor: + await Actor.reboot() + assert len(caplog.records) == 1 assert caplog.records[0].levelname == 'ERROR' assert 'Actor.reboot() is only supported when running on the Apify platform.' in caplog.records[0].message @@ -122,8 +117,9 @@ async def test_actor_add_webhook_not_work_locally( self: TestActorMethodsWorksOnlyOnPlatform, caplog: pytest.LogCaptureFixture, ) -> None: - async with Actor() as my_actor: - await my_actor.add_webhook(event_types=[WebhookEventType.ACTOR_BUILD_ABORTED], request_url='https://example.com') + async with Actor: + await Actor.add_webhook(event_types=[WebhookEventType.ACTOR_BUILD_ABORTED], request_url='https://example.com') + assert len(caplog.records) == 1 assert caplog.records[0].levelname == 'ERROR' assert 'Actor.add_webhook() is only supported when running on the Apify platform.'
in caplog.records[0].message @@ -133,8 +129,9 @@ async def test_actor_set_status_message_mock_locally( caplog: pytest.LogCaptureFixture, ) -> None: caplog.set_level('INFO') - async with Actor() as my_actor: - await my_actor.set_status_message('test-status-message') + async with Actor: + await Actor.set_status_message('test-status-message') + matching_records = [record for record in caplog.records if 'test-status-message' in record.message] assert len(matching_records) == 1 assert matching_records[0].levelname == 'INFO' @@ -145,8 +142,9 @@ async def test_actor_set_status_message_terminal_mock_locally( caplog: pytest.LogCaptureFixture, ) -> None: caplog.set_level('INFO') - async with Actor() as my_actor: - await my_actor.fail(status_message='test-terminal-message') + async with Actor: + await Actor.fail(status_message='test-terminal-message') + matching_records = [record for record in caplog.records if 'test-terminal-message' in record.message] assert len(matching_records) == 1 assert matching_records[0].levelname == 'INFO' diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 3de07378..5d855f36 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -3,16 +3,17 @@ from typing import TYPE_CHECKING import pytest + from apify_shared.consts import ApifyEnvVars from apify_shared.utils import json_dumps from ..test_crypto import PRIVATE_KEY_PASSWORD, PRIVATE_KEY_PEM_BASE64, PUBLIC_KEY from apify import Actor +from apify._consts import ENCRYPTED_INPUT_VALUE_PREFIX from apify._crypto import public_encrypt -from apify.consts import ENCRYPTED_INPUT_VALUE_PREFIX if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient + from crawlee.memory_storage_client import MemoryStorageClient # NOTE: We only test the key-value store methods available on Actor class/instance. 
@@ -44,7 +45,7 @@ async def test_get_set_value(self: TestKeyValueStoreOnActor) -> None: test_key = 'test_key' test_value = 'test_value' test_content_type = 'text/plain' - async with Actor() as my_actor: + async with Actor as my_actor: await my_actor.set_value(key=test_key, value=test_value, content_type=test_content_type) value = await my_actor.get_value(key=test_key) assert value == test_value @@ -53,14 +54,14 @@ async def test_get_input(self: TestKeyValueStoreOnActor, memory_storage_client: input_key = 'INPUT' test_input = {'foo': 'bar'} - await memory_storage_client.key_value_stores().get_or_create(_id='default') + await memory_storage_client.key_value_stores().get_or_create(id='default') await memory_storage_client.key_value_store('default').set_record( key=input_key, value=json_dumps(test_input), content_type='application/json', ) - async with Actor() as my_actor: + async with Actor as my_actor: input = await my_actor.get_input() # noqa: A001 assert input['foo'] == test_input['foo'] @@ -80,14 +81,14 @@ async def test_get_input_with_secrets( 'secret': f'{ENCRYPTED_INPUT_VALUE_PREFIX}:{encrypted_secret["encrypted_password"]}:{encrypted_secret["encrypted_value"]}', } - await memory_storage_client.key_value_stores().get_or_create(_id='default') + await memory_storage_client.key_value_stores().get_or_create(id='default') await memory_storage_client.key_value_store('default').set_record( key=input_key, value=json_dumps(input_with_secret), content_type='application/json', ) - async with Actor() as my_actor: + async with Actor as my_actor: input = await my_actor.get_input() # noqa: A001 assert input['foo'] == input_with_secret['foo'] assert input['secret'] == secret_string diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index ef187f47..8053f2f3 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -2,24 +2,30 @@ import asyncio import contextlib -from datetime import datetime -from typing import Any, Callable -from unittest.mock import AsyncMock +import json +from typing import Any, Callable, cast import pytest -from apify_shared.consts import ActorEventTypes, ApifyEnvVars +import websockets.server +from lazy_object_proxy import Proxy +from apify_shared.consts import ApifyEnvVars +from crawlee.events._types import Event, EventPersistStateData + +import apify._actor from apify import Actor +from apify._actor import _ActorType class TestActorInit: async def test_async_with_actor_properly_initialize(self: TestActorInit) -> None: async with Actor: - assert Actor._get_default_instance()._is_initialized - assert Actor._get_default_instance()._is_initialized is False + assert cast(Proxy, apify._actor.Actor).__wrapped__ is not None + assert cast(Proxy, apify._actor.Actor).__wrapped__._is_initialized + assert not cast(Proxy, apify._actor.Actor).__wrapped__._is_initialized async def test_actor_init(self: TestActorInit) -> None: - my_actor = Actor() + my_actor = _ActorType() await my_actor.init() assert my_actor._is_initialized is True @@ -28,7 +34,7 @@ async def test_actor_init(self: TestActorInit) -> None: assert my_actor._is_initialized is False async def test_double_init(self: TestActorInit) -> None: - my_actor = Actor() + my_actor = _ActorType() await my_actor.init() with pytest.raises(RuntimeError): @@ -48,20 +54,20 @@ async def test_with_actor_exit(self: TestActorExit, monkeypatch: pytest.MonkeyPa on_persist = [] on_system_info = [] - def on_event(event_type: ActorEventTypes) -> Callable: + def 
on_event(event_type: Event) -> Callable: nonlocal on_persist nonlocal on_system_info - if event_type == ActorEventTypes.PERSIST_STATE: + if event_type == Event.PERSIST_STATE: return lambda data: on_persist.append(data) - if event_type == ActorEventTypes.SYSTEM_INFO: + if event_type == Event.SYSTEM_INFO: return lambda data: on_system_info.append(data) return lambda data: print(data) - my_actor = Actor() + my_actor = _ActorType() async with my_actor: assert my_actor._is_initialized - my_actor.on(ActorEventTypes.PERSIST_STATE, on_event(ActorEventTypes.PERSIST_STATE)) - my_actor.on(ActorEventTypes.SYSTEM_INFO, on_event(ActorEventTypes.SYSTEM_INFO)) + my_actor.on(Event.PERSIST_STATE, on_event(Event.PERSIST_STATE)) + my_actor.on(Event.SYSTEM_INFO, on_event(Event.SYSTEM_INFO)) await asyncio.sleep(1) on_persist_count = len(on_persist) @@ -73,26 +79,28 @@ def on_event(event_type: ActorEventTypes) -> Callable: await asyncio.sleep(0.2) assert on_persist_count == len(on_persist) assert on_system_info_count == len(on_system_info) - # Check `createdAt` is a datetime (so it's the same locally and on platform) - assert isinstance(on_system_info[0]['createdAt'], datetime) - async def test_raise_on_exit_witout_init(self: TestActorExit) -> None: + async def test_raise_on_exit_without_init(self: TestActorExit) -> None: with pytest.raises(RuntimeError): await Actor.exit() class TestActorFail: async def test_with_actor_fail(self: TestActorFail) -> None: - async with Actor() as my_actor: + async with _ActorType() as my_actor: assert my_actor._is_initialized await my_actor.fail() assert my_actor._is_initialized is False async def test_with_actor_failed(self: TestActorFail) -> None: + my_actor = None + with contextlib.suppress(Exception): - async with Actor() as my_actor: + async with _ActorType() as my_actor: assert my_actor._is_initialized raise Exception('Failed') # noqa: TRY002 + + assert my_actor is not None assert my_actor._is_initialized is False async def test_raise_on_fail_without_init(self: TestActorFail) -> None: @@ -104,53 +112,12 @@ async def test_actor_reboot_not_work_locally(self: TestActorFail) -> None: await Actor.reboot() -class TestActorMainMethod: - async def test_actor_main_method(self: TestActorMainMethod) -> None: - my_actor = Actor() - main_was_called = False - - async def actor_function() -> None: - nonlocal main_was_called - main_was_called = True - assert my_actor._is_initialized - - await my_actor.main(actor_function) - assert my_actor._is_initialized is False - assert main_was_called - - async def test_actor_main_method_throw_exception(self: TestActorMainMethod) -> None: - my_actor = Actor() - err = Exception('Failed') - my_actor.fail = AsyncMock() # type: ignore - - async def actor_function() -> None: - nonlocal err - raise err - - await my_actor.main(actor_function) - # NOTE: Actor didn't call sys.exit() during testing, check if fail was called. 
- my_actor.fail.assert_called_with(exit_code=91, exception=err) - - # This is necessary to stop the event emitting intervals - await my_actor.exit() - - async def test_actor_main_method_raise_return_value(self: TestActorMainMethod) -> None: - my_actor = Actor() - expected_string = 'Hello world' - - async def actor_function() -> str: - nonlocal expected_string - return expected_string - - returned_value = await my_actor.main(actor_function) - assert returned_value == expected_string - - class TestMigratingEvent: async def test_migrating_event(self: TestMigratingEvent, monkeypatch: pytest.MonkeyPatch) -> None: # This should test whether when you get a MIGRATING event, - # the actor automatically emits the PERSIST_STATE event with data `{'isMigrating': True}` + # the Actor automatically emits the PERSIST_STATE event with data `{'isMigrating': True}` monkeypatch.setenv(ApifyEnvVars.PERSIST_STATE_INTERVAL_MILLIS, '500') + monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, '1') persist_state_events_data = [] @@ -158,19 +125,38 @@ def log_persist_state(data: Any) -> None: nonlocal persist_state_events_data persist_state_events_data.append(data) - async with Actor: - Actor.on(ActorEventTypes.PERSIST_STATE, log_persist_state) - await asyncio.sleep(2) - Actor._get_default_instance()._event_manager.emit(ActorEventTypes.MIGRATING, None) - await asyncio.sleep(1) + async def handler(websocket: websockets.server.WebSocketServerProtocol) -> None: + await websocket.wait_closed() + + async with websockets.server.serve(handler, host='localhost') as ws_server: + port: int = ws_server.sockets[0].getsockname()[1] # type: ignore[index] + monkeypatch.setenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL, f'ws://localhost:{port}') + + async with Actor: + Actor.on(Event.PERSIST_STATE, log_persist_state) + await asyncio.sleep(2) + + for socket in ws_server.websockets: + await socket.send( + json.dumps( + { + 'name': 'migrating', + 'data': { + 'isMigrating': True, + }, + } + ) + ) + + await asyncio.sleep(1) assert len(persist_state_events_data) >= 3 print(persist_state_events_data) # Check if the last event is from the migration - assert persist_state_events_data.pop() == {'isMigrating': True} + assert persist_state_events_data.pop() == EventPersistStateData(is_migrating=True) # Check if all the other events are regular persist state events for event_data in persist_state_events_data: - assert event_data == {'isMigrating': False} + assert event_data == EventPersistStateData(is_migrating=False) diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index c1f9cc30..9103c7c3 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -8,15 +8,17 @@ from apify_client import __version__ as apify_client_version from apify import Actor, __version__ -from apify.log import logger +from apify._log import logger if TYPE_CHECKING: import pytest class TestActorLog: - async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture) -> None: + async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch) -> None: caplog.set_level(logging.DEBUG, logger='apify') + monkeypatch.setenv('APIFY_IS_AT_HOME', '1') + with contextlib.suppress(RuntimeError): async with Actor: # Test Actor.log @@ -42,7 +44,7 @@ async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture) - assert len(caplog.records) == 12 assert caplog.records[0].levelno == logging.INFO - assert caplog.records[0].message == 'Initializing actor...' 
+ assert caplog.records[0].message == 'Initializing Actor...' assert caplog.records[1].levelno == logging.INFO assert caplog.records[1].message == 'System info' @@ -84,7 +86,7 @@ async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture) - assert str(caplog.records[9].exc_info[1]) == 'Dummy RuntimeError' assert caplog.records[10].levelno == logging.INFO - assert caplog.records[10].message == 'Exiting actor' + assert caplog.records[10].message == 'Exiting Actor' assert caplog.records[11].levelno == logging.DEBUG - assert caplog.records[11].message == 'Not calling sys.exit(91) because actor is running in an unit test' + assert caplog.records[11].message == 'Not calling sys.exit(91) because Actor is running in an unit test' diff --git a/tests/unit/actor/test_actor_memory_storage_e2e.py b/tests/unit/actor/test_actor_memory_storage_e2e.py deleted file mode 100644 index dd1d541e..00000000 --- a/tests/unit/actor/test_actor_memory_storage_e2e.py +++ /dev/null @@ -1,130 +0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import Callable - -import pytest -from apify_shared.consts import ApifyEnvVars - -from apify import Actor -from apify.storages import StorageClientManager - - -@pytest.mark.parametrize('purge_on_start', [True, False]) -async def test_actor_memory_storage_client_key_value_store_e2e( - monkeypatch: pytest.MonkeyPatch, - purge_on_start: bool, # noqa: FBT001 - reset_default_instances: Callable[[], None], -) -> None: - """This test simulates two clean runs using memory storage. - The second run attempts to access data created by the first one. - We run 2 configurations with different `purge_on_start`.""" - # Configure purging env var - monkeypatch.setenv(ApifyEnvVars.PURGE_ON_START, f'{int(purge_on_start)}') - # Store old storage client so we have the object reference for comparison - old_client = StorageClientManager.get_storage_client() - async with Actor: - old_default_kvs = await Actor.open_key_value_store() - old_non_default_kvs = await Actor.open_key_value_store(name='non-default') - # Create data in default and non-default key-value store - await old_default_kvs.set_value('test', 'default value') - await old_non_default_kvs.set_value('test', 'non-default value') - - # We simulate another clean run, we expect the memory storage to read from the local data directory - # Default storages are purged based on purge_on_start parameter. - reset_default_instances() - - async with Actor: - # Check if we're using a different memory storage instance - assert old_client is not StorageClientManager.get_storage_client() - default_kvs = await Actor.open_key_value_store() - assert default_kvs is not old_default_kvs - non_default_kvs = await Actor.open_key_value_store(name='non-default') - assert non_default_kvs is not old_non_default_kvs - default_value = await default_kvs.get_value('test') - non_default_value = await non_default_kvs.get_value('test') - if purge_on_start: - assert default_value is None - else: - assert default_value == 'default value' - assert non_default_value == 'non-default value' - - -@pytest.mark.parametrize('purge_on_start', [True, False]) -async def test_actor_memory_storage_client_request_queue_e2e( - monkeypatch: pytest.MonkeyPatch, - purge_on_start: bool, # noqa: FBT001 - reset_default_instances: Callable[[], None], -) -> None: - """This test simulates two clean runs using memory storage. - The second run attempts to access data created by the first one. 
- We run 2 configurations with different `purge_on_start`.""" - # Configure purging env var - monkeypatch.setenv(ApifyEnvVars.PURGE_ON_START, f'{int(purge_on_start)}') - async with Actor: - # Add some requests to the default queue - default_queue = await Actor.open_request_queue() - for i in range(6): - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await default_queue.add_request( - { - 'uniqueKey': str(i), - 'url': request_url, - 'handledAt': datetime.now(timezone.utc) if was_handled else None, - }, - forefront=forefront, - ) - - # We simulate another clean run, we expect the memory storage to read from the local data directory - # Default storages are purged based on purge_on_start parameter. - reset_default_instances() - - async with Actor: - # Add some more requests to the default queue - default_queue = await Actor.open_request_queue() - for i in range(6, 12): - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await default_queue.add_request( - { - 'uniqueKey': str(i), - 'url': request_url, - 'handledAt': datetime.now(timezone.utc) if was_handled else None, - }, - forefront=forefront, - ) - - queue_info = await default_queue.get_info() - assert queue_info is not None - - # If the queue was purged between the runs, only the requests from the second run should be present, in the right order - if purge_on_start: - assert queue_info.get('totalRequestCount') == 6 - assert queue_info.get('handledRequestCount') == 2 - - expected_pending_request_order = [10, 7, 6, 9] - for request_number in expected_pending_request_order: - next_request = await default_queue.fetch_next_request() - assert next_request is not None - assert next_request.get('uniqueKey') == f'{request_number}' - assert next_request.get('url') == f'http://example.com/{request_number}' - - next_request = await default_queue.fetch_next_request() - assert next_request is None - # If the queue was NOT purged between the runs, all the requests should be in the queue in the right order - else: - assert queue_info.get('totalRequestCount') == 12 - assert queue_info.get('handledRequestCount') == 4 - - expected_pending_request_order = [10, 7, 4, 1, 0, 3, 6, 9] - for request_number in expected_pending_request_order: - next_request = await default_queue.fetch_next_request() - assert next_request is not None - assert next_request.get('uniqueKey') == f'{request_number}' - assert next_request.get('url') == f'http://example.com/{request_number}' - - next_request = await default_queue.fetch_next_request() - assert next_request is None diff --git a/tests/unit/actor/test_actor_non_default_instance.py b/tests/unit/actor/test_actor_non_default_instance.py new file mode 100644 index 00000000..e9d34a0b --- /dev/null +++ b/tests/unit/actor/test_actor_non_default_instance.py @@ -0,0 +1,8 @@ +from datetime import timedelta + +from apify import Actor, Configuration + + +async def test_actor_non_default_instance() -> None: + async with Actor(Configuration(internal_timeout=timedelta(minutes=111))) as actor: + assert actor.config.internal_timeout == timedelta(minutes=111) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 39e2cb17..2c441883 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -4,33 +4,46 @@ import inspect from collections import defaultdict from copy import deepcopy -from typing import TYPE_CHECKING, Any, Callable, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, cast, get_type_hints import pytest + from 
apify_client.client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars +from crawlee.configuration import Configuration as CrawleeConfiguration +from crawlee.memory_storage_client import MemoryStorageClient -from apify import Actor -from apify._memory_storage import MemoryStorageClient -from apify.config import Configuration -from apify.storages import Dataset, KeyValueStore, RequestQueue, StorageClientManager +import apify._actor if TYPE_CHECKING: from pathlib import Path @pytest.fixture() -def reset_default_instances(monkeypatch: pytest.MonkeyPatch) -> Callable[[], None]: +def reset_default_instances() -> Callable[[], None]: def reset() -> None: - monkeypatch.setattr(Actor, '_default_instance', None) - monkeypatch.setattr(Configuration, '_default_instance', None) - monkeypatch.setattr(Dataset, '_cache_by_id', None) - monkeypatch.setattr(Dataset, '_cache_by_name', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_id', None) - monkeypatch.setattr(KeyValueStore, '_cache_by_name', None) - monkeypatch.setattr(RequestQueue, '_cache_by_id', None) - monkeypatch.setattr(RequestQueue, '_cache_by_name', None) - monkeypatch.setattr(StorageClientManager, '_default_instance', None) + from crawlee.storages._creation_management import ( + _cache_dataset_by_id, + _cache_dataset_by_name, + _cache_kvs_by_id, + _cache_kvs_by_name, + _cache_rq_by_id, + _cache_rq_by_name, + ) + + _cache_dataset_by_id.clear() + _cache_dataset_by_name.clear() + _cache_kvs_by_id.clear() + _cache_kvs_by_name.clear() + _cache_rq_by_id.clear() + _cache_rq_by_name.clear() + + from crawlee import service_container + + cast(dict, service_container._services).clear() + + delattr(apify._actor.Actor, '__wrapped__') + # TODO: local storage client purge # noqa: TD003 return reset @@ -39,11 +52,11 @@ def reset() -> None: # We also set the MemoryStorageClient to use a temp path @pytest.fixture(autouse=True) def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch, tmp_path: Path, reset_default_instances: Callable[[], None]) -> None: - reset_default_instances() - # This forces the MemoryStorageClient to use tmp_path for its storage dir monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) + reset_default_instances() + # This class is used to patch the ApifyClientAsync methods to return a fixed value or be replaced with another method. class ApifyClientAsyncPatcher: @@ -70,13 +83,13 @@ def patch( One of `return_value` and `replacement_method` arguments must be specified. Args: - method (str): Which root method to patch in the ApifyClientAsync. - submethod (str): Which submethod to patch in the root method's result. - return_value (optional, Any): What should the patched method return. - replacement_method (optional, Callable): What method should the original method be replaced by. - is_async (optional, bool): Whether the return value or replacement method should be wrapped by an async wrapper, - in order to not break any `await` statements. - If not passed, it is automatically detected from the type of the method which is being replaced. + method: Which root method to patch in the ApifyClientAsync. + submethod: Which submethod to patch in the root method's result. + return_value: What should the patched method return. + replacement_method: What method should the original method be replaced by. + is_async: Whether the return value or replacement method should be wrapped by an async wrapper, + in order to not break any `await` statements. 
+ If not passed, it is automatically detected from the type of the method which is being replaced. """ client_method = getattr(ApifyClientAsync, method, None) @@ -157,4 +170,8 @@ def apify_client_async_patcher(monkeypatch: pytest.MonkeyPatch) -> ApifyClientAs @pytest.fixture() def memory_storage_client() -> MemoryStorageClient: - return MemoryStorageClient(write_metadata=True, persist_storage=True) + configuration = CrawleeConfiguration() + configuration.persist_storage = True + configuration.write_metadata = True + + return MemoryStorageClient(configuration) diff --git a/tests/unit/memory_storage/__init__.py b/tests/unit/memory_storage/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/memory_storage/resource_clients/__init__.py b/tests/unit/memory_storage/resource_clients/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/memory_storage/resource_clients/test_dataset.py b/tests/unit/memory_storage/resource_clients/test_dataset.py deleted file mode 100644 index 6c5aaecf..00000000 --- a/tests/unit/memory_storage/resource_clients/test_dataset.py +++ /dev/null @@ -1,138 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import DatasetClient - - -@pytest.fixture() -async def dataset_client(memory_storage_client: MemoryStorageClient) -> DatasetClient: - datasets_client = memory_storage_client.datasets() - dataset_info = await datasets_client.get_or_create(name='test') - return memory_storage_client.dataset(dataset_info['id']) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - dataset_client = memory_storage_client.dataset(dataset_id='nonexistent-id') - assert await dataset_client.get() is None - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.update(name='test-update') - - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.list_items() - - with pytest.raises(ValueError, match='Dataset with id "nonexistent-id" does not exist.'): - await dataset_client.push_items([{'abc': 123}]) - await dataset_client.delete() - - -async def test_not_implemented(dataset_client: DatasetClient) -> None: - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await dataset_client.stream_items() - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await dataset_client.get_items_as_bytes() - - -async def test_get(dataset_client: DatasetClient) -> None: - await asyncio.sleep(0.1) - info = await dataset_client.get() - assert info is not None - assert info['id'] == dataset_client._id - assert info['accessedAt'] != info['createdAt'] - - -async def test_update(dataset_client: DatasetClient) -> None: - new_dataset_name = 'test-update' - await dataset_client.push_items({'abc': 123}) - - old_dataset_info = await dataset_client.get() - assert old_dataset_info is not None - old_dataset_directory = os.path.join(dataset_client._memory_storage_client._datasets_directory, old_dataset_info['name']) - new_dataset_directory = os.path.join(dataset_client._memory_storage_client._datasets_directory, new_dataset_name) - assert os.path.exists(os.path.join(old_dataset_directory, '000000001.json')) is True - assert 
os.path.exists(os.path.join(new_dataset_directory, '000000001.json')) is False - - await asyncio.sleep(0.1) - updated_dataset_info = await dataset_client.update(name=new_dataset_name) - assert os.path.exists(os.path.join(old_dataset_directory, '000000001.json')) is False - assert os.path.exists(os.path.join(new_dataset_directory, '000000001.json')) is True - # Only modifiedAt and accessedAt should be different - assert old_dataset_info['createdAt'] == updated_dataset_info['createdAt'] - assert old_dataset_info['modifiedAt'] != updated_dataset_info['modifiedAt'] - assert old_dataset_info['accessedAt'] != updated_dataset_info['accessedAt'] - - # Should fail with the same name - with pytest.raises(ValueError, match='Dataset with name "test-update" already exists.'): - await dataset_client.update(name=new_dataset_name) - - -async def test_delete(dataset_client: DatasetClient) -> None: - await dataset_client.push_items({'abc': 123}) - dataset_info = await dataset_client.get() - assert dataset_info is not None - dataset_directory = os.path.join(dataset_client._memory_storage_client._datasets_directory, dataset_info['name']) - assert os.path.exists(os.path.join(dataset_directory, '000000001.json')) is True - await dataset_client.delete() - assert os.path.exists(os.path.join(dataset_directory, '000000001.json')) is False - # Does not crash when called again - await dataset_client.delete() - - -async def test_push_items(dataset_client: DatasetClient) -> None: - await dataset_client.push_items('{"test": "JSON from a string"}') - await dataset_client.push_items({'abc': {'def': {'ghi': '123'}}}) - await dataset_client.push_items(['{"test-json-parse": "JSON from a string"}' for _ in range(10)]) - await dataset_client.push_items([{'test-dict': i} for i in range(10)]) - - list_page = await dataset_client.list_items() - assert list_page.items[0]['test'] == 'JSON from a string' - assert list_page.items[1]['abc']['def']['ghi'] == '123' - assert list_page.items[11]['test-json-parse'] == 'JSON from a string' - assert list_page.items[21]['test-dict'] == 9 - assert list_page.count == 22 - - -async def test_list_items(dataset_client: DatasetClient) -> None: - item_count = 100 - used_offset = 10 - used_limit = 50 - await dataset_client.push_items([{'id': i} for i in range(item_count)]) - # Test without any parameters - list_default = await dataset_client.list_items() - assert list_default.count == item_count - assert list_default.offset == 0 - assert list_default.items[0]['id'] == 0 - assert list_default.desc is False - # Test offset - list_offset_10 = await dataset_client.list_items(offset=used_offset) - assert list_offset_10.count == item_count - used_offset - assert list_offset_10.offset == used_offset - assert list_offset_10.total == item_count - assert list_offset_10.items[0]['id'] == used_offset - # Test limit - list_limit_50 = await dataset_client.list_items(limit=used_limit) - assert list_limit_50.count == used_limit - assert list_limit_50.limit == used_limit - assert list_limit_50.total == item_count - # Test desc - list_desc_true = await dataset_client.list_items(desc=True) - assert list_desc_true.items[0]['id'] == 99 - assert list_desc_true.desc is True - - -async def test_iterate_items(dataset_client: DatasetClient) -> None: - item_count = 100 - await dataset_client.push_items([{'id': i} for i in range(item_count)]) - actual_items = [] - async for item in dataset_client.iterate_items(): - assert 'id' in item - actual_items.append(item) - assert len(actual_items) == item_count - assert 
actual_items[0]['id'] == 0 - assert actual_items[99]['id'] == 99 diff --git a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py deleted file mode 100644 index 89b79228..00000000 --- a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import DatasetCollectionClient - - -@pytest.fixture() -def datasets_client(memory_storage_client: MemoryStorageClient) -> DatasetCollectionClient: - return memory_storage_client.datasets() - - -async def test_get_or_create(datasets_client: DatasetCollectionClient) -> None: - dataset_name = 'test' - # A new dataset gets created - dataset_info = await datasets_client.get_or_create(name=dataset_name) - assert dataset_info['name'] == dataset_name - - # Another get_or_create call returns the same dataset - dataset_info_existing = await datasets_client.get_or_create(name=dataset_name) - assert dataset_info['id'] == dataset_info_existing['id'] - assert dataset_info['name'] == dataset_info_existing['name'] - assert dataset_info['createdAt'] == dataset_info_existing['createdAt'] - - -async def test_list(datasets_client: DatasetCollectionClient) -> None: - assert (await datasets_client.list()).count == 0 - dataset_info = await datasets_client.get_or_create(name='dataset') - dataset_list = await datasets_client.list() - assert dataset_list.count == 1 - assert dataset_list.items[0]['name'] == dataset_info['name'] - - # Test sorting behavior - newer_dataset_info = await datasets_client.get_or_create(name='newer-dataset') - dataset_list_sorting = await datasets_client.list() - assert dataset_list_sorting.count == 2 - assert dataset_list_sorting.items[0]['name'] == dataset_info['name'] - assert dataset_list_sorting.items[1]['name'] == newer_dataset_info['name'] diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store.py b/tests/unit/memory_storage/resource_clients/test_key_value_store.py deleted file mode 100644 index 3d885320..00000000 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store.py +++ /dev/null @@ -1,403 +0,0 @@ -from __future__ import annotations - -import asyncio -import base64 -import json -import os -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -import pytest -from apify_shared.utils import json_dumps - -from apify._crypto import crypto_random_object_id -from apify._utils import maybe_parse_body - -if TYPE_CHECKING: - from pathlib import Path - - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import KeyValueStoreClient - -TINY_PNG = base64.b64decode('iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVQYV2NgYAAAAAMAAWgmWQ0AAAAASUVORK5CYII=') -TINY_BYTES = b'\x12\x34\x56\x78\x90\xAB\xCD\xEF' -TINY_DATA = {'a': 'b'} -TINY_TEXT = 'abcd' - - -@pytest.fixture() -async def key_value_store_client(memory_storage_client: MemoryStorageClient) -> KeyValueStoreClient: - key_value_stores_client = memory_storage_client.key_value_stores() - kvs_info = await key_value_stores_client.get_or_create(name='test') - return memory_storage_client.key_value_store(kvs_info['id']) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - kvs_client = 
memory_storage_client.key_value_store(key_value_store_id='nonexistent-id') - assert await kvs_client.get() is None - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.update(name='test-update') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.list_keys() - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.set_record('test', {'abc': 123}) - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.get_record('test') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.get_record_as_bytes('test') - - with pytest.raises(ValueError, match='Key-value store with id "nonexistent-id" does not exist.'): - await kvs_client.delete_record('test') - - await kvs_client.delete() - - -async def test_not_implemented(key_value_store_client: KeyValueStoreClient) -> None: - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await key_value_store_client.stream_record('test') - - -async def test_get(key_value_store_client: KeyValueStoreClient) -> None: - await asyncio.sleep(0.1) - info = await key_value_store_client.get() - assert info is not None - assert info['id'] == key_value_store_client._id - assert info['accessedAt'] != info['createdAt'] - - -async def test_update(key_value_store_client: KeyValueStoreClient) -> None: - new_kvs_name = 'test-update' - await key_value_store_client.set_record('test', {'abc': 123}) - old_kvs_info = await key_value_store_client.get() - assert old_kvs_info is not None - old_kvs_directory = os.path.join(key_value_store_client._memory_storage_client._key_value_stores_directory, old_kvs_info['name']) - new_kvs_directory = os.path.join(key_value_store_client._memory_storage_client._key_value_stores_directory, new_kvs_name) - assert os.path.exists(os.path.join(old_kvs_directory, 'test.json')) is True - assert os.path.exists(os.path.join(new_kvs_directory, 'test.json')) is False - - await asyncio.sleep(0.1) - updated_kvs_info = await key_value_store_client.update(name=new_kvs_name) - assert os.path.exists(os.path.join(old_kvs_directory, 'test.json')) is False - assert os.path.exists(os.path.join(new_kvs_directory, 'test.json')) is True - # Only modifiedAt and accessedAt should be different - assert old_kvs_info['createdAt'] == updated_kvs_info['createdAt'] - assert old_kvs_info['modifiedAt'] != updated_kvs_info['modifiedAt'] - assert old_kvs_info['accessedAt'] != updated_kvs_info['accessedAt'] - - # Should fail with the same name - with pytest.raises(ValueError, match='Key-value store with name "test-update" already exists.'): - await key_value_store_client.update(name=new_kvs_name) - - -async def test_delete(key_value_store_client: KeyValueStoreClient) -> None: - await key_value_store_client.set_record('test', {'abc': 123}) - kvs_info = await key_value_store_client.get() - assert kvs_info is not None - kvs_directory = os.path.join(key_value_store_client._memory_storage_client._key_value_stores_directory, kvs_info['name']) - assert os.path.exists(os.path.join(kvs_directory, 'test.json')) is True - await key_value_store_client.delete() - assert os.path.exists(os.path.join(kvs_directory, 'test.json')) is False - # Does not crash when called again - await key_value_store_client.delete() - - -async 
def test_list_keys_empty(key_value_store_client: KeyValueStoreClient) -> None: - keys = await key_value_store_client.list_keys() - assert len(keys['items']) == 0 - assert keys['count'] == 0 - assert keys['isTruncated'] is False - - -async def test_list_keys(key_value_store_client: KeyValueStoreClient) -> None: - record_count = 4 - used_limit = 2 - used_exclusive_start_key = 'a' - await key_value_store_client.set_record('b', 'test') - await key_value_store_client.set_record('a', 'test') - await key_value_store_client.set_record('d', 'test') - await key_value_store_client.set_record('c', 'test') - - # Default settings - keys = await key_value_store_client.list_keys() - assert keys['items'][0]['key'] == 'a' - assert keys['items'][3]['key'] == 'd' - assert keys['count'] == record_count - assert keys['isTruncated'] is False - # Test limit - keys_limit_2 = await key_value_store_client.list_keys(limit=used_limit) - assert keys_limit_2['count'] == record_count - assert keys_limit_2['limit'] == used_limit - assert keys_limit_2['items'][1]['key'] == 'b' - # Test exclusive start key - keys_exclusive_start = await key_value_store_client.list_keys(exclusive_start_key=used_exclusive_start_key, limit=2) - assert keys_exclusive_start['exclusiveStartKey'] == used_exclusive_start_key - assert keys_exclusive_start['isTruncated'] is True - assert keys_exclusive_start['nextExclusiveStartKey'] == 'c' - assert keys_exclusive_start['items'][0]['key'] == 'b' - assert keys_exclusive_start['items'][-1]['key'] == keys_exclusive_start['nextExclusiveStartKey'] - - -async def test_get_and_set_record(tmp_path: Path, key_value_store_client: KeyValueStoreClient) -> None: - # Test setting dict record - dict_record_key = 'test-dict' - await key_value_store_client.set_record(dict_record_key, {'test': 123}) - dict_record_info = await key_value_store_client.get_record(dict_record_key) - assert dict_record_info is not None - assert 'application/json' in dict_record_info['contentType'] - assert dict_record_info['value']['test'] == 123 - - # Test setting str record - str_record_key = 'test-str' - await key_value_store_client.set_record(str_record_key, 'test') - str_record_info = await key_value_store_client.get_record(str_record_key) - assert str_record_info is not None - assert 'text/plain' in str_record_info['contentType'] - assert str_record_info['value'] == 'test' - - # Test setting explicit json record but use str as value, i.e. 
json dumps is skipped - explicit_json_key = 'test-json' - await key_value_store_client.set_record(explicit_json_key, '{"test": "explicit string"}', 'application/json') - bytes_record_info = await key_value_store_client.get_record(explicit_json_key) - assert bytes_record_info is not None - assert 'application/json' in bytes_record_info['contentType'] - assert bytes_record_info['value']['test'] == 'explicit string' - - # Test using bytes - bytes_key = 'test-json' - bytes_value = b'testing bytes set_record' - await key_value_store_client.set_record(bytes_key, bytes_value, 'unknown') - bytes_record_info = await key_value_store_client.get_record(bytes_key) - assert bytes_record_info is not None - assert 'unknown' in bytes_record_info['contentType'] - assert bytes_record_info['value'] == bytes_value - assert bytes_record_info['value'].decode('utf-8') == bytes_value.decode('utf-8') - - # Test using file descriptor - with open(os.path.join(tmp_path, 'test.json'), 'w+', encoding='utf-8') as f: # noqa: ASYNC230 - f.write('Test') - with pytest.raises(NotImplementedError, match='File-like values are not supported in local memory storage'): - await key_value_store_client.set_record('file', f) - - -async def test_get_record_as_bytes(key_value_store_client: KeyValueStoreClient) -> None: - record_key = 'test' - record_value = 'testing' - await key_value_store_client.set_record(record_key, record_value) - record_info = await key_value_store_client.get_record_as_bytes(record_key) - assert record_info is not None - assert record_info['value'] == record_value.encode('utf-8') - - -async def test_delete_record(key_value_store_client: KeyValueStoreClient) -> None: - record_key = 'test' - await key_value_store_client.set_record(record_key, 'test') - await key_value_store_client.delete_record(record_key) - # Does not crash when called again - await key_value_store_client.delete_record(record_key) - - -@pytest.mark.parametrize( - 'test_case', - [ - { - 'input': {'key': 'image', 'value': TINY_PNG, 'contentType': None}, - 'expectedOutput': {'filename': 'image', 'key': 'image', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'key': 'image', 'value': TINY_PNG, 'contentType': 'image/png'}, - 'expectedOutput': {'filename': 'image.png', 'key': 'image', 'contentType': 'image/png'}, - }, - { - 'input': {'key': 'image.png', 'value': TINY_PNG, 'contentType': None}, - 'expectedOutput': {'filename': 'image.png', 'key': 'image.png', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'key': 'image.png', 'value': TINY_PNG, 'contentType': 'image/png'}, - 'expectedOutput': {'filename': 'image.png', 'key': 'image.png', 'contentType': 'image/png'}, - }, - { - 'input': {'key': 'data', 'value': TINY_DATA, 'contentType': None}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'data', 'value': TINY_DATA, 'contentType': 'application/json'}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'data.json', 'value': TINY_DATA, 'contentType': None}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data.json', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'data.json', 'value': TINY_DATA, 'contentType': 'application/json'}, - 'expectedOutput': {'filename': 'data.json', 'key': 'data.json', 'contentType': 'application/json'}, - }, - { - 'input': {'key': 'text', 'value': TINY_TEXT, 'contentType': None}, - 'expectedOutput': {'filename': 'text.txt', 
'key': 'text', 'contentType': 'text/plain'}, - }, - { - 'input': {'key': 'text', 'value': TINY_TEXT, 'contentType': 'text/plain'}, - 'expectedOutput': {'filename': 'text.txt', 'key': 'text', 'contentType': 'text/plain'}, - }, - { - 'input': {'key': 'text.txt', 'value': TINY_TEXT, 'contentType': None}, - 'expectedOutput': {'filename': 'text.txt', 'key': 'text.txt', 'contentType': 'text/plain'}, - }, - { - 'input': {'key': 'text.txt', 'value': TINY_TEXT, 'contentType': 'text/plain'}, - 'expectedOutput': {'filename': 'text.txt', 'key': 'text.txt', 'contentType': 'text/plain'}, - }, - ], -) -async def test_writes_correct_metadata(memory_storage_client: MemoryStorageClient, test_case: dict) -> None: - test_input = test_case['input'] - expected_output = test_case['expectedOutput'] - key_value_store_name = crypto_random_object_id() - - # Write the input data to the key-value store - store_details = await memory_storage_client.key_value_stores().get_or_create(name=key_value_store_name) - key_value_store_client = memory_storage_client.key_value_store(store_details['id']) - await key_value_store_client.set_record(test_input['key'], test_input['value'], content_type=test_input['contentType']) - - # Check that everything was written correctly, both the data and metadata - storage_path = os.path.join(memory_storage_client._key_value_stores_directory, key_value_store_name) - item_path = os.path.join(storage_path, expected_output['filename']) - metadata_path = os.path.join(storage_path, expected_output['filename'] + '.__metadata__.json') - - assert os.path.exists(item_path) - assert os.path.exists(metadata_path) - - with open(item_path, 'rb') as item_file: # noqa: ASYNC230 - actual_value = maybe_parse_body(item_file.read(), expected_output['contentType']) - assert actual_value == test_input['value'] - - with open(metadata_path, encoding='utf-8') as metadata_file: # noqa: ASYNC230 - metadata = json.load(metadata_file) - assert metadata['key'] == expected_output['key'] - assert expected_output['contentType'] in metadata['contentType'] - - -@pytest.mark.parametrize( - 'test_case', - [ - { - 'input': {'filename': 'image', 'value': TINY_PNG, 'metadata': None}, - 'expectedOutput': {'key': 'image', 'filename': 'image', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': None}, - 'expectedOutput': {'key': 'image', 'filename': 'image.png', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'image', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'image', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'image', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'image/png'}}, - 'expectedOutput': {'key': 'image', 'filename': 'image', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image.png', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'image.png', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image.png', 'contentType': 'image/png'}}, - 'expectedOutput': {'key': 'image.png', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'image.png', 'value': TINY_PNG, 'metadata': {'key': 'image', 'contentType': 'image/png'}}, - 'expectedOutput': {'key': 'image', 'contentType': 'image/png'}, - }, - { - 'input': {'filename': 'input', 'value': 
TINY_BYTES, 'metadata': None}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'input.json', 'value': TINY_DATA, 'metadata': None}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/json'}, - }, - {'input': {'filename': 'input.txt', 'value': TINY_TEXT, 'metadata': None}, 'expectedOutput': {'key': 'input', 'contentType': 'text/plain'}}, - { - 'input': {'filename': 'input.bin', 'value': TINY_BYTES, 'metadata': None}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'input', 'value': TINY_BYTES, 'metadata': {'key': 'input', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - { - 'input': {'filename': 'input.json', 'value': TINY_DATA, 'metadata': {'key': 'input', 'contentType': 'application/json'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/json'}, - }, - { - 'input': {'filename': 'input.txt', 'value': TINY_TEXT, 'metadata': {'key': 'input', 'contentType': 'text/plain'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'text/plain'}, - }, - { - 'input': {'filename': 'input.bin', 'value': TINY_BYTES, 'metadata': {'key': 'input', 'contentType': 'application/octet-stream'}}, - 'expectedOutput': {'key': 'input', 'contentType': 'application/octet-stream'}, - }, - ], -) -async def test_reads_correct_metadata(memory_storage_client: MemoryStorageClient, test_case: dict) -> None: - test_input = test_case['input'] - expected_output = test_case['expectedOutput'] - key_value_store_name = crypto_random_object_id() - - # Ensure the directory for the store exists - storage_path = os.path.join(memory_storage_client._key_value_stores_directory, key_value_store_name) - os.makedirs(storage_path, exist_ok=True) - - store_metadata = { - 'id': crypto_random_object_id(), - 'name': None, - 'accessedAt': datetime.now(timezone.utc), - 'createdAt': datetime.now(timezone.utc), - 'modifiedAt': datetime.now(timezone.utc), - 'userId': '1', - } - - # Write the store metadata to disk - store_metadata_path = os.path.join(storage_path, '__metadata__.json') - with open(store_metadata_path, mode='wb') as store_metadata_file: # noqa: ASYNC230 - store_metadata_file.write(json_dumps(store_metadata).encode('utf-8')) - - # Write the test input item to the disk - item_path = os.path.join(storage_path, test_input['filename']) - with open(item_path, 'wb') as item_file: # noqa: ASYNC230 - if isinstance(test_input['value'], bytes): - item_file.write(test_input['value']) - elif isinstance(test_input['value'], str): - item_file.write(test_input['value'].encode('utf-8')) - else: - item_file.write(json_dumps(test_input['value']).encode('utf-8')) - - # Optionally write the metadata to disk if there is some - if test_input['metadata'] is not None: - metadata_path = os.path.join(storage_path, test_input['filename'] + '.__metadata__.json') - with open(metadata_path, 'w', encoding='utf-8') as metadata_file: # noqa: ASYNC230 - metadata_file.write( - json_dumps( - { - 'key': test_input['metadata']['key'], - 'contentType': test_input['metadata']['contentType'], - } - ) - ) - - # Create the key-value store client to load the items from disk - store_details = await memory_storage_client.key_value_stores().get_or_create(name=key_value_store_name) - key_value_store_client = memory_storage_client.key_value_store(store_details['id']) - - # Read the item from the store and check if it is as expected - 
actual_record = await key_value_store_client.get_record(expected_output['key']) - assert actual_record is not None - - assert actual_record['key'] == expected_output['key'] - assert actual_record['contentType'] == expected_output['contentType'] - assert actual_record['value'] == test_input['value'] diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py deleted file mode 100644 index f645df01..00000000 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import KeyValueStoreCollectionClient - - -@pytest.fixture() -def key_value_stores_client(memory_storage_client: MemoryStorageClient) -> KeyValueStoreCollectionClient: - return memory_storage_client.key_value_stores() - - -async def test_get_or_create(key_value_stores_client: KeyValueStoreCollectionClient) -> None: - kvs_name = 'test' - # A new kvs gets created - kvs_info = await key_value_stores_client.get_or_create(name=kvs_name) - assert kvs_info['name'] == kvs_name - - # Another get_or_create call returns the same kvs - kvs_info_existing = await key_value_stores_client.get_or_create(name=kvs_name) - assert kvs_info['id'] == kvs_info_existing['id'] - assert kvs_info['name'] == kvs_info_existing['name'] - assert kvs_info['createdAt'] == kvs_info_existing['createdAt'] - - -async def test_list(key_value_stores_client: KeyValueStoreCollectionClient) -> None: - assert (await key_value_stores_client.list()).count == 0 - kvs_info = await key_value_stores_client.get_or_create(name='kvs') - kvs_list = await key_value_stores_client.list() - assert kvs_list.count == 1 - assert kvs_list.items[0]['name'] == kvs_info['name'] - - # Test sorting behavior - newer_kvs_info = await key_value_stores_client.get_or_create(name='newer-kvs') - kvs_list_sorting = await key_value_stores_client.list() - assert kvs_list_sorting.count == 2 - assert kvs_list_sorting.items[0]['name'] == kvs_info['name'] - assert kvs_list_sorting.items[1]['name'] == newer_kvs_info['name'] diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue.py b/tests/unit/memory_storage/resource_clients/test_request_queue.py deleted file mode 100644 index c66bc68f..00000000 --- a/tests/unit/memory_storage/resource_clients/test_request_queue.py +++ /dev/null @@ -1,260 +0,0 @@ -from __future__ import annotations - -import asyncio -import os -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import RequestQueueClient - - -@pytest.fixture() -async def request_queue_client(memory_storage_client: MemoryStorageClient) -> RequestQueueClient: - request_queues_client = memory_storage_client.request_queues() - rq_info = await request_queues_client.get_or_create(name='test') - return memory_storage_client.request_queue(rq_info['id']) - - -async def test_nonexistent(memory_storage_client: MemoryStorageClient) -> None: - request_queue_client = memory_storage_client.request_queue(request_queue_id='nonexistent-id') - assert await request_queue_client.get() is None - with pytest.raises(ValueError, match='Request queue with id "nonexistent-id" does not 
exist.'): - await request_queue_client.update(name='test-update') - await request_queue_client.delete() - - -async def test_get(request_queue_client: RequestQueueClient) -> None: - await asyncio.sleep(0.1) - info = await request_queue_client.get() - assert info is not None - assert info['id'] == request_queue_client._id - assert info['accessedAt'] != info['createdAt'] - - -async def test_update(request_queue_client: RequestQueueClient) -> None: - new_rq_name = 'test-update' - await request_queue_client.add_request( - { - 'uniqueKey': 'https://apify.com', - 'url': 'https://apify.com', - } - ) - old_rq_info = await request_queue_client.get() - assert old_rq_info is not None - old_rq_directory = os.path.join(request_queue_client._memory_storage_client._request_queues_directory, old_rq_info['name']) - new_rq_directory = os.path.join(request_queue_client._memory_storage_client._request_queues_directory, new_rq_name) - assert os.path.exists(os.path.join(old_rq_directory, 'fvwscO2UJLdr10B.json')) is True - assert os.path.exists(os.path.join(new_rq_directory, 'fvwscO2UJLdr10B.json')) is False - - await asyncio.sleep(0.1) - updated_rq_info = await request_queue_client.update(name=new_rq_name) - assert os.path.exists(os.path.join(old_rq_directory, 'fvwscO2UJLdr10B.json')) is False - assert os.path.exists(os.path.join(new_rq_directory, 'fvwscO2UJLdr10B.json')) is True - # Only modifiedAt and accessedAt should be different - assert old_rq_info['createdAt'] == updated_rq_info['createdAt'] - assert old_rq_info['modifiedAt'] != updated_rq_info['modifiedAt'] - assert old_rq_info['accessedAt'] != updated_rq_info['accessedAt'] - - # Should fail with the same name - with pytest.raises(ValueError, match='Request queue with name "test-update" already exists'): - await request_queue_client.update(name=new_rq_name) - - -async def test_delete(request_queue_client: RequestQueueClient) -> None: - await request_queue_client.add_request( - { - 'uniqueKey': 'https://apify.com', - 'url': 'https://apify.com', - } - ) - rq_info = await request_queue_client.get() - assert rq_info is not None - - rq_directory = os.path.join(request_queue_client._memory_storage_client._request_queues_directory, rq_info['name']) - assert os.path.exists(os.path.join(rq_directory, 'fvwscO2UJLdr10B.json')) is True - - await request_queue_client.delete() - assert os.path.exists(os.path.join(rq_directory, 'fvwscO2UJLdr10B.json')) is False - - # Does not crash when called again - await request_queue_client.delete() - - -async def test_list_head(request_queue_client: RequestQueueClient) -> None: - request_1_url = 'https://apify.com' - request_2_url = 'https://example.com' - await request_queue_client.add_request( - { - 'uniqueKey': request_1_url, - 'url': request_1_url, - } - ) - await request_queue_client.add_request( - { - 'uniqueKey': request_2_url, - 'url': request_2_url, - } - ) - list_head = await request_queue_client.list_head() - assert len(list_head['items']) == 2 - for item in list_head['items']: - assert 'id' in item - - -async def test_add_record(request_queue_client: RequestQueueClient) -> None: - request_forefront_url = 'https://apify.com' - request_not_forefront_url = 'https://example.com' - request_forefront_info = await request_queue_client.add_request( - { - 'uniqueKey': request_forefront_url, - 'url': request_forefront_url, - }, - forefront=True, - ) - request_not_forefront_info = await request_queue_client.add_request( - { - 'uniqueKey': request_not_forefront_url, - 'url': request_not_forefront_url, - }, - forefront=False, - ) - 
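# Note: the two add_request() calls above exercise both insertion modes of the request
# queue client: forefront=True places the request at the head of the queue, while
# forefront=False appends it to the tail. The assertions below only check the returned
# metadata; the resulting ordering is what test_forefront further down in this file verifies.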
- assert request_forefront_info.get('requestId') is not None - assert request_not_forefront_info.get('requestId') is not None - assert request_forefront_info['wasAlreadyHandled'] is False - assert request_not_forefront_info['wasAlreadyHandled'] is False - - rq_info = await request_queue_client.get() - assert rq_info is not None - assert rq_info['pendingRequestCount'] == rq_info['totalRequestCount'] == 2 - assert rq_info['handledRequestCount'] == 0 - - -async def test_get_record(request_queue_client: RequestQueueClient) -> None: - request_url = 'https://apify.com' - request_info = await request_queue_client.add_request( - { - 'uniqueKey': request_url, - 'url': request_url, - } - ) - request = await request_queue_client.get_request(request_info['requestId']) - assert request is not None - assert 'id' in request - assert request['url'] == request['uniqueKey'] == request_url - - # Non-existent id - assert (await request_queue_client.get_request('non-existent id')) is None - - -async def test_update_record(request_queue_client: RequestQueueClient) -> None: - request_url = 'https://apify.com' - request_info = await request_queue_client.add_request( - { - 'uniqueKey': request_url, - 'url': request_url, - } - ) - request = await request_queue_client.get_request(request_info['requestId']) - assert request is not None - - rq_info_before_update = await request_queue_client.get() - assert rq_info_before_update is not None - assert rq_info_before_update['pendingRequestCount'] == 1 - assert rq_info_before_update['handledRequestCount'] == 0 - - request_update_info = await request_queue_client.update_request({**request, 'handledAt': datetime.now(timezone.utc)}) - assert request_update_info['wasAlreadyHandled'] is False - - rq_info_after_update = await request_queue_client.get() - assert rq_info_after_update is not None - assert rq_info_after_update['pendingRequestCount'] == 0 - assert rq_info_after_update['handledRequestCount'] == 1 - - -async def test_delete_record(request_queue_client: RequestQueueClient) -> None: - request_url = 'https://apify.com' - pending_request_info = await request_queue_client.add_request( - { - 'uniqueKey': 'pending', - 'url': request_url, - } - ) - handled_request_info = await request_queue_client.add_request( - { - 'uniqueKey': 'handled', - 'url': request_url, - 'handledAt': datetime.now(tz=timezone.utc), - } - ) - - rq_info_before_delete = await request_queue_client.get() - assert rq_info_before_delete is not None - assert rq_info_before_delete['pendingRequestCount'] == 1 - assert rq_info_before_delete['pendingRequestCount'] == 1 - - await request_queue_client.delete_request(pending_request_info['requestId']) - rq_info_after_first_delete = await request_queue_client.get() - assert rq_info_after_first_delete is not None - assert rq_info_after_first_delete['pendingRequestCount'] == 0 - assert rq_info_after_first_delete['handledRequestCount'] == 1 - - await request_queue_client.delete_request(handled_request_info['requestId']) - rq_info_after_second_delete = await request_queue_client.get() - assert rq_info_after_second_delete is not None - assert rq_info_after_second_delete['pendingRequestCount'] == 0 - assert rq_info_after_second_delete['handledRequestCount'] == 0 - - # Does not crash when called again - await request_queue_client.delete_request(pending_request_info['requestId']) - - -async def test_forefront(request_queue_client: RequestQueueClient) -> None: - # this should create a queue with requests in this order: - # Handled: - # 2, 5, 8 - # Not handled: - # 7, 4, 1, 0, 
3, 6 - for i in range(9): - request_url = f'http://example.com/{i}' - forefront = i % 3 == 1 - was_handled = i % 3 == 2 - await request_queue_client.add_request( - { - 'uniqueKey': str(i), - 'url': request_url, - 'handledAt': datetime.now(timezone.utc) if was_handled else None, - }, - forefront=forefront, - ) - - # Check that the queue head (unhandled items) is in the right order - queue_head = await request_queue_client.list_head() - req_unique_keys = [req['uniqueKey'] for req in queue_head['items']] - assert req_unique_keys == ['7', '4', '1', '0', '3', '6'] - - # Mark request #1 as handled - await request_queue_client.update_request( - { - 'uniqueKey': '1', - 'url': 'http://example.com/1', - 'handledAt': datetime.now(timezone.utc), - } - ) - # Move request #3 to forefront - await request_queue_client.update_request( - { - 'uniqueKey': '3', - 'url': 'http://example.com/3', - }, - forefront=True, - ) - - # Check that the queue head (unhandled items) is in the right order after the updates - queue_head = await request_queue_client.list_head() - req_unique_keys = [req['uniqueKey'] for req in queue_head['items']] - assert req_unique_keys == ['3', '7', '4', '0', '6'] diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py deleted file mode 100644 index 3c33a2ac..00000000 --- a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -if TYPE_CHECKING: - from apify._memory_storage import MemoryStorageClient - from apify._memory_storage.resource_clients import RequestQueueCollectionClient - - -@pytest.fixture() -def request_queues_client(memory_storage_client: MemoryStorageClient) -> RequestQueueCollectionClient: - return memory_storage_client.request_queues() - - -async def test_get_or_create(request_queues_client: RequestQueueCollectionClient) -> None: - rq_name = 'test' - # A new request queue gets created - rq_info = await request_queues_client.get_or_create(name=rq_name) - assert rq_info['name'] == rq_name - - # Another get_or_create call returns the same request queue - rq_existing = await request_queues_client.get_or_create(name=rq_name) - assert rq_info['id'] == rq_existing['id'] - assert rq_info['name'] == rq_existing['name'] - assert rq_info['createdAt'] == rq_existing['createdAt'] - - -async def test_list(request_queues_client: RequestQueueCollectionClient) -> None: - assert (await request_queues_client.list()).count == 0 - rq_info = await request_queues_client.get_or_create(name='dataset') - rq_list = await request_queues_client.list() - assert rq_list.count == 1 - assert rq_list.items[0]['name'] == rq_info['name'] - - # Test sorting behavior - newer_rq_info = await request_queues_client.get_or_create(name='newer-dataset') - rq_list_sorting = await request_queues_client.list() - assert rq_list_sorting.count == 2 - assert rq_list_sorting.items[0]['name'] == rq_info['name'] - assert rq_list_sorting.items[1]['name'] == newer_rq_info['name'] diff --git a/tests/unit/memory_storage/test_memory_storage.py b/tests/unit/memory_storage/test_memory_storage.py deleted file mode 100644 index 3d32398e..00000000 --- a/tests/unit/memory_storage/test_memory_storage.py +++ /dev/null @@ -1,154 +0,0 @@ -from __future__ import annotations - -import os -from typing import TYPE_CHECKING - -import pytest -from apify_shared.consts import ApifyEnvVars - -from 
apify._memory_storage import MemoryStorageClient - -if TYPE_CHECKING: - from pathlib import Path - - -async def test_write_metadata(tmp_path: Path) -> None: - dataset_name = 'test' - dataset_no_metadata_name = 'test-no-metadata' - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - ms_no_metadata = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=False) - datasets_client = ms.datasets() - datasets_no_metadata_client = ms_no_metadata.datasets() - await datasets_client.get_or_create(name=dataset_name) - await datasets_no_metadata_client.get_or_create(name=dataset_no_metadata_name) - assert os.path.exists(os.path.join(ms._datasets_directory, dataset_name, '__metadata__.json')) is True - assert os.path.exists(os.path.join(ms_no_metadata._datasets_directory, dataset_no_metadata_name, '__metadata__.json')) is False - - -async def test_persist_storage(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), persist_storage=True) - ms_no_persist = MemoryStorageClient(local_data_directory=str(tmp_path), persist_storage=False) - kvs_client = ms.key_value_stores() - kvs_no_metadata_client = ms_no_persist.key_value_stores() - kvs_info = await kvs_client.get_or_create(name='kvs') - kvs_no_metadata_info = await kvs_no_metadata_client.get_or_create(name='kvs-no-persist') - await ms.key_value_store(kvs_info['id']).set_record('test', {'x': 1}, 'application/json') - await ms_no_persist.key_value_store(kvs_no_metadata_info['id']).set_record('test', {'x': 1}, 'application/json') - assert os.path.exists(os.path.join(ms._key_value_stores_directory, kvs_info['name'], 'test.json')) is True - assert os.path.exists(os.path.join(ms_no_persist._key_value_stores_directory, kvs_no_metadata_info['name'], 'test.json')) is False - - -def test_config_via_env_vars_persist_storage(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - # Env var changes persist_storage to False - monkeypatch.setenv('APIFY_PERSIST_STORAGE', 'false') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._persist_storage is False - monkeypatch.setenv('APIFY_PERSIST_STORAGE', '0') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._persist_storage is False - monkeypatch.setenv('APIFY_PERSIST_STORAGE', '') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._persist_storage is False - # Test if constructor arg takes precedence over env var value - ms = MemoryStorageClient(local_data_directory=str(tmp_path), persist_storage=True) - assert ms._persist_storage is True - - -def test_config_via_env_vars_write_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: - # Env var changes write_metadata to True - monkeypatch.setenv('DEBUG', '*') - ms = MemoryStorageClient(local_data_directory=str(tmp_path)) - assert ms._write_metadata is True - # Test if constructor arg takes precedence over env var value - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=False) - assert ms._write_metadata is False - - -async def test_purge_datasets(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - # Create default and non-default datasets - datasets_client = ms.datasets() - default_dataset_info = await datasets_client.get_or_create(name='default') - non_default_dataset_info = await datasets_client.get_or_create(name='non-default') - - # Check all folders inside datasets directory before and after purge - 
folders_before_purge = os.listdir(ms._datasets_directory) - assert default_dataset_info['name'] in folders_before_purge - assert non_default_dataset_info['name'] in folders_before_purge - - await ms._purge() - folders_after_purge = os.listdir(ms._datasets_directory) - assert default_dataset_info['name'] not in folders_after_purge - assert non_default_dataset_info['name'] in folders_after_purge - - -async def test_purge_key_value_stores(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - - # Create default and non-default key-value stores - kvs_client = ms.key_value_stores() - default_kvs_info = await kvs_client.get_or_create(name='default') - non_default_kvs_info = await kvs_client.get_or_create(name='non-default') - default_kvs_client = ms.key_value_store(default_kvs_info['id']) - # INPUT.json should be kept - await default_kvs_client.set_record('INPUT', {'abc': 123}, 'application/json') - # test.json should not be kept - await default_kvs_client.set_record('test', {'abc': 123}, 'application/json') - - # Check all folders and files inside kvs directory before and after purge - folders_before_purge = os.listdir(ms._key_value_stores_directory) - assert default_kvs_info['name'] in folders_before_purge - assert non_default_kvs_info['name'] in folders_before_purge - default_folder_files_before_purge = os.listdir(os.path.join(ms._key_value_stores_directory, 'default')) - assert 'INPUT.json' in default_folder_files_before_purge - assert 'test.json' in default_folder_files_before_purge - - await ms._purge() - folders_after_purge = os.listdir(ms._key_value_stores_directory) - assert default_kvs_info['name'] in folders_after_purge - assert non_default_kvs_info['name'] in folders_after_purge - default_folder_files_after_purge = os.listdir(os.path.join(ms._key_value_stores_directory, 'default')) - assert 'INPUT.json' in default_folder_files_after_purge - assert 'test.json' not in default_folder_files_after_purge - - -async def test_purge_request_queues(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - # Create default and non-default request queues - rq_client = ms.request_queues() - default_rq_info = await rq_client.get_or_create(name='default') - non_default_rq_info = await rq_client.get_or_create(name='non-default') - - # Check all folders inside rq directory before and after purge - folders_before_purge = os.listdir(ms._request_queues_directory) - assert default_rq_info['name'] in folders_before_purge - assert non_default_rq_info['name'] in folders_before_purge - await ms._purge() - folders_after_purge = os.listdir(ms._request_queues_directory) - assert default_rq_info['name'] not in folders_after_purge - assert non_default_rq_info['name'] in folders_after_purge - - -async def test_not_implemented_method(tmp_path: Path) -> None: - ms = MemoryStorageClient(local_data_directory=str(tmp_path), write_metadata=True) - ddt = ms.dataset('test') - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await ddt.stream_items(item_format='json') - - with pytest.raises(NotImplementedError, match='This method is not supported in local memory storage.'): - await ddt.stream_items(item_format='json') - - -async def test_storage_path_configuration(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv(ApifyEnvVars.LOCAL_STORAGE_DIR) - default_ms = MemoryStorageClient() - assert default_ms._local_data_directory == './storage' - - # We 
expect the env var to override the default value - monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, './env_var_storage_dir') - env_var_ms = MemoryStorageClient() - assert env_var_ms._local_data_directory == './env_var_storage_dir' - - # We expect the parametrized value to override the env var - parametrized_ms = MemoryStorageClient(local_data_directory='./parametrized_storage_dir') - assert parametrized_ms._local_data_directory == './parametrized_storage_dir' diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py index ac483e76..0116f5ec 100644 --- a/tests/unit/scrapy/requests/test_to_apify_request.py +++ b/tests/unit/scrapy/requests/test_to_apify_request.py @@ -22,9 +22,9 @@ def test__to_apify_request__simple(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request.get('url') == 'https://example.com' + assert apify_request.url == 'https://example.com' - user_data = apify_request.get('userData', {}) + user_data = apify_request.user_data assert isinstance(user_data, dict) assert 'scrapy_request' in user_data assert isinstance(user_data.get('scrapy_request'), str) @@ -37,7 +37,7 @@ def test__to_apify_request__headers(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request['headers'] == dict(scrapy_request_headers.to_unicode_dict()) + assert apify_request.headers == dict(scrapy_request_headers.to_unicode_dict()) def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None: @@ -50,10 +50,10 @@ def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request.get('url') == 'https://example.com' - assert apify_request.get('method') == 'GET' + assert apify_request.url == 'https://example.com' + assert apify_request.method == 'GET' - user_data = apify_request.get('userData', {}) + user_data = apify_request.user_data assert isinstance(user_data, dict) assert user_data['some_user_data'] == 'test' @@ -75,12 +75,12 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None: apify_request = to_apify_request(scrapy_request, spider) assert apify_request is not None - assert apify_request.get('url') == 'https://example.com' - assert apify_request.get('method') == 'GET' - assert apify_request.get('id') == 'abc123' - assert apify_request.get('uniqueKey') == 'https://example.com' + assert apify_request.url == 'https://example.com' + assert apify_request.method == 'GET' + assert apify_request.id == 'abc123' + assert apify_request.unique_key == 'https://example.com' - user_data = apify_request.get('userData', {}) + user_data = apify_request.user_data assert isinstance(user_data, dict) assert user_data['some_user_data'] == 'hello' diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index ebd294e4..8c9ebe4f 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -6,6 +6,8 @@ from scrapy import Request, Spider from scrapy.http.headers import Headers +from crawlee import Request as CrawleeRequest + from apify.scrapy.requests import to_scrapy_request @@ -21,134 +23,96 @@ def spider() -> DummySpider: def test__to_scrapy_request__without_reconstruction(spider: Spider) -> None: # Without 
reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'uniqueKey': 'https://example.com', - 'id': 'fvwscO2UJLdr10B', - } + apify_request = CrawleeRequest(url='https://example.com', method='GET', unique_key='https://example.com', id='fvwscO2UJLdr10B', user_data={}) scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://crawlee.dev', - 'method': 'GET', - 'uniqueKey': 'https://crawlee.dev', - 'id': 'fvwscO2UJLdr10B', - 'headers': {'Authorization': 'Bearer access_token'}, - 'userData': {'some_user_data': 'test'}, - } + apify_request = CrawleeRequest( + url='https://crawlee.dev', + method='GET', + unique_key='https://crawlee.dev', + id='fvwscO2UJLdr10B', + headers={'Authorization': 'Bearer access_token'}, + user_data={'some_user_data': 'test'}, + ) scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') - assert Headers(apify_request['headers']) == scrapy_request.headers - assert apify_request['userData'] == scrapy_request.meta.get('userData') + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') + assert Headers(apify_request.headers) == scrapy_request.headers + assert apify_request.user_data == scrapy_request.meta.get('userData') def test__to_scrapy_request__with_reconstruction(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://apify.com', - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - 'uniqueKey': 'https://apify.com', - 'userData': { + apify_request = CrawleeRequest( + url='https://apify.com', + method='GET', + id='fvwscO2UJLdr10B', + unique_key='https://apify.com', + user_data={ 'scrapy_request': 
'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501 }, - } + ) scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') - assert apify_request['userData'] == scrapy_request.meta.get('userData') + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') + assert apify_request.user_data == scrapy_request.meta.get('userData') def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Spider) -> None: # With reconstruction of encoded Scrapy request - apify_request = { - 'url': 'https://apify.com', - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - 'uniqueKey': 'https://apify.com', - 'headers': {'Authorization': 'Bearer access_token'}, - 'userData': { + apify_request = CrawleeRequest( + url='https://apify.com', + method='GET', + id='fvwscO2UJLdr10B', + unique_key='https://apify.com', + headers={'Authorization': 'Bearer access_token'}, + user_data={ 'some_user_data': 'hello', 'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501 }, - } + ) scrapy_request = to_scrapy_request(apify_request, spider) assert isinstance(scrapy_request, Request) - assert apify_request['url'] == scrapy_request.url - assert apify_request['method'] == scrapy_request.method - assert apify_request['id'] == scrapy_request.meta.get('apify_request_id') - assert apify_request['uniqueKey'] == scrapy_request.meta.get('apify_request_unique_key') - assert Headers(apify_request['headers']) == scrapy_request.headers - assert apify_request['userData'] == scrapy_request.meta.get('userData') - - -def 
test__to_scrapy_request__invalid_missing_url(spider: Spider) -> None: - apify_request = { - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - 'uniqueKey': 'https://example.com', - } - - with pytest.raises(ValueError): - to_scrapy_request(apify_request, spider) - - -def test__to_scrapy_request__invalid_missing_id(spider: Spider) -> None: - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'uniqueKey': 'https://example.com', - } - - with pytest.raises(ValueError): - to_scrapy_request(apify_request, spider) - - -def test__to_scrapy_request__invalid_missing_unique_key(spider: Spider) -> None: - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'id': 'fvwscO2UJLdr10B', - } - - with pytest.raises(ValueError): - to_scrapy_request(apify_request, spider) + assert apify_request.url == scrapy_request.url + assert apify_request.method == scrapy_request.method + assert apify_request.id == scrapy_request.meta.get('apify_request_id') + assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key') + assert Headers(apify_request.headers) == scrapy_request.headers + assert apify_request.user_data == scrapy_request.meta.get('userData') def test__to_scrapy_request__invalid_request_for_reconstruction(spider: Spider) -> None: - apify_request = { - 'url': 'https://example.com', - 'method': 'GET', - 'id': 'invalid123', - 'uniqueKey': 'https://example.com', - 'userData': { + apify_request = CrawleeRequest( + url='https://example.com', + method='GET', + id='invalid123', + unique_key='https://example.com', + user_data={ 'scrapy_request': 'this is not a correctly encoded Scrapy request', }, - } + ) with pytest.raises(binascii.Error): to_scrapy_request(apify_request, spider) diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py deleted file mode 100644 index ca3b1ca3..00000000 --- a/tests/unit/storages/test_dataset.py +++ /dev/null @@ -1,107 +0,0 @@ -from __future__ import annotations - -import pytest - -from apify.storages import Dataset, KeyValueStore - - -@pytest.fixture() -async def dataset() -> Dataset: - return await Dataset.open() - - -async def test_open() -> None: - default_dataset = await Dataset.open() - default_dataset_by_id = await Dataset.open(id=default_dataset._id) - - assert default_dataset is default_dataset_by_id - - dataset_name = 'dummy-name' - named_dataset = await Dataset.open(name=dataset_name) - assert default_dataset is not named_dataset - - with pytest.raises(RuntimeError, match='Dataset with id "nonexistent-id" does not exist!'): - await Dataset.open(id='nonexistent-id') - - # Test that when you try to open a dataset by ID and you use a name of an existing dataset, - # it doesn't work - with pytest.raises(RuntimeError, match='Dataset with id "dummy-name" does not exist!'): - await Dataset.open(id='dummy-name') - - -async def test_same_references() -> None: - dataset1 = await Dataset.open() - dataset2 = await Dataset.open() - assert dataset1 is dataset2 - - dataset_name = 'non-default' - dataset_named1 = await Dataset.open(name=dataset_name) - dataset_named2 = await Dataset.open(name=dataset_name) - assert dataset_named1 is dataset_named2 - - -async def test_drop() -> None: - dataset1 = await Dataset.open() - await dataset1.drop() - dataset2 = await Dataset.open() - assert dataset1 is not dataset2 - - -async def test_export(dataset: Dataset) -> None: - expected_csv = 'id,test\r\n0,test\r\n1,test\r\n2,test\r\n' - expected_json = [{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}, {'id': 2, 
'test': 'test'}] - desired_item_count = 3 - await dataset.push_data([{'id': i, 'test': 'test'} for i in range(desired_item_count)]) - await dataset.export_to_csv('dataset-csv') - await dataset.export_to_json('dataset-json') - dataset_csv = await KeyValueStore.get_value('dataset-csv') - dataset_json = await KeyValueStore.get_value('dataset-json') - assert dataset_csv == expected_csv - assert dataset_json == expected_json - - -async def test_push_data(dataset: Dataset) -> None: - desired_item_count = 2000 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info['itemCount'] == desired_item_count - list_page = await dataset.get_data(limit=desired_item_count) - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 - - -async def test_push_data_empty(dataset: Dataset) -> None: - await dataset.push_data([]) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info['itemCount'] == 0 - - -async def test_push_data_singular(dataset: Dataset) -> None: - await dataset.push_data({'id': 1}) - dataset_info = await dataset.get_info() - assert dataset_info is not None - assert dataset_info['itemCount'] == 1 - list_page = await dataset.get_data() - assert list_page.items[0]['id'] == 1 - - -async def test_get_data(dataset: Dataset) -> None: # We don't test everything, that's done in memory storage tests - desired_item_count = 3 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - list_page = await dataset.get_data() - assert list_page.count == desired_item_count - assert list_page.desc is False - assert list_page.offset == 0 - assert list_page.items[0]['id'] == 0 - assert list_page.items[-1]['id'] == desired_item_count - 1 - - -async def test_iterate_items(dataset: Dataset) -> None: - desired_item_count = 3 - idx = 0 - await dataset.push_data([{'id': i} for i in range(desired_item_count)]) - async for item in dataset.iterate_items(): - assert item['id'] == idx - idx += 1 diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py deleted file mode 100644 index 042fd873..00000000 --- a/tests/unit/storages/test_key_value_store.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -import pytest - -from apify.storages import KeyValueStore - - -@pytest.fixture() -async def key_value_store() -> KeyValueStore: - return await KeyValueStore.open() - - -async def test_open() -> None: - default_key_value_store = await KeyValueStore.open() - default_key_value_store_by_id = await KeyValueStore.open(id=default_key_value_store._id) - - assert default_key_value_store is default_key_value_store_by_id - - key_value_store_name = 'dummy-name' - named_key_value_store = await KeyValueStore.open(name=key_value_store_name) - assert default_key_value_store is not named_key_value_store - - with pytest.raises(RuntimeError, match='Key-value store with id "nonexistent-id" does not exist!'): - await KeyValueStore.open(id='nonexistent-id') - - # Test that when you try to open a key-value store by ID and you use a name of an existing key-value store, - # it doesn't work - with pytest.raises(RuntimeError, match='Key-value store with id "dummy-name" does not exist!'): - await KeyValueStore.open(id='dummy-name') - - -async def test_same_references() -> None: - kvs1 = await KeyValueStore.open() - kvs2 = await KeyValueStore.open() - assert kvs1 is kvs2 - - kvs_name 
= 'non-default' - kvs_named1 = await KeyValueStore.open(name=kvs_name) - kvs_named2 = await KeyValueStore.open(name=kvs_name) - assert kvs_named1 is kvs_named2 - - -async def test_drop() -> None: - kvs1 = await KeyValueStore.open() - await kvs1.drop() - kvs2 = await KeyValueStore.open() - assert kvs1 is not kvs2 - - -async def test_get_set_value(key_value_store: KeyValueStore) -> None: - await key_value_store.set_value('test-str', 'string') - await key_value_store.set_value('test-int', 123) - await key_value_store.set_value('test-dict', {'abc': '123'}) - str_value = await key_value_store.get_value('test-str') - int_value = await key_value_store.get_value('test-int') - dict_value = await key_value_store.get_value('test-dict') - non_existent_value = await key_value_store.get_value('test-non-existent') - assert str_value == 'string' - assert int_value == 123 - assert dict_value['abc'] == '123' - assert non_existent_value is None - - -async def test_for_each_key(key_value_store: KeyValueStore) -> None: - keys = [i async for i in key_value_store.iterate_keys()] - assert len(keys) == 0 - - for i in range(2001): - await key_value_store.set_value(str(i).zfill(4), i) - index = 0 - async for key, _ in key_value_store.iterate_keys(): - assert key == str(index).zfill(4) - index += 1 - assert index == 2001 - - -async def test_get_public_url() -> None: - store = await KeyValueStore.open() - with pytest.raises(RuntimeError, match='Cannot generate a public URL for this key-value store as it is not on the Apify Platform!'): - await store.get_public_url('dummy') - - -async def test_static_get_set_value() -> None: - await KeyValueStore.set_value('test-static', 'static') - value = await KeyValueStore.get_value('test-static') - assert value == 'static' diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py deleted file mode 100644 index 2922e5b8..00000000 --- a/tests/unit/storages/test_request_queue.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -import asyncio -from datetime import datetime, timezone - -import pytest - -from apify.storages import RequestQueue - - -@pytest.fixture() -async def request_queue() -> RequestQueue: - return await RequestQueue.open() - - -async def test_open() -> None: - default_request_queue = await RequestQueue.open() - default_request_queue_by_id = await RequestQueue.open(id=default_request_queue._id) - - assert default_request_queue is default_request_queue_by_id - - request_queue_name = 'dummy-name' - named_request_queue = await RequestQueue.open(name=request_queue_name) - assert default_request_queue is not named_request_queue - - with pytest.raises(RuntimeError, match='Request queue with id "nonexistent-id" does not exist!'): - await RequestQueue.open(id='nonexistent-id') - - # Test that when you try to open a request queue by ID and you use a name of an existing request queue, - # it doesn't work - with pytest.raises(RuntimeError, match='Request queue with id "dummy-name" does not exist!'): - await RequestQueue.open(id='dummy-name') - - -async def test_same_references() -> None: - rq1 = await RequestQueue.open() - rq2 = await RequestQueue.open() - assert rq1 is rq2 - - rq_name = 'non-default' - rq_named1 = await RequestQueue.open(name=rq_name) - rq_named2 = await RequestQueue.open(name=rq_name) - assert rq_named1 is rq_named2 - - -async def test_drop() -> None: - rq1 = await RequestQueue.open() - await rq1.drop() - rq2 = await RequestQueue.open() - assert rq1 is not rq2 - - -async def 
test_get_request(request_queue: RequestQueue) -> None: - url = 'https://example.com' - add_request_info = await request_queue.add_request( - { - 'uniqueKey': url, - 'url': url, - } - ) - request = await request_queue.get_request(add_request_info['requestId']) - assert request is not None - assert request['url'] == url - - -async def test_add_fetch_handle_request(request_queue: RequestQueue) -> None: - url = 'https://example.com' - assert await request_queue.is_empty() is True - with pytest.raises(ValueError, match='"url" is required'): - await request_queue.add_request({}) - add_request_info = await request_queue.add_request( - { - 'uniqueKey': url, - 'url': url, - } - ) - assert add_request_info['wasAlreadyPresent'] is False - assert add_request_info['wasAlreadyHandled'] is False - assert await request_queue.is_empty() is False - - # Fetch the request - next_request = await request_queue.fetch_next_request() - assert next_request is not None - - # Mark it as handled - next_request['handledAt'] = datetime.now(timezone.utc) - queue_operation_info = await request_queue.mark_request_as_handled(next_request) - assert queue_operation_info is not None - assert queue_operation_info['uniqueKey'] == url - assert await request_queue.is_finished() is True - - -async def test_reclaim_request(request_queue: RequestQueue) -> None: - url = 'https://example.com' - await request_queue.add_request( - { - 'uniqueKey': url, - 'url': url, - } - ) - # Fetch the request - next_request = await request_queue.fetch_next_request() - assert next_request is not None - assert next_request['uniqueKey'] == url - - # Reclaim - await request_queue.reclaim_request(next_request) - # Try to fetch again after a few secs - await asyncio.sleep(4) # 3 seconds is the consistency delay in request queue - next_again = await request_queue.fetch_next_request() - assert next_again is not None - assert next_again['uniqueKey'] == url diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py deleted file mode 100644 index b7770e38..00000000 --- a/tests/unit/test_config.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -from apify_shared.consts import ActorEnvVars, ApifyEnvVars - -from apify.config import Configuration - -if TYPE_CHECKING: - import pytest - - -class TestConfiguration: - # Test that some config properties have some reasonable defaults - def test_configuration_defaults(self: TestConfiguration) -> None: - config = Configuration() - assert config.token is None - assert config.proxy_password is None - assert config.api_base_url == 'https://api.apify.com' - assert config.proxy_hostname == 'proxy.apify.com' - assert config.default_dataset_id == 'default' - assert config.default_key_value_store_id == 'default' - assert config.default_request_queue_id == 'default' - assert config.is_at_home is False - assert config.proxy_port == 8000 - assert config.memory_mbytes is None - assert config.started_at is None - - # Test that defining properties via env vars works - def test_configuration_from_env_vars(self: TestConfiguration, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv(ApifyEnvVars.TOKEN, 'DUMMY_TOKEN') - monkeypatch.setenv(ApifyEnvVars.PROXY_PASSWORD, 'DUMMY_PROXY_PASSWORD') - monkeypatch.setenv(ApifyEnvVars.API_BASE_URL, 'DUMMY_API_BASE_URL') - monkeypatch.setenv(ApifyEnvVars.PROXY_HOSTNAME, 'DUMMY_PROXY_HOSTNAME') - monkeypatch.setenv(ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID, 
'DUMMY_DEFAULT_KEY_VALUE_STORE_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID, 'DUMMY_DEFAULT_REQUEST_QUEUE_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, 'DUMMY_DEFAULT_DATASET_ID') - monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, '1') - monkeypatch.setenv(ApifyEnvVars.PROXY_PORT, '1234') - monkeypatch.setenv(ActorEnvVars.MEMORY_MBYTES, '1024') - monkeypatch.setenv(ActorEnvVars.STARTED_AT, '2023-01-01T12:34:56.789Z') - - config = Configuration() - assert config.token == 'DUMMY_TOKEN' - assert config.proxy_password == 'DUMMY_PROXY_PASSWORD' - assert config.api_base_url == 'DUMMY_API_BASE_URL' - assert config.proxy_hostname == 'DUMMY_PROXY_HOSTNAME' - assert config.default_dataset_id == 'DUMMY_DEFAULT_DATASET_ID' - assert config.default_key_value_store_id == 'DUMMY_DEFAULT_KEY_VALUE_STORE_ID' - assert config.default_request_queue_id == 'DUMMY_DEFAULT_REQUEST_QUEUE_ID' - assert config.is_at_home is True - assert config.proxy_port == 1234 - assert config.memory_mbytes == 1024 - assert config.started_at == datetime(2023, 1, 1, 12, 34, 56, 789000, tzinfo=timezone.utc) - - # Test that constructor arguments take precedence over env vars - def test_configuration_from_constructor_arguments(self: TestConfiguration, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv(ApifyEnvVars.TOKEN, 'DUMMY_TOKEN') - monkeypatch.setenv(ApifyEnvVars.PROXY_PASSWORD, 'DUMMY_PROXY_PASSWORD') - monkeypatch.setenv(ApifyEnvVars.API_BASE_URL, 'DUMMY_API_BASE_URL') - monkeypatch.setenv(ApifyEnvVars.PROXY_HOSTNAME, 'DUMMY_PROXY_HOSTNAME') - monkeypatch.setenv(ActorEnvVars.DEFAULT_DATASET_ID, 'DUMMY_DEFAULT_DATASET_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_KEY_VALUE_STORE_ID, 'DUMMY_DEFAULT_KEY_VALUE_STORE_ID') - monkeypatch.setenv(ActorEnvVars.DEFAULT_REQUEST_QUEUE_ID, 'DUMMY_DEFAULT_REQUEST_QUEUE_ID') - monkeypatch.setenv(ApifyEnvVars.PROXY_PORT, '1234') - - config = Configuration( - token='TOKEN_FROM_CONSTRUCTOR', - proxy_password='PROXY_PASSWORD_FROM_CONSTRUCTOR', - proxy_hostname='PROXY_HOSTNAME_FROM_CONSTRUCTOR', - api_base_url='API_BASE_URL_FROM_CONSTRUCTOR', - default_dataset_id='DEFAULT_DATASET_ID_FROM_CONSTRUCTOR', - default_key_value_store_id='DEFAULT_KEY_VALUE_STORE_ID_FROM_CONSTRUCTOR', - default_request_queue_id='DEFAULT_REQUEST_QUEUE_ID_FROM_CONSTRUCTOR', - proxy_port=5678, - ) - - assert config.token == 'TOKEN_FROM_CONSTRUCTOR' - assert config.proxy_password == 'PROXY_PASSWORD_FROM_CONSTRUCTOR' - assert config.api_base_url == 'API_BASE_URL_FROM_CONSTRUCTOR' - assert config.proxy_hostname == 'PROXY_HOSTNAME_FROM_CONSTRUCTOR' - assert config.default_dataset_id == 'DEFAULT_DATASET_ID_FROM_CONSTRUCTOR' - assert config.default_key_value_store_id == 'DEFAULT_KEY_VALUE_STORE_ID_FROM_CONSTRUCTOR' - assert config.default_request_queue_id == 'DEFAULT_REQUEST_QUEUE_ID_FROM_CONSTRUCTOR' - assert config.proxy_port == 5678 diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 539bd472..80977e97 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -3,275 +3,116 @@ import asyncio import json import logging -import time from collections import defaultdict -from pprint import pprint from typing import Any, Callable +from unittest.mock import Mock import pytest import websockets import websockets.server -from apify_shared.consts import ActorEnvVars, ActorEventTypes -from apify.config import Configuration -from apify.event_manager import EventManager +from apify_shared.consts import ActorEnvVars +from 
crawlee.events._types import Event + +from apify import Configuration +from apify._platform_event_manager import EventManager, PlatformEventManager, SystemInfoEventData class TestEventManagerLocal: - async def test_lifecycle_local(self: TestEventManagerLocal, caplog: pytest.LogCaptureFixture) -> None: + async def test_lifecycle_local(self, caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.DEBUG, logger='apify') - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - assert event_manager._initialized is True + async with PlatformEventManager(Configuration.get_global_configuration()): + pass assert len(caplog.records) == 1 assert caplog.records[0].levelno == logging.DEBUG assert caplog.records[0].message == 'APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.' - with pytest.raises(RuntimeError, match='EventManager was already initialized!'): - await event_manager.init() - - await event_manager.close() - - with pytest.raises(RuntimeError, match='EventManager was not initialized!'): - await event_manager.close() - - assert event_manager._initialized is False - - async def test_event_handling_local(self: TestEventManagerLocal) -> None: - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - - event_calls = defaultdict(list) - - def on_event(event: ActorEventTypes, id: int | None = None) -> Callable: # noqa: A002 - def event_handler(data: Any) -> None: - nonlocal event_calls - event_calls[event].append((id, data)) - - return event_handler - - handler_system_info = on_event(ActorEventTypes.SYSTEM_INFO) - - # Basic test with just one handler on event - # Test adding the handler - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_system_info) - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.SYSTEM_INFO] == [(None, 'DUMMY_SYSTEM_INFO')] - event_calls[ActorEventTypes.SYSTEM_INFO].clear() - - # Test removing the handler - event_manager.off(ActorEventTypes.SYSTEM_INFO, handler_system_info) - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO_2') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.SYSTEM_INFO] == [] - - # Complicated test with multiple handlers - # Add three handlers - handler_persist_state_1 = on_event(ActorEventTypes.PERSIST_STATE, 1) - handler_persist_state_2 = on_event(ActorEventTypes.PERSIST_STATE, 2) - handler_persist_state_3 = on_event(ActorEventTypes.PERSIST_STATE, 3) - event_manager.on(ActorEventTypes.PERSIST_STATE, handler_persist_state_1) - event_manager.on(ActorEventTypes.PERSIST_STATE, handler_persist_state_2) - event_manager.on(ActorEventTypes.PERSIST_STATE, handler_persist_state_3) - - # Test that they all work, and that they're called in order - event_manager.emit(ActorEventTypes.PERSIST_STATE, 'DUMMY_PERSIST_STATE') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.PERSIST_STATE] == [ - (1, 'DUMMY_PERSIST_STATE'), - (2, 'DUMMY_PERSIST_STATE'), - (3, 'DUMMY_PERSIST_STATE'), - ] - event_calls[ActorEventTypes.PERSIST_STATE].clear() - - # Test that if you remove one, the others stay - event_manager.off(ActorEventTypes.PERSIST_STATE, handler_persist_state_3) - event_manager.emit(ActorEventTypes.PERSIST_STATE, 'DUMMY_PERSIST_STATE') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.PERSIST_STATE] == [ - (1, 'DUMMY_PERSIST_STATE'), - (2, 'DUMMY_PERSIST_STATE'), - ] - 
event_calls[ActorEventTypes.PERSIST_STATE].clear() - - # Test that removing all in bulk works - event_manager.off(ActorEventTypes.PERSIST_STATE) - event_manager.emit(ActorEventTypes.PERSIST_STATE, 'DUMMY_PERSIST_STATE') - await asyncio.sleep(0.1) - assert event_calls[ActorEventTypes.PERSIST_STATE] == [] - - await event_manager.close() - - async def test_event_handler_argument_counts_local(self: TestEventManagerLocal) -> None: - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - - event_calls = [] - - def sync_no_arguments() -> None: - nonlocal event_calls - event_calls.append(('sync_no_arguments', None)) - - async def async_no_arguments() -> None: - nonlocal event_calls - event_calls.append(('async_no_arguments', None)) - - def sync_one_argument(event_data: Any) -> None: - nonlocal event_calls - event_calls.append(('sync_one_argument', event_data)) - - async def async_one_argument(event_data: Any) -> None: - nonlocal event_calls - event_calls.append(('async_one_argument', event_data)) - - def sync_two_arguments(_arg1: Any, _arg2: Any) -> None: - pass - - async def async_two_arguments(_arg1: Any, _arg2: Any) -> None: - pass - - def sync_two_arguments_one_default(event_data: Any, _arg2: Any = 'default_value') -> None: - nonlocal event_calls - event_calls.append(('sync_two_arguments_one_default', event_data)) - - async def async_two_arguments_one_default(event_data: Any, _arg2: Any = 'default_value') -> None: - nonlocal event_calls - event_calls.append(('async_two_arguments_one_default', event_data)) - - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_no_arguments) - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_no_arguments) - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_one_argument) - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_one_argument) - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_two_arguments_one_default) - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_two_arguments_one_default) - - # built-in functions should work too - event_manager.on(ActorEventTypes.SYSTEM_INFO, print) - - # functions from the standard library should work too - event_manager.on(ActorEventTypes.SYSTEM_INFO, pprint) - - with pytest.raises(ValueError, match='The "listener" argument must be a callable which accepts 0 or 1 arguments!'): - event_manager.on(ActorEventTypes.SYSTEM_INFO, sync_two_arguments) # type: ignore[arg-type] - with pytest.raises(ValueError, match='The "listener" argument must be a callable which accepts 0 or 1 arguments!'): - event_manager.on(ActorEventTypes.SYSTEM_INFO, async_two_arguments) # type: ignore[arg-type] - - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) + async def test_event_handling_local(self) -> None: + async with EventManager() as event_manager: + event_calls = defaultdict(list) - assert len(event_calls) == 6 - assert ('sync_no_arguments', None) in event_calls - assert ('async_no_arguments', None) in event_calls - assert ('sync_one_argument', 'DUMMY_SYSTEM_INFO') in event_calls - assert ('async_one_argument', 'DUMMY_SYSTEM_INFO') in event_calls - assert ('sync_two_arguments_one_default', 'DUMMY_SYSTEM_INFO') in event_calls - assert ('async_two_arguments_one_default', 'DUMMY_SYSTEM_INFO') in event_calls + def on_event(event: Event, id: int | None = None) -> Callable: + def event_handler(data: Any) -> None: + nonlocal event_calls + event_calls[event].append((id, data)) - async def test_event_async_handling_local(self: TestEventManagerLocal) -> None: - 
config = Configuration() - event_manager = EventManager(config) + return event_handler - await event_manager.init() + handler_system_info = on_event(Event.SYSTEM_INFO) + dummy_system_info = Mock() + dummy_system_info_2 = Mock() - event_calls = [] + # Basic test with just one handler on event + # Test adding the handler + event_manager.on(event=Event.SYSTEM_INFO, listener=handler_system_info) + event_manager.emit(event=Event.SYSTEM_INFO, event_data=dummy_system_info) + await asyncio.sleep(0.1) + assert event_calls[Event.SYSTEM_INFO] == [(None, dummy_system_info)] + event_calls[Event.SYSTEM_INFO].clear() - async def event_handler(data: Any) -> None: - nonlocal event_calls - await asyncio.sleep(2) - event_calls.append(data) + # Test removing the handler + event_manager.off(event=Event.SYSTEM_INFO, listener=handler_system_info) + event_manager.emit(event=Event.SYSTEM_INFO, event_data=dummy_system_info_2) + await asyncio.sleep(0.1) + assert event_calls[Event.SYSTEM_INFO] == [] - # Test that async event handlers work, and that they don't block the main thread - event_manager.on(ActorEventTypes.SYSTEM_INFO, event_handler) - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(1) - assert event_calls == [] - await asyncio.sleep(2) - assert event_calls == ['DUMMY_SYSTEM_INFO'] + # Complicated test with multiple handlers + # Add three handlers + handler_persist_state_1 = on_event(Event.PERSIST_STATE, 1) + handler_persist_state_2 = on_event(Event.PERSIST_STATE, 2) + handler_persist_state_3 = on_event(Event.PERSIST_STATE, 3) + event_manager.on(event=Event.PERSIST_STATE, listener=handler_persist_state_1) + event_manager.on(event=Event.PERSIST_STATE, listener=handler_persist_state_2) + event_manager.on(event=Event.PERSIST_STATE, listener=handler_persist_state_3) - await event_manager.close() + dummy_persist_state = Mock() - async def test_wait_for_all_listeners_to_complete( - self: TestEventManagerLocal, - caplog: pytest.LogCaptureFixture, - ) -> None: - config = Configuration() - event_manager = EventManager(config) + # Test that they all work, and that they're called in order + event_manager.emit(event=Event.PERSIST_STATE, event_data=dummy_persist_state) + await asyncio.sleep(0.1) + assert event_calls[Event.PERSIST_STATE] == [ + (1, dummy_persist_state), + (2, dummy_persist_state), + (3, dummy_persist_state), + ] + event_calls[Event.PERSIST_STATE].clear() + + # Test that if you remove one, the others stay + event_manager.off(event=Event.PERSIST_STATE, listener=handler_persist_state_3) + event_manager.emit(event=Event.PERSIST_STATE, event_data=dummy_persist_state) + await asyncio.sleep(0.1) + assert event_calls[Event.PERSIST_STATE] == [ + (1, dummy_persist_state), + (2, dummy_persist_state), + ] + event_calls[Event.PERSIST_STATE].clear() + + # Test that removing all in bulk works + event_manager.off(event=Event.PERSIST_STATE) + event_manager.emit(event=Event.PERSIST_STATE, event_data=dummy_persist_state) + await asyncio.sleep(0.1) + assert event_calls[Event.PERSIST_STATE] == [] - await event_manager.init() + async def test_event_async_handling_local(self) -> None: + dummy_system_info = Mock() - event_calls = [] + async with EventManager() as event_manager: + event_calls = [] - def on_event(sleep_secs: int | None = None) -> Callable: async def event_handler(data: Any) -> None: nonlocal event_calls - if sleep_secs: - await asyncio.sleep(sleep_secs) + await asyncio.sleep(2) event_calls.append(data) - return event_handler - - # Create three handlers, all with a 
different sleep time, and add them - handler_1 = on_event(1) - handler_2 = on_event(2) - handler_3 = on_event(3) - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_1) - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_2) - event_manager.on(ActorEventTypes.SYSTEM_INFO, handler_3) - - # Emit the event, record the emitting time - emmitted_at = time.perf_counter() - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - - # Wait for all of the handlers to finish and check that it took the right amount of time - await event_manager.wait_for_all_listeners_to_complete() - - duration = time.perf_counter() - emmitted_at - assert duration > 2.8 - assert duration < 4 - assert event_calls == ['DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO'] - event_calls.clear() - - # Emit the event again, record the emitting time - emmitted_at = time.perf_counter() - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - - # Wait for all of the handlers to finish and check that it took the right amount of time - # This time add a timeout so that only 1 handler should have time to finish - await event_manager.wait_for_all_listeners_to_complete(timeout_secs=1.5) - - duration = time.perf_counter() - emmitted_at - assert duration > 1.3 - assert duration < 2 - assert event_calls == ['DUMMY_SYSTEM_INFO'] - await asyncio.sleep(2) - assert event_calls == ['DUMMY_SYSTEM_INFO'] - event_calls.clear() - - assert caplog.records[0].levelno == logging.WARNING - assert caplog.records[0].message == 'Timed out waiting for event listeners to complete, unfinished event listeners will be canceled' - - # Emit the event again, test that closing the event manager waits for the handlers to complete - emmitted_at = time.perf_counter() - event_manager.emit(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - - await event_manager.close() - - duration = time.perf_counter() - emmitted_at - assert duration > 2.8 - assert duration < 4 - assert event_calls == ['DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO', 'DUMMY_SYSTEM_INFO'] + # Test that async event handlers work, and that they don't block the main thread + event_manager.on(event=Event.SYSTEM_INFO, listener=event_handler) + event_manager.emit(event=Event.SYSTEM_INFO, event_data=dummy_system_info) + await asyncio.sleep(1) + assert event_calls == [] + await asyncio.sleep(2) + assert event_calls == [dummy_system_info] class TestEventManagerOnPlatform: @@ -280,14 +121,11 @@ async def test_lifecycle_on_platform_without_websocket( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, 'ws://localhost:56565') - - config = Configuration() - event_manager = EventManager(config) + event_manager = PlatformEventManager(Configuration.get_global_configuration()) with pytest.raises(RuntimeError, match='Error connecting to platform events websocket!'): - await event_manager.init() - - assert event_manager._initialized is False + async with event_manager: + pass async def test_lifecycle_on_platform(self: TestEventManagerOnPlatform, monkeypatch: pytest.MonkeyPatch) -> None: connected_ws_clients: set[websockets.server.WebSocketServerProtocol] = set() @@ -305,17 +143,8 @@ async def handler(websocket: websockets.server.WebSocketServerProtocol) -> None: port: int = ws_server.sockets[0].getsockname()[1] # type: ignore[index] monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, f'ws://localhost:{port}') - config = Configuration() - event_manager = 
EventManager(config) - - await event_manager.init() - assert event_manager._initialized is True - - assert len(connected_ws_clients) == 1 - - await event_manager.close() - - assert event_manager._initialized is False + async with PlatformEventManager(Configuration.get_global_configuration()): + assert len(connected_ws_clients) == 1 async def test_event_handling_on_platform( self: TestEventManagerOnPlatform, @@ -330,8 +159,8 @@ async def handler(websocket: websockets.server.WebSocketServerProtocol) -> None: finally: connected_ws_clients.remove(websocket) - async def send_platform_event(event_name: ActorEventTypes, data: Any = None) -> None: - message: dict[str, Any] = {'name': event_name} + async def send_platform_event(event_name: Event, data: Any = None) -> None: + message: dict[str, Any] = {'name': event_name.value} if data: message['data'] = data @@ -343,23 +172,29 @@ async def send_platform_event(event_name: ActorEventTypes, data: Any = None) -> port: int = ws_server.sockets[0].getsockname()[1] # type: ignore[index] monkeypatch.setenv(ActorEnvVars.EVENTS_WEBSOCKET_URL, f'ws://localhost:{port}') - config = Configuration() - event_manager = EventManager(config) - - await event_manager.init() - - event_calls = [] - event_manager.on(ActorEventTypes.SYSTEM_INFO, lambda data: event_calls.append(data)) - - # Test sending event with data - await send_platform_event(ActorEventTypes.SYSTEM_INFO, 'DUMMY_SYSTEM_INFO') - await asyncio.sleep(0.1) - assert event_calls == ['DUMMY_SYSTEM_INFO'] - event_calls.clear() - - # Test sending event without data - await send_platform_event(ActorEventTypes.SYSTEM_INFO) - await asyncio.sleep(0.1) - assert event_calls == [None] - - await event_manager.close() + dummy_system_info = { + 'memAvgBytes': 19328860.328293584, + 'memCurrentBytes': 65171456, + 'memMaxBytes': 65171456, + 'cpuAvgUsage': 2.0761105633130397, + 'cpuMaxUsage': 53.941134593993326, + 'cpuCurrentUsage': 8.45549815498155, + 'isCpuOverloaded': False, + 'createdAt': '2024-08-09T16:04:16.161Z', + } + SystemInfoEventData.model_validate(dummy_system_info) + + async with PlatformEventManager(Configuration.get_global_configuration()) as event_manager: + event_calls = [] + + def listener(data: Any) -> None: + event_calls.append(json.loads(data.model_dump_json(by_alias=True)) if data else None) + + event_manager.on(event=Event.SYSTEM_INFO, listener=listener) + + # Test sending event with data + await send_platform_event(Event.SYSTEM_INFO, dummy_system_info) + await asyncio.sleep(0.1) + assert len(event_calls) == 1 + assert event_calls[0]['cpuInfo']['usedRatio'] == 8.45549815498155 + event_calls.clear() diff --git a/tests/unit/test_lru_cache.py b/tests/unit/test_lru_cache.py deleted file mode 100644 index fe298ae6..00000000 --- a/tests/unit/test_lru_cache.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -import pytest - -from apify._utils import LRUCache - - -@pytest.fixture() -def lru_cache() -> LRUCache[int]: - cache = LRUCache[int](3) - cache['a'] = 1 - cache['c'] = 3 - cache['b'] = 2 - return cache - - -def test_get(lru_cache: LRUCache[int]) -> None: - # Key error with non-existent key - with pytest.raises(KeyError): - _ = lru_cache['non-existent-key'] - # None when using .get instead - assert lru_cache.get('non-existent-key') is None - # Should return correct value for existing key - assert lru_cache['c'] == 3 - # Check if order of keys changed based on LRU rule - for actual, target in zip(lru_cache, ['a', 'b', 'c']): - assert actual == target - - -def test_set(lru_cache: 
LRUCache[int]) -> None: - assert len(lru_cache) == 3 - lru_cache['d'] = 4 - # Check if max_length is not exceeded - assert len(lru_cache) == 3 - # Check if oldest key is removed - assert 'a' not in lru_cache - # Check if the newest addition is at the end - assert list(lru_cache.items())[-1] == ('d', 4) - - -def test_del(lru_cache: LRUCache[int]) -> None: - # Key error on non-existent key - with pytest.raises(KeyError): - del lru_cache['non-existent-key'] - # No error with existing key - len_before_del = len(lru_cache) - del lru_cache['a'] - assert len(lru_cache) == len_before_del - 1 - assert 'a' not in lru_cache - - -def test_len(lru_cache: LRUCache[int]) -> None: - assert len(lru_cache) == len(lru_cache._cache) - lru_cache.clear() - assert len(lru_cache) == 0 - - -def test_iter(lru_cache: LRUCache[int]) -> None: - assert list(lru_cache) == ['a', 'c', 'b'] diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index 03dc8b41..7074e395 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -1,15 +1,18 @@ +# ruff: noqa: ARG001 ARG005 from __future__ import annotations import asyncio import re -from typing import TYPE_CHECKING +from dataclasses import asdict +from typing import TYPE_CHECKING, Any import httpx import pytest + from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars -from apify.proxy_configuration import ProxyConfiguration, is_url +from apify._proxy_configuration import ProxyConfiguration, is_url if TYPE_CHECKING: from respx import MockRouter @@ -63,17 +66,17 @@ def test__fails_with_invalid_arguments(self: TestProxyConfiguration) -> None: with pytest.raises(ValueError, match=re.escape(str(invalid_country_code))): ProxyConfiguration(country_code=invalid_country_code) # type: ignore - with pytest.raises(ValueError, match='Cannot combine custom proxies in "proxy_urls" with custom generating function in "new_url_function".'): - ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], new_url_function=lambda _: 'http://proxy.com:2222') + with pytest.raises(ValueError, match='Exactly one of .* must be specified'): + ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222') with pytest.raises(ValueError, match='Cannot combine custom proxies with Apify Proxy'): ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], groups=['GROUP1']) - with pytest.raises(ValueError, match=re.escape('proxy_urls[0] ("http://bad-url") is not a valid URL')): - ProxyConfiguration(proxy_urls=['http://bad-url']) + with pytest.raises(ValueError, match=re.escape('bad-url')): + ProxyConfiguration(proxy_urls=['bad-url']) with pytest.raises(ValueError, match='Cannot combine custom proxies with Apify Proxy'): - ProxyConfiguration(new_url_function=lambda _: 'http://proxy.com:2222', groups=['GROUP1']) + ProxyConfiguration(new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222', groups=['GROUP1']) class TestProxyConfigurationNewUrl: @@ -104,7 +107,7 @@ async def test_new_url_session_id(self: TestProxyConfigurationNewUrl) -> None: country_code=country_code, ) - session_ids: list[str | int] = [ + session_ids: list[str] = [ 'a', 'a_b', 'a_2', @@ -112,7 +115,7 @@ async def test_new_url_session_id(self: TestProxyConfigurationNewUrl) -> None: 'aaa~BBB', '1', '0.34252352', - 123456, + '123456', 'XXXXXXXXXXxxxxxxxxxxXXXXXXXXXXxxxxxxxxxxXXXXXXXXXX', ] for session_id in session_ids: @@ -171,7 +174,7 @@ async def 
test_custom_new_url_function(self: TestProxyConfigurationNewUrl) -> No 'http://proxy.com:6666', ] - def custom_new_url_function(_session_id: str | None) -> str: + def custom_new_url_function(session_id: str | None = None, request: Any = None) -> str: nonlocal custom_urls return custom_urls.pop() @@ -190,7 +193,7 @@ async def test_custom_new_url_function_async(self: TestProxyConfigurationNewUrl) 'http://proxy.com:6666', ] - async def custom_new_url_function(_session_id: str | None) -> str: + async def custom_new_url_function(session_id: str | None = None, request: Any = None) -> str: nonlocal custom_urls await asyncio.sleep(0.1) return custom_urls.pop() @@ -201,7 +204,7 @@ async def custom_new_url_function(_session_id: str | None) -> str: assert await proxy_configuration.new_url() == custom_url async def test_invalid_custom_new_url_function(self: TestProxyConfigurationNewUrl) -> None: - def custom_new_url_function(_session_id: str | None) -> str: + def custom_new_url_function(session_id: str | None = None, request: Any = None) -> str: raise ValueError proxy_configuration = ProxyConfiguration(new_url_function=custom_new_url_function) @@ -245,13 +248,15 @@ async def test_new_proxy_info_basic(self: TestProxyConfigurationNewProxyInfo) -> password=password, country_code=country_code, ) + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None expected_hostname = 'proxy.apify.com' expected_port = 8000 expected_username = f'groups-{"+".join(groups)},country-{country_code}' - assert proxy_info == { + assert asdict(proxy_info) == { 'url': f'http://{expected_username}:{password}@{expected_hostname}:{expected_port}', 'hostname': expected_hostname, 'port': expected_port, @@ -259,18 +264,38 @@ async def test_new_proxy_info_basic(self: TestProxyConfigurationNewProxyInfo) -> 'country_code': country_code, 'username': expected_username, 'password': password, + 'proxy_tier': None, + 'session_id': None, + 'scheme': 'http', } async def test_new_proxy_info_rotates_urls(self: TestProxyConfigurationNewProxyInfo) -> None: proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[2] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[2] + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info() + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] async def test_new_proxy_info_rotates_urls_with_sessions(self: 
TestProxyConfigurationNewProxyInfo) -> None: sessions = ['sesssion_01', 'sesssion_02', 'sesssion_03', 'sesssion_04', 'sesssion_05', 'sesssion_06'] @@ -279,20 +304,47 @@ async def test_new_proxy_info_rotates_urls_with_sessions(self: TestProxyConfigur proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) # same session should use same proxy URL - assert (await proxy_configuration.new_proxy_info(sessions[0]))['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info(sessions[0]))['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info(sessions[0]))['url'] == proxy_urls[0] + proxy_info = await proxy_configuration.new_proxy_info(sessions[0]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[0]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[0]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] # different sessions should rotate different proxies - assert (await proxy_configuration.new_proxy_info(sessions[1]))['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info(sessions[2]))['url'] == proxy_urls[2] - assert (await proxy_configuration.new_proxy_info(sessions[3]))['url'] == proxy_urls[0] - assert (await proxy_configuration.new_proxy_info(sessions[4]))['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info(sessions[5]))['url'] == proxy_urls[2] + proxy_info = await proxy_configuration.new_proxy_info(sessions[1]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[2]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[3]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[4]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[5]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[2] # already used sessions should be remembered - assert (await proxy_configuration.new_proxy_info(sessions[1]))['url'] == proxy_urls[1] - assert (await proxy_configuration.new_proxy_info(sessions[3]))['url'] == proxy_urls[0] + proxy_info = await proxy_configuration.new_proxy_info(sessions[1]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[1] + + proxy_info = await proxy_configuration.new_proxy_info(sessions[3]) + assert proxy_info is not None + assert proxy_info.url == proxy_urls[0] @pytest.fixture() diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py deleted file mode 100644 index 3847f5fb..00000000 --- a/tests/unit/test_utils.py +++ /dev/null @@ -1,368 +0,0 @@ -from __future__ import annotations - -import asyncio -import contextlib -import os -import time -from collections import OrderedDict -from datetime import datetime, timezone -from typing import TYPE_CHECKING - -import pytest -from aiofiles.os import mkdir -from apify_shared.consts import ActorEnvVars, ApifyEnvVars - -from apify._utils import ( - budget_ow, - compute_short_hash, - compute_unique_key, - fetch_and_parse_env_var, - force_remove, - force_rename, - get_cpu_usage_percent, - get_memory_usage_bytes, - guess_file_extension, - maybe_parse_bool, - maybe_parse_datetime, 
- maybe_parse_int, - normalize_url, - raise_on_duplicate_storage, - raise_on_non_existing_storage, - run_func_at_interval_async, - unique_key_to_request_id, -) -from apify.consts import StorageTypes - -if TYPE_CHECKING: - from pathlib import Path - - -def test__fetch_and_parse_env_var(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'True') - monkeypatch.setenv(ActorEnvVars.MEMORY_MBYTES, '1024') - monkeypatch.setenv(ApifyEnvVars.META_ORIGIN, 'API') - monkeypatch.setenv(ActorEnvVars.STARTED_AT, '2022-12-02T15:19:34.907Z') - monkeypatch.setenv('DUMMY_BOOL', '1') - monkeypatch.setenv('DUMMY_DATETIME', '2022-12-02T15:19:34.907Z') - monkeypatch.setenv('DUMMY_INT', '1') - monkeypatch.setenv('DUMMY_STRING', 'DUMMY') - - assert fetch_and_parse_env_var(ApifyEnvVars.IS_AT_HOME) is True - assert fetch_and_parse_env_var(ActorEnvVars.MEMORY_MBYTES) == 1024 - assert fetch_and_parse_env_var(ApifyEnvVars.META_ORIGIN) == 'API' - assert fetch_and_parse_env_var(ActorEnvVars.STARTED_AT) == datetime(2022, 12, 2, 15, 19, 34, 907000, tzinfo=timezone.utc) - - assert fetch_and_parse_env_var('DUMMY_BOOL') == '1' # type: ignore - assert fetch_and_parse_env_var('DUMMY_DATETIME') == '2022-12-02T15:19:34.907Z' # type: ignore - assert fetch_and_parse_env_var('DUMMY_INT') == '1' # type: ignore - assert fetch_and_parse_env_var('DUMMY_STRING') == 'DUMMY' # type: ignore - assert fetch_and_parse_env_var('NONEXISTENT_ENV_VAR') is None # type: ignore - assert fetch_and_parse_env_var('NONEXISTENT_ENV_VAR', 'default') == 'default' # type: ignore - - -def test__get_cpu_usage_percent() -> None: - assert get_cpu_usage_percent() >= 0 - assert get_cpu_usage_percent() <= 100 - - -def test__get_memory_usage_bytes() -> None: - assert get_memory_usage_bytes() >= 0 - assert get_memory_usage_bytes() <= 1024 * 1024 * 1024 * 1024 - - -def test__maybe_parse_bool() -> None: - assert maybe_parse_bool('True') is True - assert maybe_parse_bool('true') is True - assert maybe_parse_bool('1') is True - assert maybe_parse_bool('False') is False - assert maybe_parse_bool('false') is False - assert maybe_parse_bool('0') is False - assert maybe_parse_bool(None) is False - assert maybe_parse_bool('bflmpsvz') is False - - -def test__maybe_parse_datetime() -> None: - assert maybe_parse_datetime('2022-12-02T15:19:34.907Z') == datetime(2022, 12, 2, 15, 19, 34, 907000, tzinfo=timezone.utc) - assert maybe_parse_datetime('2022-12-02T15:19:34.907') == '2022-12-02T15:19:34.907' - assert maybe_parse_datetime('anything') == 'anything' - - -def test__maybe_parse_int() -> None: - assert maybe_parse_int('0') == 0 - assert maybe_parse_int('1') == 1 - assert maybe_parse_int('-1') == -1 - assert maybe_parse_int('136749825') == 136749825 - assert maybe_parse_int('') is None - assert maybe_parse_int('abcd') is None - - -async def test__run_func_at_interval_async__sync_function() -> None: - # Test that it works with a synchronous functions - interval = 1.0 - initial_delay = 0.5 - increments = 3 - - test_var = 0 - - def sync_increment() -> None: - nonlocal test_var - test_var += 1 - - started_at = time.perf_counter() - sync_increment_task = asyncio.create_task(run_func_at_interval_async(sync_increment, interval)) - - try: - await asyncio.sleep(initial_delay) - - for i in range(increments): - assert test_var == i - - now = time.perf_counter() - sleep_until = started_at + initial_delay + (i + 1) * interval - sleep_for_secs = sleep_until - now - await asyncio.sleep(sleep_for_secs) - - assert test_var == increments - finally: - 
sync_increment_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await sync_increment_task - - await asyncio.sleep(1.5) - assert test_var == increments - - -async def test__run_func_at_interval_async_async__function() -> None: - # Test that it works with an asynchronous functions - interval = 1.0 - initial_delay = 0.5 - increments = 3 - - test_var = 0 - - async def async_increment() -> None: - nonlocal test_var - await asyncio.sleep(0.1) - test_var += 1 - - started_at = time.perf_counter() - async_increment_task = asyncio.create_task(run_func_at_interval_async(async_increment, interval)) - - try: - await asyncio.sleep(initial_delay) - - for i in range(increments): - assert test_var == i - - now = time.perf_counter() - sleep_until = started_at + initial_delay + (i + 1) * interval - sleep_for_secs = sleep_until - now - await asyncio.sleep(sleep_for_secs) - - assert test_var == increments - finally: - async_increment_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await async_increment_task - - await asyncio.sleep(1.5) - assert test_var == increments - - -async def test__force_remove(tmp_path: Path) -> None: - test_file_path = os.path.join(tmp_path, 'test.txt') - # Does not crash/raise when the file does not exist - assert os.path.exists(test_file_path) is False - await force_remove(test_file_path) - assert os.path.exists(test_file_path) is False - - # Removes the file if it exists - with open(test_file_path, 'a', encoding='utf-8'): # noqa: ASYNC230 - pass - assert os.path.exists(test_file_path) is True - await force_remove(test_file_path) - assert os.path.exists(test_file_path) is False - - -def test__raise_on_non_existing_storage() -> None: - with pytest.raises(ValueError, match='Dataset with id "kckxQw6j6AtrgyA09" does not exist.'): - raise_on_non_existing_storage(StorageTypes.DATASET, 'kckxQw6j6AtrgyA09') - - -def test__raise_on_duplicate_storage() -> None: - with pytest.raises(ValueError, match='Dataset with name "test" already exists.'): - raise_on_duplicate_storage(StorageTypes.DATASET, 'name', 'test') - - -def test__guess_file_extension() -> None: - # Can guess common types properly - assert guess_file_extension('application/json') == 'json' - assert guess_file_extension('application/xml') == 'xml' - assert guess_file_extension('text/plain') == 'txt' - - # Can handle unusual formats - assert guess_file_extension(' application/json ') == 'json' - assert guess_file_extension('APPLICATION/JSON') == 'json' - assert guess_file_extension('application/json;charset=utf-8') == 'json' - - # Returns None for non-existent content types - assert guess_file_extension('clearly not a content type') is None - assert guess_file_extension('') is None - - -def test__unique_key_to_request_id() -> None: - # Right side from `uniqueKeyToRequestId` in Crawlee - assert unique_key_to_request_id('abc') == 'ungWv48BzpBQUDe' - assert unique_key_to_request_id('test') == 'n4bQgYhMfWWaLqg' - - -async def test__force_rename(tmp_path: Path) -> None: - src_dir = os.path.join(tmp_path, 'src') - dst_dir = os.path.join(tmp_path, 'dst') - src_file = os.path.join(src_dir, 'src_dir.txt') - dst_file = os.path.join(dst_dir, 'dst_dir.txt') - # Won't crash if source directory does not exist - assert os.path.exists(src_dir) is False - await force_rename(src_dir, dst_dir) - - # Will remove dst_dir if it exists (also covers normal case) - # Create the src_dir with a file in it - await mkdir(src_dir) - with open(src_file, 'a', encoding='utf-8'): # noqa: ASYNC230 - pass - # Create the dst_dir with 
a file in it - await mkdir(dst_dir) - with open(dst_file, 'a', encoding='utf-8'): # noqa: ASYNC230 - pass - assert os.path.exists(src_file) is True - assert os.path.exists(dst_file) is True - await force_rename(src_dir, dst_dir) - assert os.path.exists(src_dir) is False - assert os.path.exists(dst_file) is False - # src_dir.txt should exist in dst_dir - assert os.path.exists(os.path.join(dst_dir, 'src_dir.txt')) is True - - -def test__budget_ow() -> None: - budget_ow( - { - 'a': 123, - 'b': 'string', - 'c': datetime.now(timezone.utc), - }, - { - 'a': (int, True), - 'b': (str, False), - 'c': (datetime, True), - }, - ) - with pytest.raises(ValueError, match='required'): - budget_ow({}, {'id': (str, True)}) - with pytest.raises(ValueError, match='must be of type'): - budget_ow({'id': 123}, {'id': (str, True)}) - # Check if subclasses pass the check - budget_ow( - { - 'ordered_dict': OrderedDict(), - }, - { - 'ordered_dict': (dict, False), - }, - ) - - -def test_get_short_base64_hash_with_known_input() -> None: - data = b'Hello world!' - expected_hash = 'c0535e4b' - assert compute_short_hash(data) == expected_hash, 'The hash does not match the expected output' - - -def test_get_short_base64_hash_with_empty_input() -> None: - data = b'' - expected_hash = 'e3b0c442' - assert compute_short_hash(data) == expected_hash, 'The hash for an empty input should follow the expected pattern' - - -def test_get_short_base64_hash_output_length() -> None: - data = b'some random data' - assert len(compute_short_hash(data)) == 8, 'The output hash should be 8 characters long' - - -def test_get_short_base64_hash_differentiates_input() -> None: - data1 = b'input 1' - data2 = b'input 2' - assert compute_short_hash(data1) != compute_short_hash(data2), 'Different inputs should produce different hashes' - - -@pytest.mark.parametrize( - ('url', 'expected_output', 'keep_url_fragment'), - [ - ('https://example.com/?utm_source=test&utm_medium=test&key=value', 'https://example.com?key=value', False), - ('http://example.com/?key=value&another_key=another_value', 'http://example.com?another_key=another_value&key=value', False), - ('HTTPS://EXAMPLE.COM/?KEY=VALUE', 'https://example.com?key=value', False), - ('', '', False), - ('http://example.com/#fragment', 'http://example.com#fragment', True), - ('http://example.com/#fragment', 'http://example.com', False), - (' https://example.com/ ', 'https://example.com', False), - ('http://example.com/?b=2&a=1', 'http://example.com?a=1&b=2', False), - ], - ids=[ - 'remove_utm_params', - 'retain_sort_non_utm_params', - 'convert_scheme_netloc_to_lowercase', - 'handle_empty_url', - 'retain_fragment', - 'remove_fragment', - 'trim_whitespace', - 'sort_query_params', - ], -) -def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: bool) -> None: - output = normalize_url(url, keep_url_fragment=keep_url_fragment) - assert output == expected_output - - -@pytest.mark.parametrize( - ('url', 'method', 'payload', 'keep_url_fragment', 'use_extended_unique_key', 'expected_output'), - [ - ('http://example.com', 'GET', None, False, False, 'http://example.com'), - ('http://example.com', 'POST', None, False, False, 'http://example.com'), - ('http://example.com', 'GET', b'data', False, False, 'http://example.com'), - ('http://example.com', 'GET', b'data', False, True, 'GET(3a6eb079):http://example.com'), - ('http://example.com', 'POST', b'data', False, True, 'POST(3a6eb079):http://example.com'), - ('http://example.com#fragment', 'GET', None, True, False, 'http://example.com#fragment'), 
- ('http://example.com#fragment', 'GET', None, False, False, 'http://example.com'), - ('http://example.com', 'DELETE', b'test', False, True, 'DELETE(9f86d081):http://example.com'), - ('https://example.com?utm_content=test', 'GET', None, False, False, 'https://example.com'), - ('https://example.com?utm_content=test', 'GET', None, True, False, 'https://example.com'), - ], - ids=[ - 'simple_get', - 'simple_post', - 'get_with_payload', - 'get_with_payload_extended', - 'post_with_payload_extended', - 'get_with_fragment', - 'get_remove_fragment', - 'delete_with_payload_extended', - 'get_remove_utm', - 'get_keep_utm_fragment', - ], -) -def test_compute_unique_key( - url: str, - method: str, - payload: bytes | None, - *, - keep_url_fragment: bool, - use_extended_unique_key: bool, - expected_output: str, -) -> None: - output = compute_unique_key( - url, - method, - payload, - keep_url_fragment=keep_url_fragment, - use_extended_unique_key=use_extended_unique_key, - ) - assert output == expected_output diff --git a/website/generate_module_shortcuts.py b/website/generate_module_shortcuts.py index 1e245cc1..f671ea9e 100755 --- a/website/generate_module_shortcuts.py +++ b/website/generate_module_shortcuts.py @@ -19,7 +19,7 @@ def get_module_shortcuts(module, parent_classes=None): shortcuts[f'{module.__name__}.{classname}'] = f'{parent_module_name}.{classname}' for _, submodule in inspect.getmembers(module, inspect.ismodule): - if (submodule.__name__.startswith('apify')): + if submodule.__name__.startswith('apify'): shortcuts.update(get_module_shortcuts(submodule, module_classes)) return shortcuts diff --git a/website/src/pages/index.js b/website/src/pages/index.js index f98dca94..8877d47a 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -15,10 +15,10 @@ function Hero() {

-                        Apify SDK for Python is a toolkit for building actors
+                        Apify SDK for Python is a toolkit for building Actors
-                        Apify SDK for Python is a toolkit for building actors
+                        Apify SDK for Python is a toolkit for building Actors
@@ -27,7 +27,7 @@ function Hero() {

                        The Apify SDK for Python is the official library for creating Apify Actors in Python.
-                        It provides useful features like actor lifecycle management, local storage emulation, and actor event handling.
+                        It provides useful features like Actor lifecycle management, local storage emulation, and Actor event handling.

@@ -66,8 +66,8 @@ export default function Home() {

-                        For example, the Apify SDK makes it easy to read the actor input with the Actor.get_input() method,
-                        and to save scraped data from your actors to a dataset
+                        For example, the Apify SDK makes it easy to read the Actor input with the Actor.get_input() method,
+                        and to save scraped data from your Actors to a dataset
                        {' '}by simply using the Actor.push_data() method.
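
A minimal, illustrative sketch of how the two methods mentioned in the copy above, Actor.get_input() and Actor.push_data(), are typically combined in an Actor entry point. The main() coroutine name and the 'start_url' input field are assumptions made for this example and are not part of this diff.

from apify import Actor


async def main() -> None:
    # Initialization and a graceful exit are handled by the Actor context manager.
    async with Actor:
        # Read the Actor input; it may be None when no input was provided.
        actor_input = await Actor.get_input() or {}
        start_url = actor_input.get('start_url')  # 'start_url' is a hypothetical input field

        # Push a scraped item into the Actor's default dataset.
        await Actor.push_data({'url': start_url})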