From 8e2f5d4eb6c1d9fef59359b0532a6fa6a50062a6 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 26 Aug 2025 13:26:51 +0200 Subject: [PATCH 01/26] Draft for tests --- .../_apify/_request_queue_client.py | 107 ++---------------- 1 file changed, 9 insertions(+), 98 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index ec94f201..14af8641 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -17,7 +17,7 @@ from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata -from ._models import CachedRequest, ProlongRequestLockResponse, RequestQueueHead +from ._models import CachedRequest, RequestQueueHead from apify import Request if TYPE_CHECKING: @@ -498,11 +498,6 @@ async def reclaim_request( if forefront: self._should_check_for_forefront_requests = True - # Try to release the lock on the request - try: - await self._delete_request_lock(request.unique_key, forefront=forefront) - except Exception as err: - logger.debug(f'Failed to delete request lock for request {request.unique_key}', exc_info=err) except Exception as exc: logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') return None @@ -516,10 +511,10 @@ async def is_empty(self) -> bool: Returns: True if the queue is empty, False otherwise. """ - # Check _list_head and self._queue_has_locked_requests with lock to make sure they are consistent. + # Check _list_head. # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition. 
async with self._fetch_lock: - head = await self._list_head(limit=1, lock_time=None) + head = await self._list_head(limit=1) return len(head.items) == 0 and not self._queue_has_locked_requests async def _ensure_head_is_non_empty(self) -> None: @@ -529,7 +524,7 @@ async def _ensure_head_is_non_empty(self) -> None: return # Fetch requests from the API and populate the queue head - await self._list_head(lock_time=self._DEFAULT_LOCK_TIME) + await self._list_head() async def _get_or_hydrate_request(self, unique_key: str) -> Request | None: """Get a request by unique key, either from cache or by fetching from API. @@ -544,32 +539,16 @@ async def _get_or_hydrate_request(self, unique_key: str) -> Request | None: cached_entry = self._requests_cache.get(unique_key) if cached_entry and cached_entry.hydrated: - # If we have the request hydrated in cache, check if lock is expired - if cached_entry.lock_expires_at and cached_entry.lock_expires_at < datetime.now(tz=timezone.utc): - # Try to prolong the lock if it's expired - try: - lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) - response = await self._prolong_request_lock(unique_key, lock_secs=lock_secs) - cached_entry.lock_expires_at = response.lock_expires_at - except Exception: - # If prolonging the lock fails, we lost the request - logger.debug(f'Failed to prolong lock for request {unique_key}, returning None') - return None - + # If we have the request hydrated in cache, return it return cached_entry.hydrated # If not in cache or not hydrated, fetch the request try: - # Try to acquire or prolong the lock - lock_secs = int(self._DEFAULT_LOCK_TIME.total_seconds()) - await self._prolong_request_lock(unique_key, lock_secs=lock_secs) - # Fetch the request data request = await self.get_request(unique_key) - # If request is not found, release lock and return None + # If request is not found and return None if not request: - await self._delete_request_lock(unique_key) return None # Update cache with hydrated request @@ 
-584,7 +563,7 @@ async def _get_or_hydrate_request(self, unique_key: str) -> Request | None: hydrated_request=request, ) except Exception as exc: - logger.debug(f'Error fetching or locking request {unique_key}: {exc!s}') + logger.debug(f'Error fetching request {unique_key}: {exc!s}') return None else: return request @@ -618,14 +597,11 @@ async def _update_request( async def _list_head( self, *, - lock_time: timedelta | None = None, limit: int = 25, ) -> RequestQueueHead: """Retrieve requests from the beginning of the queue. Args: - lock_time: Duration for which to lock the retrieved requests. - If None, requests will not be locked. limit: Maximum number of requests to retrieve. Returns: @@ -648,8 +624,8 @@ async def _list_head( had_multiple_clients=metadata.had_multiple_clients, queue_modified_at=metadata.modified_at, items=items, + lock_time=None, queue_has_locked_requests=self._queue_has_locked_requests, - lock_time=lock_time, ) leftover_buffer = list[str]() if self._should_check_for_forefront_requests: @@ -658,11 +634,7 @@ async def _list_head( self._should_check_for_forefront_requests = False # Otherwise fetch from API - lock_time = lock_time or self._DEFAULT_LOCK_TIME - lock_secs = int(lock_time.total_seconds()) - - response = await self._api_client.list_and_lock_head( - lock_secs=lock_secs, + response = await self._api_client.list_head( limit=limit, ) @@ -701,67 +673,6 @@ async def _list_head( self._queue_head.append(leftover_unique_key) return RequestQueueHead.model_validate(response) - async def _prolong_request_lock( - self, - unique_key: str, - *, - lock_secs: int, - ) -> ProlongRequestLockResponse: - """Prolong the lock on a specific request in the queue. - - Args: - unique_key: Unique key of the request whose lock is to be prolonged. - lock_secs: The additional amount of time, in seconds, that the request will remain locked. - - Returns: - A response containing the time at which the lock will expire. 
- """ - response = await self._api_client.prolong_request_lock( - request_id=unique_key_to_request_id(unique_key), - # All requests reaching this code were the tip of the queue at the moment when they were fetched, - # so if their lock expires, they should be put back to the forefront as their handling is long overdue. - forefront=True, - lock_secs=lock_secs, - ) - - result = ProlongRequestLockResponse( - lock_expires_at=datetime.fromisoformat(response['lockExpiresAt'].replace('Z', '+00:00')) - ) - - # Update the cache with the new lock expiration - for cached_request in self._requests_cache.values(): - if cached_request.unique_key == unique_key: - cached_request.lock_expires_at = result.lock_expires_at - break - - return result - - async def _delete_request_lock( - self, - unique_key: str, - *, - forefront: bool = False, - ) -> None: - """Delete the lock on a specific request in the queue. - - Args: - unique_key: Unique key of the request to delete the lock. - forefront: Whether to put the request in the beginning or the end of the queue after the lock is deleted. 
- """ - try: - await self._api_client.delete_request_lock( - request_id=unique_key_to_request_id(unique_key), - forefront=forefront, - ) - - # Update the cache to remove the lock - for cached_request in self._requests_cache.values(): - if cached_request.unique_key == unique_key: - cached_request.lock_expires_at = None - break - except Exception as err: - logger.debug(f'Failed to delete request lock for request {unique_key}', exc_info=err) - def _cache_request( self, cache_key: str, From 1d869a48c3065ed9225b3b2037acf0a2366750f7 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 27 Aug 2025 10:33:50 +0200 Subject: [PATCH 02/26] Updated draft --- .../_apify/_request_queue_client.py | 385 +++++------------- 1 file changed, 97 insertions(+), 288 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 14af8641..236a49ec 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -17,7 +17,6 @@ from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata -from ._models import CachedRequest, RequestQueueHead from apify import Request if TYPE_CHECKING: @@ -54,7 +53,26 @@ def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> class ApifyRequestQueueClient(RequestQueueClient): - """An Apify platform implementation of the request queue client.""" + """An Apify platform implementation of the request queue client with limited capability. + + This client is designed to use as little resources as possible, but has to be used in constrained context. + Constraints: + - Only one client is consuming the request queue at the time. + - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to be handled + sooner. 
(Explanation below) + - This client always consumes first own requests and only if no local requests exists it tries to get requests from + the global queue. + + If the constraints are not met, the client might work in an unpredictable way. + + Optimization notes: + - The client aggressively caches requests to avoid unnecessary API calls. + - The client adds requests to the global queue if they are handled. + - The client adds unhandled requests to the global queue only if local cache size reaches some threshold or based on + external callback. (To prevent double API call per request - adding request to the global queue and marking it as + handled. The client tries to do that in one step if possible.) + - The client tracks own forefront (priority requests), that does not have to be in sync with the global forefront. + """ _DEFAULT_LOCK_TIME: Final[timedelta] = timedelta(minutes=3) """The default lock time for requests in the queue.""" @@ -78,32 +96,25 @@ def __init__( self._metadata = metadata """Additional data related to the RequestQueue.""" - self._queue_head = deque[str]() - """A deque to store request unique keys in the queue head.""" - - self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) """A cache to store request objects. Request unique key is used as the cache key.""" - self._queue_has_locked_requests: bool | None = None - """Whether the queue has requests locked by another client.""" + self._head_requests: deque[str] = deque() + """Ordered unique keys of requests that that represents queue head.""" + + self._requests_on_platform: set[str] = set() + """Set of requests unique keys that are already present on the platform. 
To enable local deduplication.""" - self._should_check_for_forefront_requests = False - """Whether to check for forefront requests in the next list_head call.""" + self._requests_in_progress: set[str] = set() + """Set of requests unique keys that are being processed locally. + + - To avoid double processing of requests that have been processing for a long time, got unlocked on the platform + and got fetched again from platform. (Rare edge case.) + - To help decide if the RQ is finished or not.""" self._fetch_lock = asyncio.Lock() """Fetch lock to minimize race conditions when communicating with API.""" - async def _get_metadata_estimate(self) -> RequestQueueMetadata: - """Try to get cached metadata first. If multiple clients, fuse with global metadata. - - This method is used internally to avoid unnecessary API call unless needed (multiple clients). - Local estimation of metadata is without delay, unlike metadata from API. In situation where there is only one - client, it is the better choice. - """ - if self._metadata.had_multiple_clients: - return await self.get_metadata() - # Get local estimation (will not include changes done bo another client) - return self._metadata @override async def get_metadata(self) -> RequestQueueMetadata: @@ -113,6 +124,7 @@ async def get_metadata(self) -> RequestQueueMetadata: Metadata from the API, merged with local estimation, because in some cases, the data from the API can be delayed. """ + # TODO response = await self._api_client.get() if response is None: raise ValueError('Failed to fetch request queue metadata from the API.') @@ -160,6 +172,7 @@ async def open( are provided, or if neither `id` nor `name` is provided and no default storage ID is available in the configuration. 
""" + # Could be shared with the normal ApifyRequestQueueClient token = configuration.token if not token: raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') @@ -238,6 +251,7 @@ async def open( @override async def purge(self) -> None: + # Could be shared with the normal ApifyRequestQueueClient raise NotImplementedError( 'Purging the request queue is not supported in the Apify platform. ' 'Use the `drop` method to delete the request queue instead.' @@ -245,6 +259,7 @@ async def purge(self) -> None: @override async def drop(self) -> None: + # Could be shared with the normal ApifyRequestQueueClient await self._api_client.delete() @override @@ -265,14 +280,11 @@ async def add_batch_of_requests( """ # Do not try to add previously added requests to avoid pointless expensive calls to API - new_requests: list[Request] = [] + new_requests: list[ProcessedRequest] = [] already_present_requests: list[ProcessedRequest] = [] for request in requests: if self._requests_cache.get(request.unique_key): - # We are not sure if it was already handled at this point, and it is not worth calling API for it. - # It could have been handled by another client in the meantime, so cached information about - # `request.was_already_handled` is not reliable. already_present_requests.append( ProcessedRequest.model_validate( { @@ -284,61 +296,32 @@ async def add_batch_of_requests( ) else: - # Add new request to the cache. - processed_request = ProcessedRequest.model_validate( - { - 'uniqueKey': request.unique_key, - 'wasAlreadyPresent': True, - 'wasAlreadyHandled': request.was_already_handled, - } - ) - self._cache_request( - request.unique_key, - processed_request, - ) - new_requests.append(request) - - if new_requests: - # Prepare requests for API by converting to dictionaries. - requests_dict = [ - request.model_dump( - by_alias=True, - exclude={'id'}, # Exclude ID fields from requests since the API doesn't accept them. 
+ new_requests.append( + ProcessedRequest.model_validate( + { + 'uniqueKey': request.unique_key, + 'wasAlreadyPresent': False, + 'wasAlreadyHandled': request.was_already_handled, + } + ) ) - for request in new_requests - ] - # Send requests to API. - api_response = AddRequestsResponse.model_validate( - await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) - ) - # Add the locally known already present processed requests based on the local cache. - api_response.processed_requests.extend(already_present_requests) + # Update local caches + self._requests_cache[request.unique_key] = request + if forefront: + self._head_requests.append(request.unique_key) + else: + self._head_requests.appendleft(request.unique_key) - # Remove unprocessed requests from the cache - for unprocessed_request in api_response.unprocessed_requests: - self._requests_cache.pop(unprocessed_request.unique_key, None) - else: - api_response = AddRequestsResponse.model_validate( - {'unprocessedRequests': [], 'processedRequests': already_present_requests} - ) - - logger.debug( - f'Tried to add new requests: {len(new_requests)}, ' - f'succeeded to add new requests: {len(api_response.processed_requests) - len(already_present_requests)}, ' - f'skipped already present requests: {len(already_present_requests)}' + api_response = AddRequestsResponse.model_validate( + {'unprocessedRequests': [], 'processedRequests': already_present_requests+new_requests} ) - # Update assumed total count for newly added requests. - new_request_count = 0 - for processed_request in api_response.processed_requests: - if not processed_request.was_already_present and not processed_request.was_already_handled: - new_request_count += 1 - - self._metadata.total_request_count += new_request_count + # Update assumed total count for newly added requests. 
+ self._metadata.total_request_count += len(new_requests) return api_response @override @@ -351,6 +334,9 @@ async def get_request(self, unique_key: str) -> Request | None: Returns: The request or None if not found. """ + if unique_key in self._requests_cache: + return self._requests_cache[unique_key] + response = await self._api_client.get_request(unique_key_to_request_id(unique_key)) if response is None: @@ -370,45 +356,35 @@ async def fetch_next_request(self) -> Request | None: Returns: The request or `None` if there are no more pending requests. """ - # Ensure the queue head has requests if available. Fetching the head with lock to prevent race conditions. async with self._fetch_lock: await self._ensure_head_is_non_empty() - # If queue head is empty after ensuring, there are no requests - if not self._queue_head: - return None - - # Get the next request ID from the queue head - next_unique_key = self._queue_head.popleft() - - request = await self._get_or_hydrate_request(next_unique_key) - - # Handle potential inconsistency where request might not be in the main table yet - if request is None: - logger.debug( - 'Cannot find a request from the beginning of queue, will be retried later', - extra={'nextRequestUniqueKey': next_unique_key}, - ) + while self._head_requests: + request_unique_key = self._head_requests.pop() + if request_unique_key not in self._requests_in_progress: + self._requests_in_progress.add(request_unique_key) + return await self.get_request(request_unique_key) + # No request locally and the ones returned from the platform are already in progress. return None - # If the request was already handled, skip it - if request.handled_at is not None: - logger.debug( - 'Request fetched from the beginning of queue was already handled', - extra={'nextRequestUniqueKey': next_unique_key}, - ) - return None - - # Use get request to ensure we have the full request object. 
- request = await self.get_request(request.unique_key) - if request is None: - logger.debug( - 'Request fetched from the beginning of queue was not found in the RQ', - extra={'nextRequestUniqueKey': next_unique_key}, - ) - return None + async def _ensure_head_is_non_empty(self) -> None: + """Ensure that the queue head has requests if they are available in the queue.""" + if not self._head_requests: + response = await self._api_client.list_and_lock_head(limit=25, + lock_secs=int(self._DEFAULT_LOCK_TIME.total_seconds())) + # Update the queue head cache + self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) + # Check if there is another client working with the RequestQueue + self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) + if modified_at:= response.get('queueModifiedAt'): + self._metadata.modified_at = max(self._metadata.modified_at, modified_at) + + for request_data in response.get('items', []): + request = Request.model_validate(request_data) + self._requests_cache[request.unique_key] = request + self._head_requests.append(request.unique_key) + self._requests_on_platform.add(request.unique_key) - return request @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: @@ -423,27 +399,23 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | Information about the queue operation. `None` if the given request was not in progress. """ # Set the handled_at timestamp if not already set + if request.handled_at is None: request.handled_at = datetime.now(tz=timezone.utc) + self._metadata.handled_request_count += 1 if cached_request := self._requests_cache[request.unique_key]: - cached_request.was_already_handled = request.was_already_handled + cached_request.handled_at = request.handled_at try: # Update the request in the API + # Works as upsert - adds the request if it does not exist yet. 
(Local request that was handled before adding + # to the queue.) processed_request = await self._update_request(request) - processed_request.unique_key = request.unique_key - - # Update assumed handled count if this wasn't already handled - if not processed_request.was_already_handled: - self._metadata.handled_request_count += 1 - - # Update the cache with the handled request - cache_key = request.unique_key - self._cache_request( - cache_key, - processed_request, - hydrated_request=request, - ) + # Remove request from cache. It will no longer bee needed. + self._requests_cache.pop(request.unique_key) + self._requests_in_progress.discard(request.unique_key) + self._requests_on_platform.add(request.unique_key) + except Exception as exc: logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') return None @@ -477,27 +449,17 @@ async def reclaim_request( async with self._fetch_lock: try: # Update the request in the API. + self._requests_cache[request.unique_key] = request + self._requests_in_progress.discard(request.unique_key) + self._head_requests.append(request.unique_key) + processed_request = await self._update_request(request, forefront=forefront) processed_request.unique_key = request.unique_key - # If the request was previously handled, decrement our handled count since # we're putting it back for processing. 
if request.was_already_handled and not processed_request.was_already_handled: self._metadata.handled_request_count -= 1 - # Update the cache - cache_key = request.unique_key - self._cache_request( - cache_key, - processed_request, - hydrated_request=request, - ) - - # If we're adding to the forefront, we need to check for forefront requests - # in the next list_head call - if forefront: - self._should_check_for_forefront_requests = True - except Exception as exc: logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') return None @@ -511,62 +473,10 @@ async def is_empty(self) -> bool: Returns: True if the queue is empty, False otherwise. """ - # Check _list_head. # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition. async with self._fetch_lock: - head = await self._list_head(limit=1) - return len(head.items) == 0 and not self._queue_has_locked_requests - - async def _ensure_head_is_non_empty(self) -> None: - """Ensure that the queue head has requests if they are available in the queue.""" - # If queue head has adequate requests, skip fetching more - if len(self._queue_head) > 1 and not self._should_check_for_forefront_requests: - return - - # Fetch requests from the API and populate the queue head - await self._list_head() - - async def _get_or_hydrate_request(self, unique_key: str) -> Request | None: - """Get a request by unique key, either from cache or by fetching from API. - - Args: - unique_key: Unique key of the request to get. - - Returns: - The request if found and valid, otherwise None. 
- """ - # First check if the request is in our cache - cached_entry = self._requests_cache.get(unique_key) - - if cached_entry and cached_entry.hydrated: - # If we have the request hydrated in cache, return it - return cached_entry.hydrated - - # If not in cache or not hydrated, fetch the request - try: - # Fetch the request data - request = await self.get_request(unique_key) - - # If request is not found and return None - if not request: - return None - - # Update cache with hydrated request - cache_key = request.unique_key - self._cache_request( - cache_key, - ProcessedRequest( - unique_key=request.unique_key, - was_already_present=True, - was_already_handled=request.handled_at is not None, - ), - hydrated_request=request, - ) - except Exception as exc: - logger.debug(f'Error fetching request {unique_key}: {exc!s}') - return None - else: - return request + await self._ensure_head_is_non_empty() + return not self._head_requests and not self._queue_has_locked_requests and not self._requests_in_progress async def _update_request( self, @@ -593,104 +503,3 @@ async def _update_request( return ProcessedRequest.model_validate( {'uniqueKey': request.unique_key} | response, ) - - async def _list_head( - self, - *, - limit: int = 25, - ) -> RequestQueueHead: - """Retrieve requests from the beginning of the queue. - - Args: - limit: Maximum number of requests to retrieve. - - Returns: - A collection of requests from the beginning of the queue. 
- """ - # Return from cache if available and we're not checking for new forefront requests - if self._queue_head and not self._should_check_for_forefront_requests: - logger.debug(f'Using cached queue head with {len(self._queue_head)} requests') - # Create a list of requests from the cached queue head - items = [] - for unique_key in list(self._queue_head)[:limit]: - cached_request = self._requests_cache.get(unique_key) - if cached_request and cached_request.hydrated: - items.append(cached_request.hydrated) - - metadata = await self._get_metadata_estimate() - - return RequestQueueHead( - limit=limit, - had_multiple_clients=metadata.had_multiple_clients, - queue_modified_at=metadata.modified_at, - items=items, - lock_time=None, - queue_has_locked_requests=self._queue_has_locked_requests, - ) - leftover_buffer = list[str]() - if self._should_check_for_forefront_requests: - leftover_buffer = list(self._queue_head) - self._queue_head.clear() - self._should_check_for_forefront_requests = False - - # Otherwise fetch from API - response = await self._api_client.list_head( - limit=limit, - ) - - # Update the queue head cache - self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) - # Check if there is another client working with the RequestQueue - self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) - - for request_data in response.get('items', []): - request = Request.model_validate(request_data) - - # Skip requests without ID or unique key - if not request.unique_key: - logger.debug( - 'Skipping request from queue head, missing ID or unique key', - extra={ - 'unique_key': request.unique_key, - }, - ) - continue - - # Cache the request - self._cache_request( - request.unique_key, - ProcessedRequest( - unique_key=request.unique_key, - was_already_present=True, - was_already_handled=False, - ), - hydrated_request=request, - ) - self._queue_head.append(request.unique_key) - - for leftover_unique_key in leftover_buffer: - # 
After adding new requests to the forefront, any existing leftover locked request is kept in the end. - self._queue_head.append(leftover_unique_key) - return RequestQueueHead.model_validate(response) - - def _cache_request( - self, - cache_key: str, - processed_request: ProcessedRequest, - *, - hydrated_request: Request | None = None, - ) -> None: - """Cache a request for future use. - - Args: - cache_key: The key to use for caching the request. It should be request ID. - processed_request: The processed request information. - forefront: Whether the request was added to the forefront of the queue. - hydrated_request: The hydrated request object, if available. - """ - self._requests_cache[cache_key] = CachedRequest( - unique_key=processed_request.unique_key, - was_already_handled=processed_request.was_already_handled, - hydrated=hydrated_request, - lock_expires_at=None, - ) From 08df986ca542838e81022abc581c1eacbceb259c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 27 Aug 2025 14:06:37 +0200 Subject: [PATCH 03/26] Try to use list_head --- .../_apify/_request_queue_client.py | 167 +++++++++++++----- 1 file changed, 121 insertions(+), 46 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 236a49ec..41d7f56c 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -4,7 +4,7 @@ import re from base64 import b64encode from collections import deque -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone from hashlib import sha256 from logging import getLogger from typing import TYPE_CHECKING, Final @@ -61,21 +61,24 @@ class ApifyRequestQueueClient(RequestQueueClient): - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to be handled sooner. 
(Explanation below) - This client always consumes first own requests and only if no local requests exists it tries to get requests from - the global queue. + the global queue. ??? + - Requests are only added to the queue, never deleted. (Marking as handled is ok.) If the constraints are not met, the client might work in an unpredictable way. Optimization notes: - The client aggressively caches requests to avoid unnecessary API calls. - - The client adds requests to the global queue if they are handled. + - The client adds requests to the global queue if they are handled. (Potential optimization, but problematic, + probably not worth it) - The client adds unhandled requests to the global queue only if local cache size reaches some threshold or based on external callback. (To prevent double API call per request - adding request to the global queue and marking it as - handled. The client tries to do that in one step if possible.) + handled. The client tries to do that in one step if possible.) (Potential optimization, but problematic, + probably not worth it) - The client tracks own forefront (priority requests), that does not have to be in sync with the global forefront. """ - _DEFAULT_LOCK_TIME: Final[timedelta] = timedelta(minutes=3) - """The default lock time for requests in the queue.""" + _MAX_HEAD_ITEMS: Final[int] = 200 + """The maximum head items read count limited by API.""" _MAX_CACHED_REQUESTS: Final[int] = 1_000_000 """Maximum number of requests that can be cached.""" @@ -102,8 +105,11 @@ def __init__( self._head_requests: deque[str] = deque() """Ordered unique keys of requests that that represents queue head.""" - self._requests_on_platform: set[str] = set() - """Set of requests unique keys that are already present on the platform. To enable local deduplication.""" + self._requests_already_handled: set[str] = set() + """Local estimation of requests unique keys that are already present and handled on the platform. 
+ + (Could be persisted to optimize migrations) + To enhance local deduplication and track handled requests to reduce amount of API calls.""" self._requests_in_progress: set[str] = set() """Set of requests unique keys that are being processed locally. @@ -279,12 +285,27 @@ async def add_batch_of_requests( Response containing information about the added requests. """ # Do not try to add previously added requests to avoid pointless expensive calls to API + # Check if request is known to be already handled (it has to be present as well.) + # Check if request is known to be already present, but unhandled + # Push to the platform. Probably not there, or we are not aware of it + # (added by another producer or before migration). + new_requests: list[ProcessedRequest] = [] already_present_requests: list[ProcessedRequest] = [] for request in requests: - if self._requests_cache.get(request.unique_key): + if request.unique_key in self._requests_already_handled: + already_present_requests.append( + ProcessedRequest.model_validate( + { + 'uniqueKey': request.unique_key, + 'wasAlreadyPresent': True, + 'wasAlreadyHandled': True, + } + ) + ) + elif self._requests_cache.get(request.unique_key): already_present_requests.append( ProcessedRequest.model_validate( { @@ -294,7 +315,6 @@ async def add_batch_of_requests( } ) ) - else: new_requests.append( ProcessedRequest.model_validate( @@ -314,14 +334,38 @@ async def add_batch_of_requests( else: self._head_requests.appendleft(request.unique_key) + if new_requests: + # Prepare requests for API by converting to dictionaries. + requests_dict = [ + request.model_dump( + by_alias=True, + ) + for request in new_requests + ] - api_response = AddRequestsResponse.model_validate( - {'unprocessedRequests': [], 'processedRequests': already_present_requests+new_requests} - ) + # Send requests to API. 
+ api_response = AddRequestsResponse.model_validate( + await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) + ) + # Add the locally known already present processed requests based on the local cache. + api_response.processed_requests.extend(already_present_requests) + # Remove unprocessed requests from the cache + for unprocessed_request in api_response.unprocessed_requests: + self._requests_cache.pop(unprocessed_request.unique_key, None) + + else: + api_response = AddRequestsResponse.model_validate( + {'unprocessedRequests': [], 'processedRequests': already_present_requests} + ) # Update assumed total count for newly added requests. - self._metadata.total_request_count += len(new_requests) + new_request_count = 0 + for processed_request in api_response.processed_requests: + if not processed_request.was_already_present and not processed_request.was_already_handled: + new_request_count += 1 + self._metadata.total_request_count += new_request_count + return api_response @override @@ -361,7 +405,10 @@ async def fetch_next_request(self) -> Request | None: while self._head_requests: request_unique_key = self._head_requests.pop() - if request_unique_key not in self._requests_in_progress: + if ( + request_unique_key not in self._requests_in_progress and + request_unique_key not in self._requests_already_handled + ): self._requests_in_progress.add(request_unique_key) return await self.get_request(request_unique_key) # No request locally and the ones returned from the platform are already in progress. 
@@ -369,21 +416,39 @@ async def fetch_next_request(self) -> Request | None: async def _ensure_head_is_non_empty(self) -> None: """Ensure that the queue head has requests if they are available in the queue.""" - if not self._head_requests: - response = await self._api_client.list_and_lock_head(limit=25, - lock_secs=int(self._DEFAULT_LOCK_TIME.total_seconds())) - # Update the queue head cache - self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) - # Check if there is another client working with the RequestQueue - self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) - if modified_at:= response.get('queueModifiedAt'): - self._metadata.modified_at = max(self._metadata.modified_at, modified_at) - - for request_data in response.get('items', []): - request = Request.model_validate(request_data) + if len(self._head_requests)<=1: + await self._list_head() + + + async def _list_head(self) -> None: + desired_new_head_items = 100 + # The head will contain in progress requests as well, so we need to fetch more, to get some new ones. + requested_head_items = max(self._MAX_HEAD_ITEMS, desired_new_head_items + len(self._requests_in_progress)) + response = await self._api_client.list_head(limit=requested_head_items) + + # Update metadata + self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) + # Check if there is another client working with the RequestQueue + self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) + # Should warn once? 
This might be outside expected context if the other consumers consumes at the same time + + if modified_at := response.get('queueModifiedAt'): + self._metadata.modified_at = max(self._metadata.modified_at, modified_at) + + # Update the cached data + for request_data in response.get('items', []): + request = Request.model_validate(request_data) + + if request.unique_key in self._requests_in_progress: + # Ignore requests that are already in progress, we will not process them again. + continue + if request.was_already_handled: + # Do not cache fully handled requests, we do not need them. Just cache their unique_key. + self._requests_already_handled.add(request.unique_key) + else: self._requests_cache[request.unique_key] = request - self._head_requests.append(request.unique_key) - self._requests_on_platform.add(request.unique_key) + # Add new requests to the end of the head + self._head_requests.appendleft(request.unique_key) @override @@ -406,21 +471,24 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | if cached_request := self._requests_cache[request.unique_key]: cached_request.handled_at = request.handled_at - try: - # Update the request in the API - # Works as upsert - adds the request if it does not exist yet. (Local request that was handled before adding - # to the queue.) - processed_request = await self._update_request(request) - # Remove request from cache. It will no longer bee needed. - self._requests_cache.pop(request.unique_key) - self._requests_in_progress.discard(request.unique_key) - self._requests_on_platform.add(request.unique_key) - - except Exception as exc: - logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') - return None - else: - return processed_request + + async with self._fetch_lock: + try: + # Update the request in the API + # Works as upsert - adds the request if it does not exist yet. (Local request that was handled before + # adding to the queue.) 
+ processed_request = await self._update_request(request) + # Remove request from cache. It will most likely not be needed. + self._requests_cache.pop(request.unique_key) + self._requests_in_progress.discard(request.unique_key) + # Remember that we handled this request, to optimize local deduplication. + self._requests_already_handled.add(request.unique_key) + + except Exception as exc: + logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') + return None + else: + return processed_request @override async def reclaim_request( @@ -448,10 +516,17 @@ async def reclaim_request( # Reclaim with lock to prevent race conditions that could lead to double processing of the same request. async with self._fetch_lock: try: - # Update the request in the API. + # Make sure request is in the local cache. We might need it. self._requests_cache[request.unique_key] = request + + # No longer in progress self._requests_in_progress.discard(request.unique_key) - self._head_requests.append(request.unique_key) + # No longer handled + self._requests_already_handled.discard(request.unique_key) + + if forefront: + # Append to top of the local head estimation + self._head_requests.append(request.unique_key) processed_request = await self._update_request(request, forefront=forefront) processed_request.unique_key = request.unique_key From 6131fff02b0923c1fa17ab545801d98c6bb67169 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 27 Aug 2025 14:47:44 +0200 Subject: [PATCH 04/26] Locks not needed with in_progress --- .../_apify/_request_queue_client.py | 120 ++++++++---------- 1 file changed, 53 insertions(+), 67 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 41d7f56c..e876c12a 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -291,7 +291,7 @@ async def add_batch_of_requests( # 
(added by another producer or before migration). - new_requests: list[ProcessedRequest] = [] + new_requests: list[Request] = [] already_present_requests: list[ProcessedRequest] = [] for request in requests: @@ -316,15 +316,7 @@ async def add_batch_of_requests( ) ) else: - new_requests.append( - ProcessedRequest.model_validate( - { - 'uniqueKey': request.unique_key, - 'wasAlreadyPresent': False, - 'wasAlreadyHandled': request.was_already_handled, - } - ) - ) + new_requests.append(request) # Update local caches @@ -400,19 +392,18 @@ async def fetch_next_request(self) -> Request | None: Returns: The request or `None` if there are no more pending requests. """ - async with self._fetch_lock: - await self._ensure_head_is_non_empty() - - while self._head_requests: - request_unique_key = self._head_requests.pop() - if ( - request_unique_key not in self._requests_in_progress and - request_unique_key not in self._requests_already_handled - ): - self._requests_in_progress.add(request_unique_key) - return await self.get_request(request_unique_key) - # No request locally and the ones returned from the platform are already in progress. - return None + await self._ensure_head_is_non_empty() + + while self._head_requests: + request_unique_key = self._head_requests.pop() + if ( + request_unique_key not in self._requests_in_progress and + request_unique_key not in self._requests_already_handled + ): + self._requests_in_progress.add(request_unique_key) + return await self.get_request(request_unique_key) + # No request locally and the ones returned from the platform are already in progress. 
+ return None async def _ensure_head_is_non_empty(self) -> None: """Ensure that the queue head has requests if they are available in the queue.""" @@ -427,7 +418,6 @@ async def _list_head(self) -> None: response = await self._api_client.list_head(limit=requested_head_items) # Update metadata - self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) # Check if there is another client working with the RequestQueue self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) # Should warn once? This might be outside expected context if the other consumers consumes at the same time @@ -472,23 +462,22 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | if cached_request := self._requests_cache[request.unique_key]: cached_request.handled_at = request.handled_at - async with self._fetch_lock: - try: - # Update the request in the API - # Works as upsert - adds the request if it does not exist yet. (Local request that was handled before - # adding to the queue.) - processed_request = await self._update_request(request) - # Remove request from cache. It will most likely not be needed. - self._requests_cache.pop(request.unique_key) - self._requests_in_progress.discard(request.unique_key) - # Remember that we handled this request, to optimize local deduplication. - self._requests_already_handled.add(request.unique_key) - - except Exception as exc: - logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') - return None - else: - return processed_request + try: + # Update the request in the API + # Works as upsert - adds the request if it does not exist yet. (Local request that was handled before + # adding to the queue.) + processed_request = await self._update_request(request) + # Remember that we handled this request, to optimize local deduplication. + self._requests_already_handled.add(request.unique_key) + # Remove request from cache. It will most likely not be needed. 
+ self._requests_cache.pop(request.unique_key) + self._requests_in_progress.discard(request.unique_key) + + except Exception as exc: + logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') + return None + else: + return processed_request @override async def reclaim_request( @@ -513,33 +502,31 @@ async def reclaim_request( if request.was_already_handled: request.handled_at = None - # Reclaim with lock to prevent race conditions that could lead to double processing of the same request. - async with self._fetch_lock: - try: - # Make sure request is in the local cache. We might need it. - self._requests_cache[request.unique_key] = request + try: + # Make sure request is in the local cache. We might need it. + self._requests_cache[request.unique_key] = request - # No longer in progress - self._requests_in_progress.discard(request.unique_key) - # No longer handled - self._requests_already_handled.discard(request.unique_key) + # No longer in progress + self._requests_in_progress.discard(request.unique_key) + # No longer handled + self._requests_already_handled.discard(request.unique_key) - if forefront: - # Append to top of the local head estimation - self._head_requests.append(request.unique_key) + if forefront: + # Append to top of the local head estimation + self._head_requests.append(request.unique_key) - processed_request = await self._update_request(request, forefront=forefront) - processed_request.unique_key = request.unique_key - # If the request was previously handled, decrement our handled count since - # we're putting it back for processing. - if request.was_already_handled and not processed_request.was_already_handled: - self._metadata.handled_request_count -= 1 + processed_request = await self._update_request(request, forefront=forefront) + processed_request.unique_key = request.unique_key + # If the request was previously handled, decrement our handled count since + # we're putting it back for processing. 
+ if request.was_already_handled and not processed_request.was_already_handled: + self._metadata.handled_request_count -= 1 - except Exception as exc: - logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') - return None - else: - return processed_request + except Exception as exc: + logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') + return None + else: + return processed_request @override async def is_empty(self) -> bool: @@ -549,9 +536,8 @@ async def is_empty(self) -> bool: True if the queue is empty, False otherwise. """ # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition. - async with self._fetch_lock: - await self._ensure_head_is_non_empty() - return not self._head_requests and not self._queue_has_locked_requests and not self._requests_in_progress + await self._ensure_head_is_non_empty() + return not self._head_requests and not self._requests_in_progress async def _update_request( self, From 553663a42c7ecdccd2a3443a60ddb297688271f5 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 27 Aug 2025 16:08:01 +0200 Subject: [PATCH 05/26] Add alternate client --- .../_apify/_request_queue_client.py | 358 +--------- .../_apify/_request_queue_client_full.py | 672 ++++++++++++++++++ .../_apify/_request_queue_client_simple.py | 392 ++++++++++ .../storage_clients/_apify/_storage_client.py | 6 +- 4 files changed, 1070 insertions(+), 358 deletions(-) create mode 100644 src/apify/storage_clients/_apify/_request_queue_client_full.py create mode 100644 src/apify/storage_clients/_apify/_request_queue_client_simple.py diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index e876c12a..a2a347b3 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -1,27 +1,19 @@ from __future__ import annotations -import asyncio import 
re from base64 import b64encode -from collections import deque -from datetime import datetime, timezone from hashlib import sha256 from logging import getLogger from typing import TYPE_CHECKING, Final -from cachetools import LRUCache from typing_extensions import override from apify_client import ApifyClientAsync from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients._base import RequestQueueClient -from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata - -from apify import Request +from crawlee.storage_clients.models import RequestQueueMetadata if TYPE_CHECKING: - from collections.abc import Sequence - from apify_client.clients import RequestQueueClientAsync from apify import Configuration @@ -53,32 +45,7 @@ def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> class ApifyRequestQueueClient(RequestQueueClient): - """An Apify platform implementation of the request queue client with limited capability. - - This client is designed to use as little resources as possible, but has to be used in constrained context. - Constraints: - - Only one client is consuming the request queue at the time. - - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to be handled - sooner. (Explanation below) - - This client always consumes first own requests and only if no local requests exists it tries to get requests from - the global queue. ??? - - Requests are only added to the queue, never deleted. (Marking as handled is ok.) - - If the constraints are not met, the client might work in an unpredictable way. - - Optimization notes: - - The client aggressively caches requests to avoid unnecessary API calls. - - The client adds requests to the global queue if they are handled. 
(Potential optimization, but problematic, - probably not worth it) - - The client adds unhandled requests to the global queue only if local cache size reaches some threshold or based on - external callback. (To prevent double API call per request - adding request to the global queue and marking it as - handled. The client tries to do that in one step if possible.) (Potential optimization, but problematic, - probably not worth it) - - The client tracks own forefront (priority requests), that does not have to be in sync with the global forefront. - """ - - _MAX_HEAD_ITEMS: Final[int] = 200 - """The maximum head items read count limited by API.""" + """Base class for Apify platform implementations of the request queue client.""" _MAX_CACHED_REQUESTS: Final[int] = 1_000_000 """Maximum number of requests that can be cached.""" @@ -99,29 +66,6 @@ def __init__( self._metadata = metadata """Additional data related to the RequestQueue.""" - self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) - """A cache to store request objects. Request unique key is used as the cache key.""" - - self._head_requests: deque[str] = deque() - """Ordered unique keys of requests that that represents queue head.""" - - self._requests_already_handled: set[str] = set() - """Local estimation of requests unique keys that are already present and handled on the platform. - - (Could be persisted to optimize migrations) - To enhance local deduplication and track handled requests to reduce amount of API calls.""" - - self._requests_in_progress: set[str] = set() - """Set of requests unique keys that are being processed locally. - - - To avoid double processing of requests that have been processing for a long time, got unlocked on the platform - and got fetched again from platform. (Rare edge case.) 
- - To help decide if the RQ is finished or not.""" - - self._fetch_lock = asyncio.Lock() - """Fetch lock to minimize race conditions when communicating with API.""" - - @override async def get_metadata(self) -> RequestQueueMetadata: """Get metadata about the request queue. @@ -130,7 +74,6 @@ async def get_metadata(self) -> RequestQueueMetadata: Metadata from the API, merged with local estimation, because in some cases, the data from the API can be delayed. """ - # TODO response = await self._api_client.get() if response is None: raise ValueError('Failed to fetch request queue metadata from the API.') @@ -267,300 +210,3 @@ async def purge(self) -> None: async def drop(self) -> None: # Could be shared with the normal ApifyRequestQueueClient await self._api_client.delete() - - @override - async def add_batch_of_requests( - self, - requests: Sequence[Request], - *, - forefront: bool = False, - ) -> AddRequestsResponse: - """Add a batch of requests to the queue. - - Args: - requests: The requests to add. - forefront: Whether to add the requests to the beginning of the queue. - - Returns: - Response containing information about the added requests. - """ - # Do not try to add previously added requests to avoid pointless expensive calls to API - # Check if request is known to be already handled (it has to be present as well.) - # Check if request is known to be already present, but unhandled - # Push to the platform. Probably not there, or we are not aware of it - # (added by another producer or before migration). 
- - - new_requests: list[Request] = [] - already_present_requests: list[ProcessedRequest] = [] - - for request in requests: - if request.unique_key in self._requests_already_handled: - already_present_requests.append( - ProcessedRequest.model_validate( - { - 'uniqueKey': request.unique_key, - 'wasAlreadyPresent': True, - 'wasAlreadyHandled': True, - } - ) - ) - elif self._requests_cache.get(request.unique_key): - already_present_requests.append( - ProcessedRequest.model_validate( - { - 'uniqueKey': request.unique_key, - 'wasAlreadyPresent': True, - 'wasAlreadyHandled': request.was_already_handled, - } - ) - ) - else: - new_requests.append(request) - - - # Update local caches - self._requests_cache[request.unique_key] = request - if forefront: - self._head_requests.append(request.unique_key) - else: - self._head_requests.appendleft(request.unique_key) - - if new_requests: - # Prepare requests for API by converting to dictionaries. - requests_dict = [ - request.model_dump( - by_alias=True, - ) - for request in new_requests - ] - - # Send requests to API. - api_response = AddRequestsResponse.model_validate( - await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) - ) - # Add the locally known already present processed requests based on the local cache. - api_response.processed_requests.extend(already_present_requests) - # Remove unprocessed requests from the cache - for unprocessed_request in api_response.unprocessed_requests: - self._requests_cache.pop(unprocessed_request.unique_key, None) - - else: - api_response = AddRequestsResponse.model_validate( - {'unprocessedRequests': [], 'processedRequests': already_present_requests} - ) - - - # Update assumed total count for newly added requests. 
- new_request_count = 0 - for processed_request in api_response.processed_requests: - if not processed_request.was_already_present and not processed_request.was_already_handled: - new_request_count += 1 - self._metadata.total_request_count += new_request_count - - return api_response - - @override - async def get_request(self, unique_key: str) -> Request | None: - """Get a request by unique key. - - Args: - unique_key: Unique key of the request to get. - - Returns: - The request or None if not found. - """ - if unique_key in self._requests_cache: - return self._requests_cache[unique_key] - - response = await self._api_client.get_request(unique_key_to_request_id(unique_key)) - - if response is None: - return None - - return Request.model_validate(response) - - @override - async def fetch_next_request(self) -> Request | None: - """Return the next request in the queue to be processed. - - Once you successfully finish processing of the request, you need to call `mark_request_as_handled` - to mark the request as handled in the queue. If there was some error in processing the request, call - `reclaim_request` instead, so that the queue will give the request to some other consumer - in another call to the `fetch_next_request` method. - - Returns: - The request or `None` if there are no more pending requests. - """ - await self._ensure_head_is_non_empty() - - while self._head_requests: - request_unique_key = self._head_requests.pop() - if ( - request_unique_key not in self._requests_in_progress and - request_unique_key not in self._requests_already_handled - ): - self._requests_in_progress.add(request_unique_key) - return await self.get_request(request_unique_key) - # No request locally and the ones returned from the platform are already in progress. 
- return None - - async def _ensure_head_is_non_empty(self) -> None: - """Ensure that the queue head has requests if they are available in the queue.""" - if len(self._head_requests)<=1: - await self._list_head() - - - async def _list_head(self) -> None: - desired_new_head_items = 100 - # The head will contain in progress requests as well, so we need to fetch more, to get some new ones. - requested_head_items = max(self._MAX_HEAD_ITEMS, desired_new_head_items + len(self._requests_in_progress)) - response = await self._api_client.list_head(limit=requested_head_items) - - # Update metadata - # Check if there is another client working with the RequestQueue - self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) - # Should warn once? This might be outside expected context if the other consumers consumes at the same time - - if modified_at := response.get('queueModifiedAt'): - self._metadata.modified_at = max(self._metadata.modified_at, modified_at) - - # Update the cached data - for request_data in response.get('items', []): - request = Request.model_validate(request_data) - - if request.unique_key in self._requests_in_progress: - # Ignore requests that are already in progress, we will not process them again. - continue - if request.was_already_handled: - # Do not cache fully handled requests, we do not need them. Just cache their unique_key. - self._requests_already_handled.add(request.unique_key) - else: - self._requests_cache[request.unique_key] = request - # Add new requests to the end of the head - self._head_requests.appendleft(request.unique_key) - - - @override - async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: - """Mark a request as handled after successful processing. - - Handled requests will never again be returned by the `fetch_next_request` method. - - Args: - request: The request to mark as handled. - - Returns: - Information about the queue operation. 
`None` if the given request was not in progress. - """ - # Set the handled_at timestamp if not already set - - if request.handled_at is None: - request.handled_at = datetime.now(tz=timezone.utc) - self._metadata.handled_request_count += 1 - - if cached_request := self._requests_cache[request.unique_key]: - cached_request.handled_at = request.handled_at - - try: - # Update the request in the API - # Works as upsert - adds the request if it does not exist yet. (Local request that was handled before - # adding to the queue.) - processed_request = await self._update_request(request) - # Remember that we handled this request, to optimize local deduplication. - self._requests_already_handled.add(request.unique_key) - # Remove request from cache. It will most likely not be needed. - self._requests_cache.pop(request.unique_key) - self._requests_in_progress.discard(request.unique_key) - - except Exception as exc: - logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') - return None - else: - return processed_request - - @override - async def reclaim_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest | None: - """Reclaim a failed request back to the queue. - - The request will be returned for processing later again by another call to `fetch_next_request`. - - Args: - request: The request to return to the queue. - forefront: Whether to add the request to the head or the end of the queue. - - Returns: - Information about the queue operation. `None` if the given request was not in progress. - """ - # Check if the request was marked as handled and clear it. When reclaiming, - # we want to put the request back for processing. - if request.was_already_handled: - request.handled_at = None - - try: - # Make sure request is in the local cache. We might need it. 
- self._requests_cache[request.unique_key] = request - - # No longer in progress - self._requests_in_progress.discard(request.unique_key) - # No longer handled - self._requests_already_handled.discard(request.unique_key) - - if forefront: - # Append to top of the local head estimation - self._head_requests.append(request.unique_key) - - processed_request = await self._update_request(request, forefront=forefront) - processed_request.unique_key = request.unique_key - # If the request was previously handled, decrement our handled count since - # we're putting it back for processing. - if request.was_already_handled and not processed_request.was_already_handled: - self._metadata.handled_request_count -= 1 - - except Exception as exc: - logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') - return None - else: - return processed_request - - @override - async def is_empty(self) -> bool: - """Check if the queue is empty. - - Returns: - True if the queue is empty, False otherwise. - """ - # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition. - await self._ensure_head_is_non_empty() - return not self._head_requests and not self._requests_in_progress - - async def _update_request( - self, - request: Request, - *, - forefront: bool = False, - ) -> ProcessedRequest: - """Update a request in the queue. - - Args: - request: The updated request. - forefront: Whether to put the updated request in the beginning or the end of the queue. 
- - Returns: - The updated request - """ - request_dict = request.model_dump(by_alias=True) - request_dict['id'] = unique_key_to_request_id(request.unique_key) - response = await self._api_client.update_request( - request=request_dict, - forefront=forefront, - ) - - return ProcessedRequest.model_validate( - {'uniqueKey': request.unique_key} | response, - ) diff --git a/src/apify/storage_clients/_apify/_request_queue_client_full.py b/src/apify/storage_clients/_apify/_request_queue_client_full.py new file mode 100644 index 00000000..6aafb30a --- /dev/null +++ b/src/apify/storage_clients/_apify/_request_queue_client_full.py @@ -0,0 +1,672 @@ +from __future__ import annotations + +import asyncio +from collections import deque +from datetime import datetime, timedelta, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Final + +from cachetools import LRUCache +from typing_extensions import override + +from apify_client import ApifyClientAsync +from crawlee._utils.crypto import crypto_random_object_id +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata + +from . import ApifyRequestQueueClient +from ._models import CachedRequest, RequestQueueHead +from ._request_queue_client import unique_key_to_request_id +from apify import Request + +if TYPE_CHECKING: + from collections.abc import Sequence + + from apify_client.clients import RequestQueueClientAsync + + from apify import Configuration + +logger = getLogger(__name__) + + +class ApifyRequestQueueClientFull(ApifyRequestQueueClient): + """An Apify platform implementation of the request queue client. + + This implementation supports multiple producers and multiple consumers scenario. 
+ """ + + _DEFAULT_LOCK_TIME: Final[timedelta] = timedelta(minutes=3) + """The default lock time for requests in the queue.""" + + def __init__( + self, + *, + api_client: RequestQueueClientAsync, + metadata: RequestQueueMetadata, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. + """ + self._api_client = api_client + """The Apify request queue client for API operations.""" + + self._metadata = metadata + """Additional data related to the RequestQueue.""" + + self._queue_head = deque[str]() + """A deque to store request unique keys in the queue head.""" + + self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + """A cache to store request objects. Request unique key is used as the cache key.""" + + self._queue_has_locked_requests: bool | None = None + """Whether the queue has requests locked by another client.""" + + self._should_check_for_forefront_requests = False + """Whether to check for forefront requests in the next list_head call.""" + + self._fetch_lock = asyncio.Lock() + """Fetch lock to minimize race conditions when communicating with API.""" + + async def _get_metadata_estimate(self) -> RequestQueueMetadata: + """Try to get cached metadata first. If multiple clients, fuse with global metadata. + + This method is used internally to avoid unnecessary API call unless needed (multiple clients). + Local estimation of metadata is without delay, unlike metadata from API. In situation where there is only one + client, it is the better choice. + """ + if self._metadata.had_multiple_clients: + return await self.get_metadata() + # Get local estimation (will not include changes done bo another client) + return self._metadata + + @override + async def get_metadata(self) -> RequestQueueMetadata: + """Get metadata about the request queue. 
+ + Returns: + Metadata from the API, merged with local estimation, because in some cases, the data from the API can + be delayed. + """ + response = await self._api_client.get() + if response is None: + raise ValueError('Failed to fetch request queue metadata from the API.') + # Enhance API response by local estimations (API can be delayed few seconds, while local estimation not.) + return RequestQueueMetadata( + id=response['id'], + name=response['name'], + total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count), + handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count), + pending_request_count=response['pendingRequestCount'], + created_at=min(response['createdAt'], self._metadata.created_at), + modified_at=max(response['modifiedAt'], self._metadata.modified_at), + accessed_at=max(response['accessedAt'], self._metadata.accessed_at), + had_multiple_clients=response['hadMultipleClients'] or self._metadata.had_multiple_clients, + ) + + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + configuration: Configuration, + ) -> ApifyRequestQueueClient: + """Open an Apify request queue client. + + This method creates and initializes a new instance of the Apify request queue client. It handles + authentication, storage lookup/creation, and metadata retrieval, and sets up internal caching and queue + management structures. + + Args: + id: The ID of an existing request queue to open. If provided, the client will connect to this specific + storage. Cannot be used together with `name`. + name: The name of a request queue to get or create. If a storage with this name exists, it will be opened; + otherwise, a new one will be created. Cannot be used together with `id`. + configuration: The configuration object containing API credentials and settings. Must include a valid + `token` and `api_base_url`. 
May also contain a `default_request_queue_id` for fallback when neither + `id` nor `name` is provided. + + Returns: + An instance for the opened or created storage client. + + Raises: + ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name` + are provided, or if neither `id` nor `name` is provided and no default storage ID is available + in the configuration. + """ + token = configuration.token + if not token: + raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).') + + api_url = configuration.api_base_url + if not api_url: + raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).') + + api_public_base_url = configuration.api_public_base_url + if not api_public_base_url: + raise ValueError( + 'Apify storage client requires a valid API public base URL in Configuration ' + f'(api_public_base_url={api_public_base_url}).' + ) + + # Create Apify client with the provided token and API URL. + apify_client_async = ApifyClientAsync( + token=token, + api_url=api_url, + max_retries=8, + min_delay_between_retries_millis=500, + timeout_secs=360, + ) + apify_rqs_client = apify_client_async.request_queues() + + match (id, name): + case (None, None): + # If both id and name are None, try to get the default storage ID from environment variables. + # The default storage ID environment variable is set by the Apify platform. It also contains + # a new storage ID after Actor's reboot or migration. + id = configuration.default_request_queue_id + case (None, name): + # If only name is provided, get or create the storage by name. + id = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(name=name), + ).id + case (_, None): + # If only id is provided, use it. + pass + case (_, _): + # If both id and name are provided, raise an error. 
+ raise ValueError('Only one of "id" or "name" can be specified, not both.') + if id is None: + raise RuntimeError('Unreachable code') + + # Use suitable client_key to make `hadMultipleClients` response of Apify API useful. + # It should persist across migrated or resurrected Actor runs on the Apify platform. + _api_max_client_key_length = 32 + client_key = (configuration.actor_run_id or crypto_random_object_id(length=_api_max_client_key_length))[ + :_api_max_client_key_length + ] + + apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key) + + # Fetch its metadata. + metadata = await apify_rq_client.get() + + # If metadata is None, it means the storage does not exist, so we create it. + if metadata is None: + id = RequestQueueMetadata.model_validate( + await apify_rqs_client.get_or_create(), + ).id + apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key) + + # Verify that the storage exists by fetching its metadata again. + metadata = await apify_rq_client.get() + if metadata is None: + raise ValueError(f'Opening request queue with id={id} and name={name} failed.') + + metadata_model = RequestQueueMetadata.model_validate(metadata) + + return cls( + api_client=apify_rq_client, + metadata=metadata_model, + ) + + @override + async def purge(self) -> None: + raise NotImplementedError( + 'Purging the request queue is not supported in the Apify platform. ' + 'Use the `drop` method to delete the request queue instead.' + ) + + @override + async def drop(self) -> None: + await self._api_client.delete() + + @override + async def add_batch_of_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> AddRequestsResponse: + """Add a batch of requests to the queue. + + Args: + requests: The requests to add. + forefront: Whether to add the requests to the beginning of the queue. + + Returns: + Response containing information about the added requests. 
+ """ + # Do not try to add previously added requests to avoid pointless expensive calls to API + + new_requests: list[Request] = [] + already_present_requests: list[ProcessedRequest] = [] + + for request in requests: + if self._requests_cache.get(request.unique_key): + # We are not sure if it was already handled at this point, and it is not worth calling API for it. + # It could have been handled by another client in the meantime, so cached information about + # `request.was_already_handled` is not reliable. + already_present_requests.append( + ProcessedRequest.model_validate( + { + 'uniqueKey': request.unique_key, + 'wasAlreadyPresent': True, + 'wasAlreadyHandled': request.was_already_handled, + } + ) + ) + + else: + # Add new request to the cache. + processed_request = ProcessedRequest.model_validate( + { + 'uniqueKey': request.unique_key, + 'wasAlreadyPresent': True, + 'wasAlreadyHandled': request.was_already_handled, + } + ) + self._cache_request( + request.unique_key, + processed_request, + ) + new_requests.append(request) + + if new_requests: + # Prepare requests for API by converting to dictionaries. + requests_dict = [ + request.model_dump( + by_alias=True, + exclude={'id'}, # Exclude ID fields from requests since the API doesn't accept them. + ) + for request in new_requests + ] + + # Send requests to API. + api_response = AddRequestsResponse.model_validate( + await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) + ) + + # Add the locally known already present processed requests based on the local cache. 
+ api_response.processed_requests.extend(already_present_requests) + + # Remove unprocessed requests from the cache + for unprocessed_request in api_response.unprocessed_requests: + self._requests_cache.pop(unprocessed_request.unique_key, None) + + else: + api_response = AddRequestsResponse.model_validate( + {'unprocessedRequests': [], 'processedRequests': already_present_requests} + ) + + logger.debug( + f'Tried to add new requests: {len(new_requests)}, ' + f'succeeded to add new requests: {len(api_response.processed_requests) - len(already_present_requests)}, ' + f'skipped already present requests: {len(already_present_requests)}' + ) + + # Update assumed total count for newly added requests. + new_request_count = 0 + for processed_request in api_response.processed_requests: + if not processed_request.was_already_present and not processed_request.was_already_handled: + new_request_count += 1 + + self._metadata.total_request_count += new_request_count + + return api_response + + @override + async def get_request(self, unique_key: str) -> Request | None: + """Get a request by unique key. + + Args: + unique_key: Unique key of the request to get. + + Returns: + The request or None if not found. + """ + response = await self._api_client.get_request(unique_key_to_request_id(unique_key)) + + if response is None: + return None + + return Request.model_validate(response) + + @override + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. + + Once you successfully finish processing of the request, you need to call `mark_request_as_handled` + to mark the request as handled in the queue. If there was some error in processing the request, call + `reclaim_request` instead, so that the queue will give the request to some other consumer + in another call to the `fetch_next_request` method. + + Returns: + The request or `None` if there are no more pending requests. 
+ """ + # Ensure the queue head has requests if available. Fetching the head with lock to prevent race conditions. + async with self._fetch_lock: + await self._ensure_head_is_non_empty() + + # If queue head is empty after ensuring, there are no requests + if not self._queue_head: + return None + + # Get the next request ID from the queue head + next_unique_key = self._queue_head.popleft() + + request = await self._get_or_hydrate_request(next_unique_key) + + # Handle potential inconsistency where request might not be in the main table yet + if request is None: + logger.debug( + 'Cannot find a request from the beginning of queue, will be retried later', + extra={'nextRequestUniqueKey': next_unique_key}, + ) + return None + + # If the request was already handled, skip it + if request.handled_at is not None: + logger.debug( + 'Request fetched from the beginning of queue was already handled', + extra={'nextRequestUniqueKey': next_unique_key}, + ) + return None + + # Use get request to ensure we have the full request object. + request = await self.get_request(request.unique_key) + if request is None: + logger.debug( + 'Request fetched from the beginning of queue was not found in the RQ', + extra={'nextRequestUniqueKey': next_unique_key}, + ) + return None + + return request + + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as handled after successful processing. + + Handled requests will never again be returned by the `fetch_next_request` method. + + Args: + request: The request to mark as handled. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. 
+ """ + # Set the handled_at timestamp if not already set + if request.handled_at is None: + request.handled_at = datetime.now(tz=timezone.utc) + + if cached_request := self._requests_cache[request.unique_key]: + cached_request.was_already_handled = request.was_already_handled + try: + # Update the request in the API + processed_request = await self._update_request(request) + processed_request.unique_key = request.unique_key + + # Update assumed handled count if this wasn't already handled + if not processed_request.was_already_handled: + self._metadata.handled_request_count += 1 + + # Update the cache with the handled request + cache_key = request.unique_key + self._cache_request( + cache_key, + processed_request, + hydrated_request=request, + ) + except Exception as exc: + logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') + return None + else: + return processed_request + + @override + async def reclaim_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. + + The request will be returned for processing later again by another call to `fetch_next_request`. + + Args: + request: The request to return to the queue. + forefront: Whether to add the request to the head or the end of the queue. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. + """ + # Check if the request was marked as handled and clear it. When reclaiming, + # we want to put the request back for processing. + if request.was_already_handled: + request.handled_at = None + + # Reclaim with lock to prevent race conditions that could lead to double processing of the same request. + async with self._fetch_lock: + try: + # Update the request in the API. 
+ processed_request = await self._update_request(request, forefront=forefront) + processed_request.unique_key = request.unique_key + + # If the request was previously handled, decrement our handled count since + # we're putting it back for processing. + if request.was_already_handled and not processed_request.was_already_handled: + self._metadata.handled_request_count -= 1 + + # Update the cache + cache_key = request.unique_key + self._cache_request( + cache_key, + processed_request, + hydrated_request=request, + ) + + # If we're adding to the forefront, we need to check for forefront requests + # in the next list_head call + if forefront: + self._should_check_for_forefront_requests = True + + except Exception as exc: + logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') + return None + else: + return processed_request + + @override + async def is_empty(self) -> bool: + """Check if the queue is empty. + + Returns: + True if the queue is empty, False otherwise. + """ + # Check _list_head. + # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition. + async with self._fetch_lock: + head = await self._list_head(limit=1) + return len(head.items) == 0 and not self._queue_has_locked_requests + + async def _ensure_head_is_non_empty(self) -> None: + """Ensure that the queue head has requests if they are available in the queue.""" + # If queue head has adequate requests, skip fetching more + if len(self._queue_head) > 1 and not self._should_check_for_forefront_requests: + return + + # Fetch requests from the API and populate the queue head + await self._list_head() + + async def _get_or_hydrate_request(self, unique_key: str) -> Request | None: + """Get a request by unique key, either from cache or by fetching from API. + + Args: + unique_key: Unique key of the request to get. + + Returns: + The request if found and valid, otherwise None. 
+ """ + # First check if the request is in our cache + cached_entry = self._requests_cache.get(unique_key) + + if cached_entry and cached_entry.hydrated: + # If we have the request hydrated in cache, return it + return cached_entry.hydrated + + # If not in cache or not hydrated, fetch the request + try: + # Fetch the request data + request = await self.get_request(unique_key) + + # If request is not found and return None + if not request: + return None + + # Update cache with hydrated request + cache_key = request.unique_key + self._cache_request( + cache_key, + ProcessedRequest( + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=request.handled_at is not None, + ), + hydrated_request=request, + ) + except Exception as exc: + logger.debug(f'Error fetching request {unique_key}: {exc!s}') + return None + else: + return request + + async def _update_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + """Update a request in the queue. + + Args: + request: The updated request. + forefront: Whether to put the updated request in the beginning or the end of the queue. + + Returns: + The updated request + """ + request_dict = request.model_dump(by_alias=True) + request_dict['id'] = unique_key_to_request_id(request.unique_key) + response = await self._api_client.update_request( + request=request_dict, + forefront=forefront, + ) + + return ProcessedRequest.model_validate( + {'uniqueKey': request.unique_key} | response, + ) + + async def _list_head( + self, + *, + limit: int = 25, + ) -> RequestQueueHead: + """Retrieve requests from the beginning of the queue. + + Args: + limit: Maximum number of requests to retrieve. + + Returns: + A collection of requests from the beginning of the queue. 
+ """ + # Return from cache if available and we're not checking for new forefront requests + if self._queue_head and not self._should_check_for_forefront_requests: + logger.debug(f'Using cached queue head with {len(self._queue_head)} requests') + # Create a list of requests from the cached queue head + items = [] + for unique_key in list(self._queue_head)[:limit]: + cached_request = self._requests_cache.get(unique_key) + if cached_request and cached_request.hydrated: + items.append(cached_request.hydrated) + + metadata = await self._get_metadata_estimate() + + return RequestQueueHead( + limit=limit, + had_multiple_clients=metadata.had_multiple_clients, + queue_modified_at=metadata.modified_at, + items=items, + lock_time=None, + queue_has_locked_requests=self._queue_has_locked_requests, + ) + leftover_buffer = list[str]() + if self._should_check_for_forefront_requests: + leftover_buffer = list(self._queue_head) + self._queue_head.clear() + self._should_check_for_forefront_requests = False + + # Otherwise fetch from API + response = await self._api_client.list_and_lock_head( + lock_secs=int(self._DEFAULT_LOCK_TIME.total_seconds()), + limit=limit, + ) + + # Update the queue head cache + self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) + # Check if there is another client working with the RequestQueue + self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) + + for request_data in response.get('items', []): + request = Request.model_validate(request_data) + + # Skip requests without ID or unique key + if not request.unique_key: + logger.debug( + 'Skipping request from queue head, missing unique key', + extra={ + 'unique_key': request.unique_key, + }, + ) + continue + + # Cache the request + self._cache_request( + request.unique_key, + ProcessedRequest( + unique_key=request.unique_key, + was_already_present=True, + was_already_handled=False, + ), + hydrated_request=request, + ) + 
self._queue_head.append(request.unique_key) + + for leftover_unique_key in leftover_buffer: + # After adding new requests to the forefront, any existing leftover locked request is kept in the end. + self._queue_head.append(leftover_unique_key) + return RequestQueueHead.model_validate(response) + + def _cache_request( + self, + cache_key: str, + processed_request: ProcessedRequest, + *, + hydrated_request: Request | None = None, + ) -> None: + """Cache a request for future use. + + Args: + cache_key: The key to use for caching the request. It should be request ID. + processed_request: The processed request information. + forefront: Whether the request was added to the forefront of the queue. + hydrated_request: The hydrated request object, if available. + """ + self._requests_cache[cache_key] = CachedRequest( + unique_key=processed_request.unique_key, + was_already_handled=processed_request.was_already_handled, + hydrated=hydrated_request, + lock_expires_at=None, + ) diff --git a/src/apify/storage_clients/_apify/_request_queue_client_simple.py b/src/apify/storage_clients/_apify/_request_queue_client_simple.py new file mode 100644 index 00000000..9a39a9c4 --- /dev/null +++ b/src/apify/storage_clients/_apify/_request_queue_client_simple.py @@ -0,0 +1,392 @@ +from __future__ import annotations + +from collections import deque +from datetime import datetime, timezone +from logging import getLogger +from typing import TYPE_CHECKING, Final + +from cachetools import LRUCache +from typing_extensions import override + +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata + +from apify import Request +from apify.storage_clients._apify import ApifyRequestQueueClient +from apify.storage_clients._apify._request_queue_client import unique_key_to_request_id + +if TYPE_CHECKING: + from collections.abc import Sequence + + from apify_client.clients import RequestQueueClientAsync + + +logger = getLogger(__name__) + + +class 
ApifyRequestQueueClientSimple(ApifyRequestQueueClient): + """An Apify platform implementation of the request queue client with limited capability. + + This client is designed to use as little resources as possible, but has to be used in constrained context. + Constraints: + - Only one client is consuming the request queue at the time. + - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to be handled + so quickly as this client does not aggressively fetch the forefront and relies on local head estimation. + - Requests are only added to the queue, never deleted. (Marking as handled is ok.) + + If the constraints are not met, the client might work in an unpredictable way. + """ + + _MAX_HEAD_ITEMS: Final[int] = 1000 + """The maximum head items read count limited by API.""" + + def __init__( + self, + *, + api_client: RequestQueueClientAsync, + metadata: RequestQueueMetadata, + ) -> None: + """Initialize a new instance. + + Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. + """ + self._api_client = api_client + """The Apify request queue client for API operations.""" + + self._metadata = metadata + """Additional data related to the RequestQueue.""" + + self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + """A cache to store request objects. Request unique key is used as the cache key.""" + + self._head_requests: deque[str] = deque() + """Ordered unique keys of requests that represent queue head.""" + + self._requests_already_handled: set[str] = set() + """Local estimation of requests unique keys that are already present and handled on the platform. + + - To enhance local deduplication. + - To reduce the _requests_cache size. Already handled requests are most likely not going to be needed again, + so no need to cache more than their unique_key. 
+ """ + + self._requests_in_progress: set[str] = set() + """Set of requests unique keys that are being processed locally. + + - To help decide if the RQ is finished or not. This is the only consumer, so it can be tracked locally. + """ + + self._initialized_caches = False + """This flag indicates whether the local caches were already initialized. + + Initialization is done lazily only if deduplication is needed (When calling add_batch_of_requests). + """ + + @override + async def add_batch_of_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> AddRequestsResponse: + """Add a batch of requests to the queue. + + Args: + requests: The requests to add. + forefront: Whether to add the requests to the beginning of the queue. + + Returns: + Response containing information about the added requests. + """ + if not self._initialized_caches: + # One time process to initialize local caches for existing request queues. + await self._init_caches() + self._initialized_caches = True + + new_requests: list[Request] = [] + already_present_requests: list[ProcessedRequest] = [] + + for request in requests: + # Check if request is known to be already handled (it has to be present as well.) + if request.unique_key in self._requests_already_handled: + already_present_requests.append( + ProcessedRequest.model_validate( + { + 'uniqueKey': request.unique_key, + 'wasAlreadyPresent': True, + 'wasAlreadyHandled': True, + } + ) + ) + # Check if request is known to be already present, but unhandled + elif self._requests_cache.get(request.unique_key): + already_present_requests.append( + ProcessedRequest.model_validate( + { + 'uniqueKey': request.unique_key, + 'wasAlreadyPresent': True, + 'wasAlreadyHandled': request.was_already_handled, + } + ) + ) + else: + # Push the request to the platform. 
Probably not there, or we are not aware of it + new_requests.append(request) + + # Update local caches + self._requests_cache[request.unique_key] = request + if forefront: + self._head_requests.append(request.unique_key) + else: + self._head_requests.appendleft(request.unique_key) + + if new_requests: + # Prepare requests for API by converting to dictionaries. + requests_dict = [ + request.model_dump( + by_alias=True, + ) + for request in new_requests + ] + + # Send requests to API. + api_response = AddRequestsResponse.model_validate( + await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront) + ) + # Add the locally known already present processed requests based on the local cache. + api_response.processed_requests.extend(already_present_requests) + # Remove unprocessed requests from the cache + for unprocessed_request in api_response.unprocessed_requests: + self._requests_cache.pop(unprocessed_request.unique_key, None) + + else: + api_response = AddRequestsResponse.model_validate( + {'unprocessedRequests': [], 'processedRequests': already_present_requests} + ) + + # Update assumed total count for newly added requests. + new_request_count = 0 + for processed_request in api_response.processed_requests: + if not processed_request.was_already_present and not processed_request.was_already_handled: + new_request_count += 1 + self._metadata.total_request_count += new_request_count + + return api_response + + @override + async def get_request(self, unique_key: str) -> Request | None: + """Get a request by unique key. + + Args: + unique_key: Unique key of the request to get. + + Returns: + The request or None if not found. 
+ """ + if unique_key in self._requests_cache: + return self._requests_cache[unique_key] + + response = await self._api_client.get_request(unique_key_to_request_id(unique_key)) + + if response is None: + return None + + return Request.model_validate(response) + + @override + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. + + Once you successfully finish processing of the request, you need to call `mark_request_as_handled` + to mark the request as handled in the queue. If there was some error in processing the request, call + `reclaim_request` instead, so that the queue will give the request to some other consumer + in another call to the `fetch_next_request` method. + + Returns: + The request or `None` if there are no more pending requests. + """ + await self._ensure_head_is_non_empty() + + while self._head_requests: + request_unique_key = self._head_requests.pop() + if ( + request_unique_key not in self._requests_in_progress + and request_unique_key not in self._requests_already_handled + ): + self._requests_in_progress.add(request_unique_key) + return await self.get_request(request_unique_key) + # No request locally and the ones returned from the platform are already in progress. + return None + + async def _ensure_head_is_non_empty(self) -> None: + """Ensure that the queue head has requests if they are available in the queue.""" + if len(self._head_requests) <= 1: + await self._list_head() + + async def _list_head(self) -> None: + desired_new_head_items = 200 + # The head will contain in progress requests as well, so we need to fetch more, to get some new ones. 
+        requested_head_items = max(self._MAX_HEAD_ITEMS, desired_new_head_items + len(self._requests_in_progress))
+        response = await self._api_client.list_head(limit=requested_head_items)
+
+        # Update metadata
+        # Check if there is another client working with the RequestQueue
+        self._metadata.had_multiple_clients = response.get('hadMultipleClients', False)
+        # Should warn once? This might be outside the expected context if other consumers consume at the same time
+
+        if modified_at := response.get('queueModifiedAt'):
+            self._metadata.modified_at = max(self._metadata.modified_at, modified_at)
+
+        # Update the cached data
+        for request_data in response.get('items', []):
+            request = Request.model_validate(request_data)
+
+            if request.unique_key in self._requests_in_progress:
+                # Ignore requests that are already in progress, we will not process them again.
+                continue
+            if request.was_already_handled:
+                # Do not cache fully handled requests, we do not need them. Just cache their unique_key.
+                self._requests_already_handled.add(request.unique_key)
+            else:
+                self._requests_cache[request.unique_key] = request
+                # Add new requests to the end of the head
+                self._head_requests.appendleft(request.unique_key)
+
+    @override
+    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
+        """Mark a request as handled after successful processing.
+
+        Handled requests will never again be returned by the `fetch_next_request` method.
+
+        Args:
+            request: The request to mark as handled.
+
+        Returns:
+            Information about the queue operation. `None` if the given request was not in progress.
+ """ + # Set the handled_at timestamp if not already set + + if request.handled_at is None: + request.handled_at = datetime.now(tz=timezone.utc) + self._metadata.handled_request_count += 1 + + if cached_request := self._requests_cache[request.unique_key]: + cached_request.handled_at = request.handled_at + + try: + # Update the request in the API + # Works as upsert - adds the request if it does not exist yet. (Local request that was handled before + # adding to the queue.) + processed_request = await self._update_request(request) + # Remember that we handled this request, to optimize local deduplication. + self._requests_already_handled.add(request.unique_key) + # Remove request from cache. It will most likely not be needed. + self._requests_cache.pop(request.unique_key) + self._requests_in_progress.discard(request.unique_key) + + except Exception as exc: + logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}') + return None + else: + return processed_request + + @override + async def reclaim_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. + + The request will be returned for processing later again by another call to `fetch_next_request`. + + Args: + request: The request to return to the queue. + forefront: Whether to add the request to the head or the end of the queue. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. + """ + # Check if the request was marked as handled and clear it. When reclaiming, + # we want to put the request back for processing. + if request.was_already_handled: + request.handled_at = None + + try: + # Make sure request is in the local cache. We might need it. 
+ self._requests_cache[request.unique_key] = request + + # No longer in progress + self._requests_in_progress.discard(request.unique_key) + # No longer handled + self._requests_already_handled.discard(request.unique_key) + + if forefront: + # Append to top of the local head estimation + self._head_requests.append(request.unique_key) + + processed_request = await self._update_request(request, forefront=forefront) + processed_request.unique_key = request.unique_key + # If the request was previously handled, decrement our handled count since + # we're putting it back for processing. + if request.was_already_handled and not processed_request.was_already_handled: + self._metadata.handled_request_count -= 1 + + except Exception as exc: + logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') + return None + else: + return processed_request + + @override + async def is_empty(self) -> bool: + """Check if the queue is empty. + + Returns: + True if the queue is empty, False otherwise. + """ + # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition. + await self._ensure_head_is_non_empty() + return not self._head_requests and not self._requests_in_progress + + async def _update_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + """Update a request in the queue. + + Args: + request: The updated request. + forefront: Whether to put the updated request in the beginning or the end of the queue. + + Returns: + The updated request + """ + request_dict = request.model_dump(by_alias=True) + request_dict['id'] = unique_key_to_request_id(request.unique_key) + response = await self._api_client.update_request( + request=request_dict, + forefront=forefront, + ) + + return ProcessedRequest.model_validate( + {'uniqueKey': request.unique_key} | response, + ) + + async def _init_caches(self) -> None: + """Initialize the local caches by getting requests from the existing queue. 
+ + This is mainly done to improve local deduplication capability. List request can return up to 10k requests, but + their order is implementation detail and does not respect head order or insertion order. + """ + response = await self._api_client.list_requests(limit=10_000) + for request_data in response.get('items', []): + request = Request.model_validate(request_data) + if request.was_already_handled: + # Cache just unique_key for deduplication + self._requests_already_handled.add(request.unique_key) + else: + # Cache full request + self._requests_cache[request.unique_key] = request diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 689e2c77..d43c637f 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -8,12 +8,14 @@ from ._dataset_client import ApifyDatasetClient from ._key_value_store_client import ApifyKeyValueStoreClient -from ._request_queue_client import ApifyRequestQueueClient +from ._request_queue_client_simple import ApifyRequestQueueClientSimple from apify._utils import docs_group if TYPE_CHECKING: from crawlee.configuration import Configuration + from ._request_queue_client import ApifyRequestQueueClient + @docs_group('Storage clients') class ApifyStorageClient(StorageClient): @@ -72,7 +74,7 @@ async def create_rq_client( configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): - return await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration) + return await ApifyRequestQueueClientSimple.open(id=id, name=name, configuration=configuration) raise TypeError( f'Expected "configuration" to be an instance of "apify.Configuration", ' From eadab26144960782911f8ec3dfed5e81526da222 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 28 Aug 2025 13:03:29 +0200 Subject: [PATCH 06/26] WIP --- pyproject.toml | 2 +- 
.../_apify/_request_queue_client_simple.py | 7 +++-- .../test_crawlers_with_storages.py | 29 +++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fdd13e7d..94e8e19a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "apify" -version = "2.7.4" +version = "2.8.1" description = "Apify SDK for Python" authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }] license = { file = "LICENSE" } diff --git a/src/apify/storage_clients/_apify/_request_queue_client_simple.py b/src/apify/storage_clients/_apify/_request_queue_client_simple.py index 9a39a9c4..e994dc25 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client_simple.py +++ b/src/apify/storage_clients/_apify/_request_queue_client_simple.py @@ -248,8 +248,9 @@ async def _list_head(self) -> None: self._requests_already_handled.add(request.unique_key) else: self._requests_cache[request.unique_key] = request - # Add new requests to the end of the head - self._head_requests.appendleft(request.unique_key) + # Add new requests to the end of the head, unless already present in head + if request.unique_key not in self._head_requests: + self._head_requests.appendleft(request.unique_key) @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: @@ -269,7 +270,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | request.handled_at = datetime.now(tz=timezone.utc) self._metadata.handled_request_count += 1 - if cached_request := self._requests_cache[request.unique_key]: + if cached_request := self._requests_cache.get(request.unique_key): cached_request.handled_at = request.handled_at try: diff --git a/tests/integration/test_crawlers_with_storages.py b/tests/integration/test_crawlers_with_storages.py index a2ba1e4d..7508213a 100644 --- a/tests/integration/test_crawlers_with_storages.py +++ 
b/tests/integration/test_crawlers_with_storages.py
@@ -41,6 +41,35 @@ async def default_handler(context: ParselCrawlingContext) -> None:
     assert run_result.status == 'SUCCEEDED'
 
 
+async def test_actor_on_platform_max_crawl_depth(
+) -> None:
+    """Test that the actor respects max_crawl_depth."""
+
+    """The crawler entry point."""
+    import re
+
+    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+    from apify import Actor
+
+    async with Actor:
+        rq= await Actor.open_request_queue(force_cloud=True)
+        crawler = ParselCrawler(max_crawl_depth=2, request_manager=rq)
+        finished = []
+        enqueue_pattern = re.compile(r'http://localhost:8080/2+$')
+
+        @crawler.router.default_handler
+        async def default_handler(context: ParselCrawlingContext) -> None:
+            """Default request handler."""
+            context.log.info(f'Processing {context.request.url} ...')
+            await context.enqueue_links(include=[enqueue_pattern])
+            finished.append(context.request.url)
+
+        await crawler.run(['http://localhost:8080/'])
+        assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
+
+
 async def test_actor_on_platform_max_requests_per_crawl(
     make_actor: MakeActorFunction,
     run_actor: RunActorFunction,

From 249f8f5a54edb6c5f912f908857f62ba3a7f9e05 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Thu, 28 Aug 2025 14:54:33 +0200
Subject: [PATCH 07/26] Find the caching problem.
Migrate most actor based tests to normal force cloud rq tests (for future parametrization of the Apify clients) --- .../_apify/_request_queue_client_simple.py | 9 +- .../test_crawlers_with_storages.py | 29 - tests/integration/test_request_queue.py | 1634 +++++++---------- uv.lock | 2 +- 4 files changed, 721 insertions(+), 953 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client_simple.py b/src/apify/storage_clients/_apify/_request_queue_client_simple.py index e994dc25..28c01b1c 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client_simple.py +++ b/src/apify/storage_clients/_apify/_request_queue_client_simple.py @@ -32,6 +32,7 @@ class ApifyRequestQueueClientSimple(ApifyRequestQueueClient): - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to be handled so quickly as this client does not aggressively fetch the forefront and relies on local head estimation. - Requests are only added to the queue, never deleted. (Marking as handled is ok.) + - Other producers can add new requests, but not modify existing ones (otherwise caching can miss the updates) If the constraints are not met, the client might work in an unpredictable way. """ @@ -247,7 +248,13 @@ async def _list_head(self) -> None: # Do not cache fully handled requests, we do not need them. Just cache their unique_key. self._requests_already_handled.add(request.unique_key) else: - self._requests_cache[request.unique_key] = request + # Only fetch the request if we do not know it yet. 
+ if request.unique_key not in self._requests_cache: + request = Request.model_validate( + await self._api_client.get_request(unique_key_to_request_id(request.unique_key)) + ) + self._requests_cache[request.unique_key] = request + # Add new requests to the end of the head, unless already present in head if request.unique_key not in self._head_requests: self._head_requests.appendleft(request.unique_key) diff --git a/tests/integration/test_crawlers_with_storages.py b/tests/integration/test_crawlers_with_storages.py index 7508213a..a2ba1e4d 100644 --- a/tests/integration/test_crawlers_with_storages.py +++ b/tests/integration/test_crawlers_with_storages.py @@ -41,35 +41,6 @@ async def default_handler(context: ParselCrawlingContext) -> None: assert run_result.status == 'SUCCEEDED' -async def test_actor_on_platform_max_crawl_depth( -) -> None: - """Test that the actor respects max_crawl_depth.""" - - """The crawler entry point.""" - import re - - from crawlee.crawlers import ParselCrawler, ParselCrawlingContext - - from apify import Actor - - async with Actor: - rq= await Actor.open_request_queue(force_cloud=True) - crawler = ParselCrawler(max_crawl_depth=2, request_manager=rq) - finished = [] - enqueue_pattern = re.compile(r'http://localhost:8080/2+$') - - @crawler.router.default_handler - async def default_handler(context: ParselCrawlingContext) -> None: - """Default request handler.""" - context.log.info(f'Processing {context.request.url} ...') - await context.enqueue_links(include=[enqueue_pattern]) - finished.append(context.request.url) - - await crawler.run(['http://localhost:8080/']) - assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22'] - - - async def test_actor_on_platform_max_requests_per_crawl( make_actor: MakeActorFunction, run_actor: RunActorFunction, diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index ed913b89..a0ed67ea 100644 --- 
a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -16,735 +16,579 @@ from .conftest import MakeActorFunction, RunActorFunction -async def test_add_and_fetch_requests( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: +async def test_add_and_fetch_requests(request_queue_force_cloud: RequestQueue) -> None: """Test basic functionality of adding and fetching requests.""" - async def main() -> None: - async with Actor: - desired_request_count = 100 - Actor.log.info('Opening request queue...') - rq = await Actor.open_request_queue() - - # Add some requests - for i in range(desired_request_count): - Actor.log.info(f'Adding request {i}...') - await rq.add_request(f'https://example.com/{i}') - - handled_request_count = 0 - while next_request := await rq.fetch_next_request(): - Actor.log.info('Fetching next request...') - queue_operation_info = await rq.mark_request_as_handled(next_request) - assert queue_operation_info is not None, f'queue_operation_info={queue_operation_info}' - assert queue_operation_info.was_already_handled is False, ( - f'queue_operation_info.was_already_handled={queue_operation_info.was_already_handled}' - ) - handled_request_count += 1 - - assert handled_request_count == desired_request_count, ( - f'handled_request_count={handled_request_count}', - f'desired_request_count={desired_request_count}', - ) - Actor.log.info('Waiting for queue to be finished...') - is_finished = await rq.is_finished() - assert is_finished is True, f'is_finished={is_finished}' - - actor = await make_actor(label='rq-simple-test', main_func=main) - run_result = await run_actor(actor) - - assert run_result.status == 'SUCCEEDED' - - -async def test_add_requests_in_batches( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + desired_request_count = 100 + Actor.log.info('Opening request queue...') + rq = request_queue_force_cloud + + # Add some requests + for i in 
range(desired_request_count): + Actor.log.info(f'Adding request {i}...') + await rq.add_request(f'https://example.com/{i}') + + handled_request_count = 0 + while next_request := await rq.fetch_next_request(): + Actor.log.info('Fetching next request...') + queue_operation_info = await rq.mark_request_as_handled(next_request) + assert queue_operation_info is not None, f'queue_operation_info={queue_operation_info}' + assert queue_operation_info.was_already_handled is False, ( + f'queue_operation_info.was_already_handled={queue_operation_info.was_already_handled}' + ) + handled_request_count += 1 + + assert handled_request_count == desired_request_count, ( + f'handled_request_count={handled_request_count}', + f'desired_request_count={desired_request_count}', + ) + Actor.log.info('Waiting for queue to be finished...') + is_finished = await rq.is_finished() + assert is_finished is True, f'is_finished={is_finished}' + + +async def test_add_requests_in_batches(request_queue_force_cloud: RequestQueue) -> None: """Test adding multiple requests in a single batch operation.""" - async def main() -> None: - async with Actor: - desired_request_count = 100 - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Add some requests - await rq.add_requests([f'https://example.com/{i}' for i in range(desired_request_count)]) - total_count = await rq.get_total_count() - Actor.log.info(f'Added {desired_request_count} requests in batch, total in queue: {total_count}') - - handled_request_count = 0 - while next_request := await rq.fetch_next_request(): - if handled_request_count % 20 == 0: - Actor.log.info(f'Processing request {handled_request_count + 1}...') - queue_operation_info = await rq.mark_request_as_handled(next_request) - assert queue_operation_info is not None, f'queue_operation_info={queue_operation_info}' - assert queue_operation_info.was_already_handled is False, ( - 
f'queue_operation_info.was_already_handled={queue_operation_info.was_already_handled}' - ) - handled_request_count += 1 - - assert handled_request_count == desired_request_count, ( - f'handled_request_count={handled_request_count}', - f'desired_request_count={desired_request_count}', - ) - is_finished = await rq.is_finished() - assert is_finished is True, f'is_finished={is_finished}' - - actor = await make_actor(label='rq-batch-test', main_func=main) - run_result = await run_actor(actor) - - assert run_result.status == 'SUCCEEDED' - - -async def test_add_non_unique_requests_in_batch( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + desired_request_count = 100 + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Add some requests + await rq.add_requests([f'https://example.com/{i}' for i in range(desired_request_count)]) + total_count = await rq.get_total_count() + Actor.log.info(f'Added {desired_request_count} requests in batch, total in queue: {total_count}') + + handled_request_count = 0 + while next_request := await rq.fetch_next_request(): + if handled_request_count % 20 == 0: + Actor.log.info(f'Processing request {handled_request_count + 1}...') + queue_operation_info = await rq.mark_request_as_handled(next_request) + assert queue_operation_info is not None, f'queue_operation_info={queue_operation_info}' + assert queue_operation_info.was_already_handled is False, ( + f'queue_operation_info.was_already_handled={queue_operation_info.was_already_handled}' + ) + handled_request_count += 1 + + assert handled_request_count == desired_request_count, ( + f'handled_request_count={handled_request_count}', + f'desired_request_count={desired_request_count}', + ) + is_finished = await rq.is_finished() + assert is_finished is True, f'is_finished={is_finished}' + + +async def test_add_non_unique_requests_in_batch(request_queue_force_cloud: RequestQueue) -> None: """Test adding requests with duplicate unique keys in 
batch.""" - async def main() -> None: - from apify import Request - - async with Actor: - desired_request_count = 100 - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Add some requests - requests_to_add = [ - Request.from_url(f'https://example.com/{i}', unique_key=str(i - 1 if i % 4 == 1 else i)) - for i in range(desired_request_count) - ] - await rq.add_requests(requests_to_add) - total_count = await rq.get_total_count() - Actor.log.info( - f'Added {desired_request_count} requests with duplicate unique keys, total in queue: {total_count}' - ) - - handled_request_count = 0 - while next_request := await rq.fetch_next_request(): - if handled_request_count % 20 == 0: - Actor.log.info(f'Processing request {handled_request_count + 1}: {next_request.url}') - queue_operation_info = await rq.mark_request_as_handled(next_request) - assert queue_operation_info is not None, f'queue_operation_info={queue_operation_info}' - assert queue_operation_info.was_already_handled is False, ( - f'queue_operation_info.was_already_handled={queue_operation_info.was_already_handled}' - ) - handled_request_count += 1 - - expected_count = int(desired_request_count * 3 / 4) - assert handled_request_count == expected_count, ( - f'handled_request_count={handled_request_count}', - f'expected_count={expected_count}', - ) - is_finished = await rq.is_finished() - Actor.log.info(f'Processed {handled_request_count}/{expected_count} requests, finished: {is_finished}') - assert is_finished is True, f'is_finished={is_finished}' - - actor = await make_actor(label='rq-batch-test', main_func=main) - run_result = await run_actor(actor) - - assert run_result.status == 'SUCCEEDED' - - -async def test_forefront_requests_ordering( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + desired_request_count = 100 + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Add some requests + requests_to_add = [ + 
Request.from_url(f'https://example.com/{i}', unique_key=str(i - 1 if i % 4 == 1 else i)) + for i in range(desired_request_count) + ] + await rq.add_requests(requests_to_add) + total_count = await rq.get_total_count() + Actor.log.info(f'Added {desired_request_count} requests with duplicate unique keys, total in queue: {total_count}') + + handled_request_count = 0 + while next_request := await rq.fetch_next_request(): + if handled_request_count % 20 == 0: + Actor.log.info(f'Processing request {handled_request_count + 1}: {next_request.url}') + queue_operation_info = await rq.mark_request_as_handled(next_request) + assert queue_operation_info is not None, f'queue_operation_info={queue_operation_info}' + assert queue_operation_info.was_already_handled is False, ( + f'queue_operation_info.was_already_handled={queue_operation_info.was_already_handled}' + ) + handled_request_count += 1 + + expected_count = int(desired_request_count * 3 / 4) + assert handled_request_count == expected_count, ( + f'handled_request_count={handled_request_count}', + f'expected_count={expected_count}', + ) + is_finished = await rq.is_finished() + Actor.log.info(f'Processed {handled_request_count}/{expected_count} requests, finished: {is_finished}') + assert is_finished is True, f'is_finished={is_finished}' + + +async def test_forefront_requests_ordering(request_queue_force_cloud: RequestQueue) -> None: """Test that forefront requests are processed before regular requests.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Add regular requests - await rq.add_request('https://example.com/1') - await rq.add_request('https://example.com/2') - await rq.add_request('https://example.com/3') - Actor.log.info('Added 3 regular requests') - - # Add forefront requests - await rq.add_request('https://example.com/priority1', forefront=True) - await rq.add_request('https://example.com/priority2', forefront=True) - 
total_count = await rq.get_total_count() - Actor.log.info(f'Added 2 forefront requests, total in queue: {total_count}') - - # Fetch requests and verify order - fetched_urls = [] - while next_request := await rq.fetch_next_request(): - Actor.log.info(f'Fetched request: {next_request.url}') - fetched_urls.append(next_request.url) - await rq.mark_request_as_handled(next_request) - - # Forefront requests should come first (in reverse order of addition) - expected_order = [ - 'https://example.com/priority2', - 'https://example.com/priority1', - 'https://example.com/1', - 'https://example.com/2', - 'https://example.com/3', - ] - assert fetched_urls == expected_order, ( - f'fetched_urls={fetched_urls}', - f'expected_order={expected_order}', - ) - - actor = await make_actor(label='rq-forefront-order-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' - - -async def test_request_unique_key_behavior( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Add regular requests + await rq.add_request('https://example.com/1') + await rq.add_request('https://example.com/2') + await rq.add_request('https://example.com/3') + Actor.log.info('Added 3 regular requests') + + # Add forefront requests + await rq.add_request('https://example.com/priority1', forefront=True) + await rq.add_request('https://example.com/priority2', forefront=True) + total_count = await rq.get_total_count() + Actor.log.info(f'Added 2 forefront requests, total in queue: {total_count}') + + # Fetch requests and verify order + fetched_urls = [] + while next_request := await rq.fetch_next_request(): + Actor.log.info(f'Fetched request: {next_request.url}') + fetched_urls.append(next_request.url) + await rq.mark_request_as_handled(next_request) + + # Forefront requests should come first (in reverse order of addition) + expected_order = [ + 
'https://example.com/priority2', + 'https://example.com/priority1', + 'https://example.com/1', + 'https://example.com/2', + 'https://example.com/3', + ] + assert fetched_urls == expected_order, ( + f'fetched_urls={fetched_urls}', + f'expected_order={expected_order}', + ) + + +async def test_request_unique_key_behavior(request_queue_force_cloud: RequestQueue) -> None: """Test behavior of custom unique keys.""" - async def main() -> None: - from apify import Request + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Add requests with custom unique keys - req1 = Request.from_url('https://example.com/page1', unique_key='custom-key-1') - req2 = Request.from_url('https://example.com/page2', unique_key='custom-key-1') # Same key - req3 = Request.from_url('https://example.com/page3', unique_key='custom-key-2') # Different key - - result1 = await rq.add_request(req1) - result2 = await rq.add_request(req2) - result3 = await rq.add_request(req3) - - Actor.log.info( - f'Added requests - was_already_present: [{result1.was_already_present}, ' - f'{result2.was_already_present}, {result3.was_already_present}]' - ) - - # Second request should be marked as already present - assert result1.was_already_present is False, f'result1.was_already_present={result1.was_already_present}' - assert result2.was_already_present is True, f'result2.was_already_present={result2.was_already_present}' - assert result3.was_already_present is False, f'result3.was_already_present={result3.was_already_present}' - - # Only 2 requests should be fetchable - fetched_count = 0 - fetched_requests = [] - while next_request := await rq.fetch_next_request(): - fetched_count += 1 - fetched_requests.append(next_request) - await rq.mark_request_as_handled(next_request) - - assert fetched_count == 2, f'fetched_count={fetched_count}' - - # Verify the fetched requests have the correct unique 
keys - unique_keys = {req.unique_key for req in fetched_requests} - expected_keys = {'custom-key-1', 'custom-key-2'} - assert unique_keys == expected_keys, ( - f'unique_keys={unique_keys}', - f'expected_keys={expected_keys}', - ) - - actor = await make_actor(label='rq-unique-key-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + # Add requests with custom unique keys + req1 = Request.from_url('https://example.com/page1', unique_key='custom-key-1') + req2 = Request.from_url('https://example.com/page2', unique_key='custom-key-1') # Same key + req3 = Request.from_url('https://example.com/page3', unique_key='custom-key-2') # Different key + result1 = await rq.add_request(req1) + result2 = await rq.add_request(req2) + result3 = await rq.add_request(req3) -async def test_request_reclaim_functionality( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: - """Test request reclaiming for failed processing.""" + Actor.log.info( + f'Added requests - was_already_present: [{result1.was_already_present}, ' + f'{result2.was_already_present}, {result3.was_already_present}]' + ) - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') + # Second request should be marked as already present + assert result1.was_already_present is False, f'result1.was_already_present={result1.was_already_present}' + assert result2.was_already_present is True, f'result2.was_already_present={result2.was_already_present}' + assert result3.was_already_present is False, f'result3.was_already_present={result3.was_already_present}' - # Add a test request - await rq.add_request('https://example.com/test') - Actor.log.info('Added test request') + # Only 2 requests should be fetchable + fetched_count = 0 + fetched_requests = [] + while next_request := await rq.fetch_next_request(): + fetched_count += 1 + fetched_requests.append(next_request) + await 
rq.mark_request_as_handled(next_request) - # Fetch and reclaim the request - request = await rq.fetch_next_request() - assert request is not None, f'request={request}' - Actor.log.info(f'Fetched request: {request.url}') - - # Reclaim the request (simulate failed processing) - reclaim_result = await rq.reclaim_request(request) - assert reclaim_result is not None, f'reclaim_result={reclaim_result}' - assert reclaim_result.was_already_handled is False, ( - f'reclaim_result.was_already_handled={reclaim_result.was_already_handled}' - ) - Actor.log.info('Request reclaimed successfully') - - # Should be able to fetch the same request again - request2 = await rq.fetch_next_request() - assert request2 is not None, f'request2={request2}' - assert request2.url == request.url, ( - f'request2.url={request2.url}', - f'request.url={request.url}', - ) - Actor.log.info(f'Successfully fetched reclaimed request: {request2.url}') + assert fetched_count == 2, f'fetched_count={fetched_count}' - # Mark as handled this time - await rq.mark_request_as_handled(request2) - is_finished = await rq.is_finished() - assert is_finished is True, f'is_finished={is_finished}' + # Verify the fetched requests have the correct unique keys + unique_keys = {req.unique_key for req in fetched_requests} + expected_keys = {'custom-key-1', 'custom-key-2'} + assert unique_keys == expected_keys, ( + f'unique_keys={unique_keys}', + f'expected_keys={expected_keys}', + ) - actor = await make_actor(label='rq-reclaim-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' +async def test_request_reclaim_functionality(request_queue_force_cloud: RequestQueue) -> None: + """Test request reclaiming for failed processing.""" -async def test_request_reclaim_with_forefront( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Add a test request + await 
rq.add_request('https://example.com/test') + Actor.log.info('Added test request') + + # Fetch and reclaim the request + request = await rq.fetch_next_request() + assert request is not None, f'request={request}' + Actor.log.info(f'Fetched request: {request.url}') + + # Reclaim the request (simulate failed processing) + reclaim_result = await rq.reclaim_request(request) + assert reclaim_result is not None, f'reclaim_result={reclaim_result}' + assert reclaim_result.was_already_handled is False, ( + f'reclaim_result.was_already_handled={reclaim_result.was_already_handled}' + ) + Actor.log.info('Request reclaimed successfully') + + # Should be able to fetch the same request again + request2 = await rq.fetch_next_request() + assert request2 is not None, f'request2={request2}' + assert request2.url == request.url, ( + f'request2.url={request2.url}', + f'request.url={request.url}', + ) + Actor.log.info(f'Successfully fetched reclaimed request: {request2.url}') + + # Mark as handled this time + await rq.mark_request_as_handled(request2) + is_finished = await rq.is_finished() + assert is_finished is True, f'is_finished={is_finished}' + + +async def test_request_reclaim_with_forefront(request_queue_force_cloud: RequestQueue) -> None: """Test reclaiming requests to the front of the queue.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - # Add multiple requests - await rq.add_request('https://example.com/1') - await rq.add_request('https://example.com/2') - await rq.add_request('https://example.com/3') - Actor.log.info('Added 3 requests') - - # Fetch first request - first_request = await rq.fetch_next_request() - assert first_request is not None, f'first_request={first_request}' - Actor.log.info(f'Fetched first request: {first_request.url}') - - # Reclaim to forefront - await rq.reclaim_request(first_request, 
forefront=True) - Actor.log.info('Request reclaimed to forefront') - - # The reclaimed request should be fetched first again - next_request = await rq.fetch_next_request() - assert next_request is not None, f'next_request={next_request}' - assert next_request.url == first_request.url, ( - f'next_request.url={next_request.url}', - f'first_request.url={first_request.url}', - ) - Actor.log.info(f'Confirmed reclaimed request came first: {next_request.url}') + # Add multiple requests + await rq.add_request('https://example.com/1') + await rq.add_request('https://example.com/2') + await rq.add_request('https://example.com/3') + Actor.log.info('Added 3 requests') - # Clean up - await rq.mark_request_as_handled(next_request) - remaining_count = 0 + # Fetch first request + first_request = await rq.fetch_next_request() + assert first_request is not None, f'first_request={first_request}' + Actor.log.info(f'Fetched first request: {first_request.url}') - while next_request := await rq.fetch_next_request(): - remaining_count += 1 - await rq.mark_request_as_handled(next_request) + # Reclaim to forefront + await rq.reclaim_request(first_request, forefront=True) + Actor.log.info('Request reclaimed to forefront') - Actor.log.info(f'Test completed - processed {remaining_count} additional requests') - - actor = await make_actor(label='rq-reclaim-forefront-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + # The reclaimed request should be fetched first again + next_request = await rq.fetch_next_request() + assert next_request is not None, f'next_request={next_request}' + assert next_request.url == first_request.url, ( + f'next_request.url={next_request.url}', + f'first_request.url={first_request.url}', + ) + Actor.log.info(f'Confirmed reclaimed request came first: {next_request.url}') + # Clean up + await rq.mark_request_as_handled(next_request) + remaining_count = 0 -async def test_complex_request_objects( - make_actor: 
MakeActorFunction, - run_actor: RunActorFunction, -) -> None: - """Test handling complex Request objects with various properties.""" - - async def main() -> None: - from apify import Request - - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Create request with various properties - request = Request.from_url( - 'https://example.com/api/data', - method='POST', - headers={'Authorization': 'Bearer token123', 'Content-Type': 'application/json'}, - user_data={'category': 'api', 'priority': 'high'}, - unique_key='api-request-1', - ) - await rq.add_request(request) - Actor.log.info(f'Added complex request: {request.url} with method {request.method}') - - # Fetch and verify all properties are preserved - fetched_request = await rq.fetch_next_request() - assert fetched_request is not None, f'fetched_request={fetched_request}' - Actor.log.info(f'Fetched request: {fetched_request.url}') - - assert fetched_request.url == 'https://example.com/api/data', f'fetched_request.url={fetched_request.url}' - assert fetched_request.method == 'POST', f'fetched_request.method={fetched_request.method}' - assert fetched_request.headers['Authorization'] == 'Bearer token123', ( - f'fetched_request.headers["Authorization"]={fetched_request.headers["Authorization"]}' - ) - assert fetched_request.headers['Content-Type'] == 'application/json', ( - f'fetched_request.headers["Content-Type"]={fetched_request.headers["Content-Type"]}' - ) - assert fetched_request.user_data['category'] == 'api', ( - f'fetched_request.user_data["category"]={fetched_request.user_data["category"]}' - ) - assert fetched_request.user_data['priority'] == 'high', ( - f'fetched_request.user_data["priority"]={fetched_request.user_data["priority"]}' - ) - assert fetched_request.unique_key == 'api-request-1', ( - f'fetched_request.unique_key={fetched_request.unique_key}' - ) - Actor.log.info('All properties verified successfully') + while next_request := await 
rq.fetch_next_request(): + remaining_count += 1 + await rq.mark_request_as_handled(next_request) - await rq.mark_request_as_handled(fetched_request) - Actor.log.info('Complex request test completed') + Actor.log.info(f'Test completed - processed {remaining_count} additional requests') - actor = await make_actor(label='rq-complex-request-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' +async def test_complex_request_objects(request_queue_force_cloud: RequestQueue) -> None: + """Test handling complex Request objects with various properties.""" -async def test_get_request_by_unique_key( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Create request with various properties + request = Request.from_url( + 'https://example.com/api/data', + method='POST', + headers={'Authorization': 'Bearer token123', 'Content-Type': 'application/json'}, + user_data={'category': 'api', 'priority': 'high'}, + unique_key='api-request-1', + ) + await rq.add_request(request) + Actor.log.info(f'Added complex request: {request.url} with method {request.method}') + + # Fetch and verify all properties are preserved + fetched_request = await rq.fetch_next_request() + assert fetched_request is not None, f'fetched_request={fetched_request}' + Actor.log.info(f'Fetched request: {fetched_request.url}') + + assert fetched_request.url == 'https://example.com/api/data', f'fetched_request.url={fetched_request.url}' + assert fetched_request.method == 'POST', f'fetched_request.method={fetched_request.method}' + assert fetched_request.headers['Authorization'] == 'Bearer token123', ( + f'fetched_request.headers["Authorization"]={fetched_request.headers["Authorization"]}' + ) + assert fetched_request.headers['Content-Type'] == 'application/json', ( + f'fetched_request.headers["Content-Type"]={fetched_request.headers["Content-Type"]}' + ) + assert 
fetched_request.user_data['category'] == 'api', ( + f'fetched_request.user_data["category"]={fetched_request.user_data["category"]}' + ) + assert fetched_request.user_data['priority'] == 'high', ( + f'fetched_request.user_data["priority"]={fetched_request.user_data["priority"]}' + ) + assert fetched_request.unique_key == 'api-request-1', f'fetched_request.unique_key={fetched_request.unique_key}' + Actor.log.info('All properties verified successfully') + + await rq.mark_request_as_handled(fetched_request) + Actor.log.info('Complex request test completed') + + +async def test_get_request_by_unique_key(request_queue_force_cloud: RequestQueue) -> None: """Test retrieving specific requests by their unique_key.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Add a request and get its unique_key - add_result = await rq.add_request('https://example.com/test') - request_unique_key = add_result.unique_key - Actor.log.info(f'Request added with unique_key: {request_unique_key}') + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - # Retrieve the request by unique_key - retrieved_request = await rq.get_request(request_unique_key) - assert retrieved_request is not None, f'retrieved_request={retrieved_request}' - assert retrieved_request.url == 'https://example.com/test', f'retrieved_request.url={retrieved_request.url}' - assert retrieved_request.unique_key == request_unique_key, (f'{request_unique_key=}',) - Actor.log.info('Request retrieved successfully by unique_key') + # Add a request and get its unique_key + add_result = await rq.add_request('https://example.com/test') + request_unique_key = add_result.unique_key + Actor.log.info(f'Request added with unique_key: {request_unique_key}') - # Test with non-existent unique_key - non_existent_request = await rq.get_request('non-existent-unique_key') - assert non_existent_request is None, 
f'non_existent_request={non_existent_request}' - Actor.log.info('Non-existent unique_key correctly returned None') + # Retrieve the request by unique_key + retrieved_request = await rq.get_request(request_unique_key) + assert retrieved_request is not None, f'retrieved_request={retrieved_request}' + assert retrieved_request.url == 'https://example.com/test', f'retrieved_request.url={retrieved_request.url}' + assert retrieved_request.unique_key == request_unique_key, (f'{request_unique_key=}',) + Actor.log.info('Request retrieved successfully by unique_key') - actor = await make_actor(label='rq-get-by-unique-key-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + # Test with non-existent unique_key + non_existent_request = await rq.get_request('non-existent-unique_key') + assert non_existent_request is None, f'non_existent_request={non_existent_request}' + Actor.log.info('Non-existent unique_key correctly returned None') -async def test_metadata_tracking( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: +async def test_metadata_tracking(request_queue_force_cloud: RequestQueue) -> None: """Test request queue metadata and counts.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - # Check initial state - initial_total = await rq.get_total_count() - initial_handled = await rq.get_handled_count() - Actor.log.info(f'Initial state - Total: {initial_total}, Handled: {initial_handled}') - assert initial_total == 0, f'initial_total={initial_total}' - assert initial_handled == 0, f'initial_handled={initial_handled}' - - # Add requests - await rq.add_requests([f'https://example.com/{i}' for i in range(5)]) - Actor.log.info('Added 5 requests in batch') - - # Check counts after adding - total_after_add = await rq.get_total_count() - 
handled_after_add = await rq.get_handled_count() - Actor.log.info(f'After adding - Total: {total_after_add}, Handled: {handled_after_add}') - assert total_after_add == 5, f'total_after_add={total_after_add}' - assert handled_after_add == 0, f'handled_after_add={handled_after_add}' - - # Process some requests - for _ in range(3): - request = await rq.fetch_next_request() - if request: - await rq.mark_request_as_handled(request) + # Check initial state + initial_total = await rq.get_total_count() + initial_handled = await rq.get_handled_count() + Actor.log.info(f'Initial state - Total: {initial_total}, Handled: {initial_handled}') + assert initial_total == 0, f'initial_total={initial_total}' + assert initial_handled == 0, f'initial_handled={initial_handled}' - Actor.log.info('Processed 3 requests') + # Add requests + await rq.add_requests([f'https://example.com/{i}' for i in range(5)]) + Actor.log.info('Added 5 requests in batch') - # Check counts after processing - final_total = await rq.get_total_count() - final_handled = await rq.get_handled_count() - Actor.log.info(f'Final state - Total: {final_total}, Handled: {final_handled}') - assert final_total == 5, f'final_total={final_total}' - assert final_handled == 3, f'final_handled={final_handled}' + # Check counts after adding + total_after_add = await rq.get_total_count() + handled_after_add = await rq.get_handled_count() + Actor.log.info(f'After adding - Total: {total_after_add}, Handled: {handled_after_add}') + assert total_after_add == 5, f'total_after_add={total_after_add}' + assert handled_after_add == 0, f'handled_after_add={handled_after_add}' - actor = await make_actor(label='rq-metadata-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + # Process some requests + for _ in range(3): + request = await rq.fetch_next_request() + if request: + await rq.mark_request_as_handled(request) + Actor.log.info('Processed 3 requests') -async def 
test_batch_operations_performance( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + # Check counts after processing + final_total = await rq.get_total_count() + final_handled = await rq.get_handled_count() + Actor.log.info(f'Final state - Total: {final_total}, Handled: {final_handled}') + assert final_total == 5, f'final_total={final_total}' + assert final_handled == 3, f'final_handled={final_handled}' + + +async def test_batch_operations_performance(request_queue_force_cloud: RequestQueue) -> None: """Test batch operations vs individual operations.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - # Test batch add vs individual adds - batch_requests = [f'https://example.com/batch/{i}' for i in range(50)] - Actor.log.info(f'Prepared {len(batch_requests)} requests for batch add') + # Test batch add vs individual adds + batch_requests = [f'https://example.com/batch/{i}' for i in range(50)] + Actor.log.info(f'Prepared {len(batch_requests)} requests for batch add') - # Add in batch - await rq.add_requests(batch_requests) - Actor.log.info('Batch add completed') + # Add in batch + await rq.add_requests(batch_requests) + Actor.log.info('Batch add completed') - # Verify all requests were added - total_count = await rq.get_total_count() - handled_count = await rq.get_handled_count() - Actor.log.info(f'After batch add - Total: {total_count}, Handled: {handled_count}') - assert total_count == 50, f'total_count={total_count}' - assert handled_count == 0, f'handled_count={handled_count}' - - # Process all requests - processed_count = 0 - while next_request := await rq.fetch_next_request(): - processed_count += 1 - await rq.mark_request_as_handled(next_request) - if processed_count >= 50: # Safety break - break + # Verify all requests were added + total_count = await rq.get_total_count() + 
handled_count = await rq.get_handled_count() + Actor.log.info(f'After batch add - Total: {total_count}, Handled: {handled_count}') + assert total_count == 50, f'total_count={total_count}' + assert handled_count == 0, f'handled_count={handled_count}' - Actor.log.info(f'Processing completed. Total processed: {processed_count}') - assert processed_count == 50, f'processed_count={processed_count}' + # Process all requests + processed_count = 0 + while next_request := await rq.fetch_next_request(): + processed_count += 1 + await rq.mark_request_as_handled(next_request) + if processed_count >= 50: # Safety break + break - is_finished = await rq.is_finished() - assert is_finished is True, f'is_finished={is_finished}' + Actor.log.info(f'Processing completed. Total processed: {processed_count}') + assert processed_count == 50, f'processed_count={processed_count}' - actor = await make_actor(label='rq-batch-performance-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + is_finished = await rq.is_finished() + assert is_finished is True, f'is_finished={is_finished}' -async def test_state_consistency( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: +async def test_state_consistency(request_queue_force_cloud: RequestQueue) -> None: """Test queue state consistency during concurrent operations.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - # Add initial requests - for i in range(10): - await rq.add_request(f'https://example.com/{i}') - Actor.log.info('Added 10 initial requests') + # Add initial requests + for i in range(10): + await rq.add_request(f'https://example.com/{i}') + Actor.log.info('Added 10 initial requests') - initial_total = await rq.get_total_count() - Actor.log.info(f'Initial total count: {initial_total}') + initial_total = 
await rq.get_total_count() + Actor.log.info(f'Initial total count: {initial_total}') - # Simulate some requests being processed and others being reclaimed - processed_requests = [] - reclaimed_requests = [] + # Simulate some requests being processed and others being reclaimed + processed_requests = [] + reclaimed_requests = [] - for i in range(5): - request = await rq.fetch_next_request() - if request: - if i % 2 == 0: # Process even indices - await rq.mark_request_as_handled(request) - processed_requests.append(request) - else: # Reclaim odd indices - await rq.reclaim_request(request) - reclaimed_requests.append(request) + for i in range(5): + request = await rq.fetch_next_request() + if request: + if i % 2 == 0: # Process even indices + await rq.mark_request_as_handled(request) + processed_requests.append(request) + else: # Reclaim odd indices + await rq.reclaim_request(request) + reclaimed_requests.append(request) - Actor.log.info(f'Processed {len(processed_requests)} requests, reclaimed {len(reclaimed_requests)}') + Actor.log.info(f'Processed {len(processed_requests)} requests, reclaimed {len(reclaimed_requests)}') - # Verify queue state - expected_handled = len(processed_requests) - current_handled = await rq.get_handled_count() - current_total = await rq.get_total_count() + # Verify queue state + expected_handled = len(processed_requests) + current_handled = await rq.get_handled_count() + current_total = await rq.get_total_count() - Actor.log.info(f'Expected handled: {expected_handled}, Actual handled: {current_handled}') - Actor.log.info(f'Current total: {current_total}') + Actor.log.info(f'Expected handled: {expected_handled}, Actual handled: {current_handled}') + Actor.log.info(f'Current total: {current_total}') - assert current_handled == expected_handled, ( - f'current_handled={current_handled}', - f'expected_handled={expected_handled}', - ) - assert current_total == 10, f'current_total={current_total}' + assert current_handled == expected_handled, ( + 
f'current_handled={current_handled}', + f'expected_handled={expected_handled}', + ) + assert current_total == 10, f'current_total={current_total}' - # Process remaining requests - remaining_count = 0 - while next_request := await rq.fetch_next_request(): - remaining_count += 1 - await rq.mark_request_as_handled(next_request) + # Process remaining requests + remaining_count = 0 + while next_request := await rq.fetch_next_request(): + remaining_count += 1 + await rq.mark_request_as_handled(next_request) - Actor.log.info(f'Processed {remaining_count} remaining requests') - is_finished = await rq.is_finished() - assert is_finished is True, f'is_finished={is_finished}' + Actor.log.info(f'Processed {remaining_count} remaining requests') + is_finished = await rq.is_finished() + assert is_finished is True, f'is_finished={is_finished}' - actor = await make_actor(label='rq-state-consistency-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' - -async def test_empty_rq_behavior( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: +async def test_empty_rq_behavior(request_queue_force_cloud: RequestQueue) -> None: """Test behavior with empty queues.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Test empty queue operations - is_empty = await rq.is_empty() - is_finished = await rq.is_finished() - Actor.log.info(f'Empty queue - is_empty: {is_empty}, is_finished: {is_finished}') - assert is_empty is True, f'is_empty={is_empty}' - assert is_finished is True, f'is_finished={is_finished}' - - # Fetch from empty queue - request = await rq.fetch_next_request() - Actor.log.info(f'Fetch result from empty queue: {request}') - assert request is None, f'request={request}' - - # Check metadata for empty queue - metadata = await rq.get_metadata() - assert metadata is not None, f'metadata={metadata}' - Actor.log.info( - f'Empty 
queue metadata - Total: {metadata.total_request_count}, ' - f'Handled: {metadata.handled_request_count}, ' - f'Pending: {metadata.pending_request_count}' - ) - assert metadata.total_request_count == 0, f'metadata.total_request_count={metadata.total_request_count}' - assert metadata.handled_request_count == 0, ( - f'metadata.handled_request_count={metadata.handled_request_count}' - ) - assert metadata.pending_request_count == 0, ( - f'metadata.pending_request_count={metadata.pending_request_count}' - ) - - actor = await make_actor(label='rq-empty-queue-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' - - -async def test_large_batch_operations( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Test empty queue operations + is_empty = await rq.is_empty() + is_finished = await rq.is_finished() + Actor.log.info(f'Empty queue - is_empty: {is_empty}, is_finished: {is_finished}') + assert is_empty is True, f'is_empty={is_empty}' + assert is_finished is True, f'is_finished={is_finished}' + + # Fetch from empty queue + request = await rq.fetch_next_request() + Actor.log.info(f'Fetch result from empty queue: {request}') + assert request is None, f'request={request}' + + # Check metadata for empty queue + metadata = await rq.get_metadata() + assert metadata is not None, f'metadata={metadata}' + Actor.log.info( + f'Empty queue metadata - Total: {metadata.total_request_count}, ' + f'Handled: {metadata.handled_request_count}, ' + f'Pending: {metadata.pending_request_count}' + ) + assert metadata.total_request_count == 0, f'metadata.total_request_count={metadata.total_request_count}' + assert metadata.handled_request_count == 0, f'metadata.handled_request_count={metadata.handled_request_count}' + assert metadata.pending_request_count == 0, f'metadata.pending_request_count={metadata.pending_request_count}' + + +async def 
test_large_batch_operations(request_queue_force_cloud: RequestQueue) -> None: """Test handling large batches of requests.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - # Create a large batch of requests - large_batch = [f'https://example.com/large/{i}' for i in range(500)] - Actor.log.info(f'Created batch of {len(large_batch)} requests') + # Create a large batch of requests + large_batch = [f'https://example.com/large/{i}' for i in range(500)] + Actor.log.info(f'Created batch of {len(large_batch)} requests') - # Add in batch - await rq.add_requests(large_batch, batch_size=100, wait_for_all_requests_to_be_added=True) - Actor.log.info('Large batch add completed') + # Add in batch + await rq.add_requests(large_batch, batch_size=100, wait_for_all_requests_to_be_added=True) + Actor.log.info('Large batch add completed') - # Verify all requests were added - total_count = await rq.get_total_count() - assert total_count == 500, f'total_count={total_count}' + # Verify all requests were added + total_count = await rq.get_total_count() + assert total_count == 500, f'total_count={total_count}' - # Process all in chunks to test performance - processed_count = 0 + # Process all in chunks to test performance + processed_count = 0 - while not await rq.is_empty(): - request = await rq.fetch_next_request() + while not await rq.is_empty(): + request = await rq.fetch_next_request() - # The RQ is_empty should ensure we don't get None - assert request is not None, f'request={request}' - - await rq.mark_request_as_handled(request) - processed_count += 1 + # The RQ is_empty should ensure we don't get None + assert request is not None, f'request={request}' - Actor.log.info(f'Processing completed. 
Total processed: {processed_count}') - assert processed_count == 500, f'processed_count={processed_count}' + await rq.mark_request_as_handled(request) + processed_count += 1 - is_finished = await rq.is_finished() - assert is_finished is True, f'is_finished={is_finished}' + Actor.log.info(f'Processing completed. Total processed: {processed_count}') + assert processed_count == 500, f'processed_count={processed_count}' - actor = await make_actor(label='rq-large-batch-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + is_finished = await rq.is_finished() + assert is_finished is True, f'is_finished={is_finished}' -async def test_mixed_string_and_request_objects( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: +async def test_mixed_string_and_request_objects(request_queue_force_cloud: RequestQueue) -> None: """Test adding both string URLs and Request objects.""" - async def main() -> None: - from apify import Request + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') + # Add string URLs + await rq.add_request('https://example.com/string1') + await rq.add_request('https://example.com/string2') + Actor.log.info('Added string URL requests') - # Add string URLs - await rq.add_request('https://example.com/string1') - await rq.add_request('https://example.com/string2') - Actor.log.info('Added string URL requests') - - # Add Request objects - request_obj = Request.from_url('https://example.com/object1', user_data={'type': 'request_object'}) - await rq.add_request(request_obj) - Actor.log.info('Added Request object with user_data') - - # Add mixed batch - mixed_batch: list[str | Request] = [ - 'https://example.com/mixed1', - Request.from_url('https://example.com/mixed2', method='POST'), - 'https://example.com/mixed3', - ] - await rq.add_requests(mixed_batch) - 
Actor.log.info('Added mixed batch of strings and Request objects') + # Add Request objects + request_obj = Request.from_url('https://example.com/object1', user_data={'type': 'request_object'}) + await rq.add_request(request_obj) + Actor.log.info('Added Request object with user_data') - total_count = await rq.get_total_count() - Actor.log.info(f'Total requests in queue: {total_count}') + # Add mixed batch + mixed_batch: list[str | Request] = [ + 'https://example.com/mixed1', + Request.from_url('https://example.com/mixed2', method='POST'), + 'https://example.com/mixed3', + ] + await rq.add_requests(mixed_batch) + Actor.log.info('Added mixed batch of strings and Request objects') - # Fetch and verify all types work - fetched_requests = [] - while next_request := await rq.fetch_next_request(): - fetched_requests.append(next_request) - await rq.mark_request_as_handled(next_request) + total_count = await rq.get_total_count() + Actor.log.info(f'Total requests in queue: {total_count}') - assert len(fetched_requests) == 6, f'len(fetched_requests)={len(fetched_requests)}' + # Fetch and verify all types work + fetched_requests = [] + while next_request := await rq.fetch_next_request(): + fetched_requests.append(next_request) + await rq.mark_request_as_handled(next_request) - # Find the request object we added - request_obj_found = None - for req in fetched_requests: - if req.user_data and req.user_data.get('type') == 'request_object': - request_obj_found = req - break + assert len(fetched_requests) == 6, f'len(fetched_requests)={len(fetched_requests)}' - assert request_obj_found is not None, f'request_obj_found={request_obj_found}' - assert request_obj_found.url == 'https://example.com/object1', ( - f'request_obj_found.url={request_obj_found.url}' - ) - Actor.log.info('Mixed types verified - found request object with user_data') + # Find the request object we added + request_obj_found = None + for req in fetched_requests: + if req.user_data and req.user_data.get('type') == 
'request_object': + request_obj_found = req + break - actor = await make_actor(label='rq-mixed-types-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + assert request_obj_found is not None, f'request_obj_found={request_obj_found}' + assert request_obj_found.url == 'https://example.com/object1', f'request_obj_found.url={request_obj_found.url}' + Actor.log.info('Mixed types verified - found request object with user_data') @pytest.mark.skip( @@ -840,193 +684,160 @@ async def worker() -> int: assert run_result.status == 'SUCCEEDED' -async def test_persistence_across_operations( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: +async def test_persistence_across_operations(request_queue_force_cloud: RequestQueue) -> None: """Test that queue state persists across different operations.""" - async def main() -> None: - async with Actor: - # Open queue and add some requests - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Add initial batch - initial_requests = [f'https://example.com/persist/{i}' for i in range(10)] - await rq.add_requests(initial_requests, wait_for_all_requests_to_be_added=True) - Actor.log.info(f'Added initial batch of {len(initial_requests)} requests') - - initial_total = await rq.get_total_count() - Actor.log.info(f'Total count after initial batch: {initial_total}') - - # Process some requests - processed_count = 0 - for _ in range(5): - request = await rq.fetch_next_request() - if request: - await rq.mark_request_as_handled(request) - processed_count += 1 - - Actor.log.info(f'Processed {processed_count} requests from initial batch') - handled_after_first_batch = await rq.get_handled_count() - Actor.log.info(f'Handled count after processing: {handled_after_first_batch}') - - # Add more requests - additional_requests = [f'https://example.com/additional/{i}' for i in range(5)] - await rq.add_requests(additional_requests, 
wait_for_all_requests_to_be_added=True) - Actor.log.info(f'Added additional batch of {len(additional_requests)} requests') - - # Check final state - total_after_additional = await rq.get_total_count() - handled_after_additional = await rq.get_handled_count() - Actor.log.info( - f'After adding additional batch - Total: {total_after_additional}, Handled: {handled_after_additional}' - ) - assert total_after_additional == 15, f'total_after_additional={total_after_additional}' - assert handled_after_additional == 5, f'handled_after_additional={handled_after_additional}' - - # Process remaining - remaining_processed = 0 - while not await rq.is_finished(): - request = await rq.fetch_next_request() - if request: - remaining_processed += 1 - await rq.mark_request_as_handled(request) - else: - break - - Actor.log.info(f'Processed {remaining_processed} remaining requests') - is_finished = await rq.is_finished() - final_total = await rq.get_total_count() - final_handled = await rq.get_handled_count() - - Actor.log.info(f'Final state - Finished: {is_finished}, Total: {final_total}, Handled: {final_handled}') - assert is_finished is True, f'is_finished={is_finished}' - assert final_total == 15, f'final_total={final_total}' - assert final_handled == 15, f'final_handled={final_handled}' - - actor = await make_actor(label='rq-persistence-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' - - -async def test_request_deduplication_edge_cases( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + # Open queue and add some requests + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Add initial batch + initial_requests = [f'https://example.com/persist/{i}' for i in range(10)] + await rq.add_requests(initial_requests, wait_for_all_requests_to_be_added=True) + Actor.log.info(f'Added initial batch of {len(initial_requests)} requests') + + initial_total = await rq.get_total_count() + 
Actor.log.info(f'Total count after initial batch: {initial_total}') + + # Process some requests + processed_count = 0 + for _ in range(5): + request = await rq.fetch_next_request() + if request: + await rq.mark_request_as_handled(request) + processed_count += 1 + + Actor.log.info(f'Processed {processed_count} requests from initial batch') + handled_after_first_batch = await rq.get_handled_count() + Actor.log.info(f'Handled count after processing: {handled_after_first_batch}') + + # Add more requests + additional_requests = [f'https://example.com/additional/{i}' for i in range(5)] + await rq.add_requests(additional_requests, wait_for_all_requests_to_be_added=True) + Actor.log.info(f'Added additional batch of {len(additional_requests)} requests') + + # Check final state + total_after_additional = await rq.get_total_count() + handled_after_additional = await rq.get_handled_count() + Actor.log.info( + f'After adding additional batch - Total: {total_after_additional}, Handled: {handled_after_additional}' + ) + assert total_after_additional == 15, f'total_after_additional={total_after_additional}' + assert handled_after_additional == 5, f'handled_after_additional={handled_after_additional}' + + # Process remaining + remaining_processed = 0 + while not await rq.is_finished(): + request = await rq.fetch_next_request() + if request: + remaining_processed += 1 + await rq.mark_request_as_handled(request) + else: + break + + Actor.log.info(f'Processed {remaining_processed} remaining requests') + is_finished = await rq.is_finished() + final_total = await rq.get_total_count() + final_handled = await rq.get_handled_count() + + Actor.log.info(f'Final state - Finished: {is_finished}, Total: {final_total}, Handled: {final_handled}') + assert is_finished is True, f'is_finished={is_finished}' + assert final_total == 15, f'final_total={final_total}' + assert final_handled == 15, f'final_handled={final_handled}' + + +async def 
test_request_deduplication_edge_cases(request_queue_force_cloud: RequestQueue) -> None: """Test edge cases in request deduplication.""" - - async def main() -> None: - from apify import Request - - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Test URL normalization and deduplication with expected results - urls_and_deduplication_expectations: list[tuple[str | Request, bool]] = [ - ('https://example.com/page', False), - ('https://example.com/page/', True), # Should be deduplicated (same as first) - ('https://example.com/page?', True), # Should be deduplicated (same as first) - ( - Request.from_url('https://example.com/page#fragment', use_extended_unique_key=True), - False, - ), # Different extended unique key - ('https://example.com/page?param=1', False), # Different unique key - ] - Actor.log.info(f'Testing deduplication with {len(urls_and_deduplication_expectations)} URLs') - - results = list[bool]() - for url, expected_duplicate in urls_and_deduplication_expectations: - result = await rq.add_request(url) - results.append(result.was_already_present) - assert result.was_already_present == expected_duplicate, ( - f'url={url}', - f'expected_duplicate={expected_duplicate}', - f'actual_was_already_present={result.was_already_present}', - ) - - Actor.log.info(f'was_already_present results: {results}') - - # Calculate expected unique count - expected_unique_count = sum( - 1 for _, is_duplicate in urls_and_deduplication_expectations if not is_duplicate - ) - Actor.log.info(f'Expected {expected_unique_count} unique requests') - - # Fetch all unique requests - fetched_urls = list[str]() - while next_request := await rq.fetch_next_request(): - fetched_urls.append(next_request.url) - await rq.mark_request_as_handled(next_request) - - # Assert exact expected count - assert len(fetched_urls) == expected_unique_count, ( - f'len(fetched_urls)={len(fetched_urls)}', - f'expected_unique_count={expected_unique_count}', - ) - 
Actor.log.info( - f'Added {len(urls_and_deduplication_expectations)} URLs, ' - f'got {len(fetched_urls)} unique requests as expected' - ) - - actor = await make_actor(label='rq-deduplication-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' - - -async def test_request_ordering_with_mixed_operations( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Test URL normalization and deduplication with expected results + urls_and_deduplication_expectations: list[tuple[str | Request, bool]] = [ + ('https://example.com/page', False), + ('https://example.com/page/', True), # Should be deduplicated (same as first) + ('https://example.com/page?', True), # Should be deduplicated (same as first) + ( + Request.from_url('https://example.com/page#fragment', use_extended_unique_key=True), + False, + ), # Different extended unique key + ('https://example.com/page?param=1', False), # Different unique key + ] + Actor.log.info(f'Testing deduplication with {len(urls_and_deduplication_expectations)} URLs') + + results = list[bool]() + for url, expected_duplicate in urls_and_deduplication_expectations: + result = await rq.add_request(url) + results.append(result.was_already_present) + assert result.was_already_present == expected_duplicate, ( + f'url={url}', + f'expected_duplicate={expected_duplicate}', + f'actual_was_already_present={result.was_already_present}', + ) + + Actor.log.info(f'was_already_present results: {results}') + + # Calculate expected unique count + expected_unique_count = sum(1 for _, is_duplicate in urls_and_deduplication_expectations if not is_duplicate) + Actor.log.info(f'Expected {expected_unique_count} unique requests') + + # Fetch all unique requests + fetched_urls = list[str]() + while next_request := await rq.fetch_next_request(): + fetched_urls.append(next_request.url) + await 
rq.mark_request_as_handled(next_request) + + # Assert exact expected count + assert len(fetched_urls) == expected_unique_count, ( + f'len(fetched_urls)={len(fetched_urls)}', + f'expected_unique_count={expected_unique_count}', + ) + Actor.log.info( + f'Added {len(urls_and_deduplication_expectations)} URLs, got {len(fetched_urls)} unique requests as expected' + ) + + +async def test_request_ordering_with_mixed_operations(request_queue_force_cloud: RequestQueue) -> None: """Test request ordering with mixed add/reclaim operations.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Add initial requests - await rq.add_request('https://example.com/1') - await rq.add_request('https://example.com/2') - Actor.log.info('Added initial requests') + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') - # Fetch one and reclaim to forefront - request1 = await rq.fetch_next_request() - assert request1 is not None, f'request1={request1}' - assert request1.url == 'https://example.com/1', f'request1.url={request1.url}' - Actor.log.info(f'Fetched request: {request1.url}') + # Add initial requests + await rq.add_request('https://example.com/1') + await rq.add_request('https://example.com/2') + Actor.log.info('Added initial requests') - await rq.reclaim_request(request1, forefront=True) - Actor.log.info('Reclaimed request to forefront') + # Fetch one and reclaim to forefront + request1 = await rq.fetch_next_request() + assert request1 is not None, f'request1={request1}' + assert request1.url == 'https://example.com/1', f'request1.url={request1.url}' + Actor.log.info(f'Fetched request: {request1.url}') - # Add forefront request - await rq.add_request('https://example.com/priority', forefront=True) - Actor.log.info('Added new forefront request') + await rq.reclaim_request(request1, forefront=True) + Actor.log.info('Reclaimed request to forefront') - # Fetch all requests and verify 
forefront behavior - urls_ordered = list[str]() - while next_request := await rq.fetch_next_request(): - urls_ordered.append(next_request.url) - await rq.mark_request_as_handled(next_request) + # Add forefront request + await rq.add_request('https://example.com/priority', forefront=True) + Actor.log.info('Added new forefront request') - Actor.log.info(f'Final order of fetched URLs: {urls_ordered}') + # Fetch all requests and verify forefront behavior + urls_ordered = list[str]() + while next_request := await rq.fetch_next_request(): + urls_ordered.append(next_request.url) + await rq.mark_request_as_handled(next_request) - # Verify that we got all 3 requests - assert len(urls_ordered) == 3, f'len(urls_ordered)={len(urls_ordered)}' + Actor.log.info(f'Final order of fetched URLs: {urls_ordered}') - assert urls_ordered[0] == 'https://example.com/priority', f'urls_ordered[0]={urls_ordered[0]}' - assert urls_ordered[1] == request1.url, ( - f'urls_ordered[1]={urls_ordered[1]}', - f'request1.url={request1.url}', - ) - assert urls_ordered[2] == 'https://example.com/2', f'urls_ordered[2]={urls_ordered[2]}' - Actor.log.info('Request ordering verified successfully') + # Verify that we got all 3 requests + assert len(urls_ordered) == 3, f'len(urls_ordered)={len(urls_ordered)}' - actor = await make_actor(label='rq-mixed-ordering-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + assert urls_ordered[0] == 'https://example.com/priority', f'urls_ordered[0]={urls_ordered[0]}' + assert urls_ordered[1] == request1.url, ( + f'urls_ordered[1]={urls_ordered[1]}', + f'request1.url={request1.url}', + ) + assert urls_ordered[2] == 'https://example.com/2', f'urls_ordered[2]={urls_ordered[2]}' + Actor.log.info('Request ordering verified successfully') async def test_rq_isolation( @@ -1080,124 +891,103 @@ async def main() -> None: assert run_result.status == 'SUCCEEDED' -async def test_finished_state_accuracy( - make_actor: 
MakeActorFunction, - run_actor: RunActorFunction, -) -> None: +async def test_finished_state_accuracy(request_queue_force_cloud: RequestQueue) -> None: """Test accuracy of is_finished() method in various scenarios.""" - async def main() -> None: - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Initially should be finished - initial_finished = await rq.is_finished() - Actor.log.info(f'Initial finished state: {initial_finished}') - assert initial_finished is True, f'initial_finished={initial_finished}' - - # Add requests - should not be finished - await rq.add_request('https://example.com/test1') - await rq.add_request('https://example.com/test2') - after_add_finished = await rq.is_finished() - Actor.log.info(f'Finished state after adding requests: {after_add_finished}') - assert after_add_finished is False, f'after_add_finished={after_add_finished}' - - # Fetch but don't handle - should not be finished - request1 = await rq.fetch_next_request() - assert request1 is not None, f'request1={request1}' - after_fetch_finished = await rq.is_finished() - Actor.log.info(f'Finished state after fetch (not handled): {after_fetch_finished}') - assert after_fetch_finished is False, f'after_fetch_finished={after_fetch_finished}' - - # Reclaim request - should still not be finished - await rq.reclaim_request(request1) - after_reclaim_finished = await rq.is_finished() - Actor.log.info(f'Finished state after reclaim: {after_reclaim_finished}') - assert after_reclaim_finished is False, f'after_reclaim_finished={after_reclaim_finished}' - - # Handle all requests - should be finished - processed_count = 0 - while next_request := await rq.fetch_next_request(): - processed_count += 1 - await rq.mark_request_as_handled(next_request) - - Actor.log.info(f'Processed {processed_count} requests') - final_finished = await rq.is_finished() - assert final_finished is True, f'final_finished={final_finished}' - - actor = await 
make_actor(label='rq-finished-state-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' - - -async def test_operations_performance_pattern( - make_actor: MakeActorFunction, - run_actor: RunActorFunction, -) -> None: + rq = request_queue_force_cloud + Actor.log.info('Request queue opened') + + # Initially should be finished + initial_finished = await rq.is_finished() + Actor.log.info(f'Initial finished state: {initial_finished}') + assert initial_finished is True, f'initial_finished={initial_finished}' + + # Add requests - should not be finished + await rq.add_request('https://example.com/test1') + await rq.add_request('https://example.com/test2') + after_add_finished = await rq.is_finished() + Actor.log.info(f'Finished state after adding requests: {after_add_finished}') + assert after_add_finished is False, f'after_add_finished={after_add_finished}' + + # Fetch but don't handle - should not be finished + request1 = await rq.fetch_next_request() + assert request1 is not None, f'request1={request1}' + after_fetch_finished = await rq.is_finished() + Actor.log.info(f'Finished state after fetch (not handled): {after_fetch_finished}') + assert after_fetch_finished is False, f'after_fetch_finished={after_fetch_finished}' + + # Reclaim request - should still not be finished + await rq.reclaim_request(request1) + after_reclaim_finished = await rq.is_finished() + Actor.log.info(f'Finished state after reclaim: {after_reclaim_finished}') + assert after_reclaim_finished is False, f'after_reclaim_finished={after_reclaim_finished}' + + # Handle all requests - should be finished + processed_count = 0 + while next_request := await rq.fetch_next_request(): + processed_count += 1 + await rq.mark_request_as_handled(next_request) + + Actor.log.info(f'Processed {processed_count} requests') + final_finished = await rq.is_finished() + assert final_finished is True, f'final_finished={final_finished}' + + +async def 
test_operations_performance_pattern(request_queue_force_cloud: RequestQueue) -> None: """Test a common performance pattern: producer-consumer.""" - - async def main() -> None: - import asyncio - - async with Actor: - rq = await Actor.open_request_queue() - Actor.log.info('Request queue opened') - - # Producer: Add requests in background - async def producer() -> None: - for i in range(20): - await rq.add_request(f'https://example.com/item/{i}') - if i % 5 == 0: # Add some delay to simulate real production - await asyncio.sleep(0.01) - Actor.log.info('Producer finished adding all 20 requests') - - # Consumer: Process requests as they become available - async def consumer() -> int: - processed = 0 - consecutive_empty = 0 - max_empty_attempts = 5 - - while consecutive_empty < max_empty_attempts: - request = await rq.fetch_next_request() - if request is None: - consecutive_empty += 1 - await asyncio.sleep(0.01) # Brief wait for more requests - continue - - consecutive_empty = 0 - await rq.mark_request_as_handled(request) - processed += 1 - - Actor.log.info(f'Consumer finished initial processing, processed {processed} requests') - return processed - - # Run producer and consumer concurrently - producer_task = asyncio.create_task(producer()) - consumer_task = asyncio.create_task(consumer()) - - # Wait for both to complete - await producer_task - processed_count = await consumer_task - Actor.log.info(f'Concurrent phase completed, processed {processed_count} requests') - - # Process any remaining requests - remaining_count = 0 - while next_request := await rq.fetch_next_request(): - await rq.mark_request_as_handled(next_request) - processed_count += 1 - remaining_count += 1 - - Actor.log.info(f'Processed {remaining_count} remaining requests') - Actor.log.info(f'Total processed: {processed_count} requests') - assert processed_count == 20, f'processed_count={processed_count}' - - final_finished = await rq.is_finished() - assert final_finished is True, 
f'final_finished={final_finished}' - - actor = await make_actor(label='rq-performance-pattern-test', main_func=main) - run_result = await run_actor(actor) - assert run_result.status == 'SUCCEEDED' + Actor.log.info('Request queue opened') + rq = request_queue_force_cloud + + # Producer: Add requests in background + async def producer() -> None: + for i in range(20): + await rq.add_request(f'https://example.com/item/{i}') + if i % 5 == 0: # Add some delay to simulate real production + await asyncio.sleep(0.01) + Actor.log.info('Producer finished adding all 20 requests') + + # Consumer: Process requests as they become available + async def consumer() -> int: + processed = 0 + consecutive_empty = 0 + max_empty_attempts = 5 + + while consecutive_empty < max_empty_attempts: + request = await rq.fetch_next_request() + if request is None: + consecutive_empty += 1 + await asyncio.sleep(0.01) # Brief wait for more requests + continue + + consecutive_empty = 0 + await rq.mark_request_as_handled(request) + processed += 1 + + Actor.log.info(f'Consumer finished initial processing, processed {processed} requests') + return processed + + # Run producer and consumer concurrently + producer_task = asyncio.create_task(producer()) + consumer_task = asyncio.create_task(consumer()) + + # Wait for both to complete + await producer_task + processed_count = await consumer_task + Actor.log.info(f'Concurrent phase completed, processed {processed_count} requests') + + # Process any remaining requests + remaining_count = 0 + while next_request := await rq.fetch_next_request(): + await rq.mark_request_as_handled(next_request) + processed_count += 1 + remaining_count += 1 + + Actor.log.info(f'Processed {remaining_count} remaining requests') + Actor.log.info(f'Total processed: {processed_count} requests') + assert processed_count == 20, f'processed_count={processed_count}' + + final_finished = await rq.is_finished() + assert final_finished is True, f'final_finished={final_finished}' async def 
test_request_queue_enhanced_metadata( diff --git a/uv.lock b/uv.lock index 8ca12078..846d2d76 100644 --- a/uv.lock +++ b/uv.lock @@ -28,7 +28,7 @@ wheels = [ [[package]] name = "apify" -version = "2.7.4" +version = "2.8.1" source = { editable = "." } dependencies = [ { name = "apify-client" }, From 10e06524ff991624b55b47868985509738ec17f0 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 28 Aug 2025 15:51:55 +0200 Subject: [PATCH 08/26] Wip changes --- .../storage_clients/_apify/_storage_client.py | 17 ++++++++++- tests/integration/test_request_queue.py | 30 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index d43c637f..274ca46f 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -8,6 +8,7 @@ from ._dataset_client import ApifyDatasetClient from ._key_value_store_client import ApifyKeyValueStoreClient +from ._request_queue_client_full import ApifyRequestQueueClientFull from ._request_queue_client_simple import ApifyRequestQueueClientSimple from apify._utils import docs_group @@ -21,6 +22,17 @@ class ApifyStorageClient(StorageClient): """Apify storage client.""" + def __init__(self, simple_request_queue: bool = True) -> None: + """Initialize the Apify storage client. + + Args: + simple_request_queue: If True, the `create_rq_client` will always return `ApifyRequestQueueClientSimple`, + if false it will return `ApifyRequestQueueClientFull`. Simple client is suitable for single consumer + scenarios and makes less API calls. 
Full client is suitable for multiple consumers scenarios at the + cost of higher API usage + """ + self._simple_request_queue = simple_request_queue + @override async def create_dataset_client( self, @@ -74,7 +86,10 @@ async def create_rq_client( configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): - return await ApifyRequestQueueClientSimple.open(id=id, name=name, configuration=configuration) + if not self._simple_request_queue: + return await ApifyRequestQueueClientSimple.open(id=id, name=name, configuration=configuration) + else: + return await ApifyRequestQueueClientFull.open(id=id, name=name, configuration=configuration) raise TypeError( f'Expected "configuration" to be an instance of "apify.Configuration", ' diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index a0ed67ea..a4b46cec 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -4,10 +4,12 @@ from typing import TYPE_CHECKING import pytest +from apify_shared.consts import ApifyEnvVars from crawlee import Request from apify import Actor +from ._utils import generate_unique_resource_name if TYPE_CHECKING: from apify_client import ApifyClientAsync @@ -1068,3 +1070,31 @@ async def test_request_queue_not_had_multiple_clients( api_response = await api_client.get() assert api_response assert api_response['hadMultipleClients'] is False + + +async def test_cache_initialization( + apify_token: str, monkeypatch: pytest.MonkeyPatch, apify_client_async: ApifyClientAsync +) -> None: + """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" + + """Create an instance of the Apify request queue on the platform and drop it when the test is finished.""" + request_queue_name = generate_unique_resource_name('request_queue') + monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) + + async with Actor: + rq = await 
Actor.open_request_queue(name=request_queue_name, force_cloud=True) + yield rq + await rq.drop() + + + await request_queue_force_cloud.fetch_next_request() + await request_queue_force_cloud.fetch_next_request() + + # Check that it is correctly in the RequestQueueClient metadata + assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False + + # Check that it is correctly in the API + api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id) + api_response = await api_client.get() + assert api_response + assert api_response['hadMultipleClients'] is False From 359c46e32c1bc1a3ff7ed7960765a67bb486004d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 12 Sep 2025 15:33:04 +0200 Subject: [PATCH 09/26] Add init cache test, update upgrading guide --- docs/04_upgrading/upgrading_to_v3.md | 29 +++++++++ .../_apify/_request_queue_client_simple.py | 3 + .../storage_clients/_apify/_storage_client.py | 7 +- tests/integration/test_request_queue.py | 65 ++++++++++++------- 4 files changed, 77 insertions(+), 27 deletions(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index d9f179e5..4d63dbe8 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -16,3 +16,32 @@ Support for Python 3.9 has been dropped. The Apify Python SDK v3.x now requires ## Storage clients + +## The default use of optimized ApifyRequestQueueClient + +- The default client for working with Apify platform based `RequestQueue` is now optimized and simplified client which has significantly lower amount of API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster and is suitable for the majority of the use cases. +- The full client is still available, but it has to be explicitly requested via `simple_request_queue=False` argument when using the `ApifyStorageClient`. 
+ +**Before (v2.x):** + +```python +from apify import Actor + +async def main(): + async with Actor: + ... +``` + +**Now (v3.0):** + +```python +from apify import Actor +from crawlee import service_locator +from apify.storage_clients import ApifyStorageClient + +async def main(): + # Use the full-featured RequestQueue client only if you really need it. + service_locator.set_storage_client(ApifyStorageClient(simple_request_queue=False)) + async with Actor: + ... +``` diff --git a/src/apify/storage_clients/_apify/_request_queue_client_simple.py b/src/apify/storage_clients/_apify/_request_queue_client_simple.py index 28c01b1c..f7b46d8b 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client_simple.py +++ b/src/apify/storage_clients/_apify/_request_queue_client_simple.py @@ -388,6 +388,9 @@ async def _init_caches(self) -> None: This is mainly done to improve local deduplication capability. List request can return up to 10k requests, but their order is implementation detail and does not respect head order or insertion order. + + Deduplication on platform is expensive, it takes 1 API call per request and 1 write operation per request. + Local deduplication is cheaper, it takes 1 API call for whole cache and 1 read operation per request. """ response = await self._api_client.list_requests(limit=10_000) for request_data in response.get('items', []): diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 274ca46f..5d0cb4f4 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -22,7 +22,7 @@ class ApifyStorageClient(StorageClient): """Apify storage client.""" - def __init__(self, simple_request_queue: bool = True) -> None: + def __init__(self, *, simple_request_queue: bool = True) -> None: """Initialize the Apify storage client. 
Args: @@ -86,10 +86,9 @@ async def create_rq_client( configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): - if not self._simple_request_queue: + if self._simple_request_queue: return await ApifyRequestQueueClientSimple.open(id=id, name=name, configuration=configuration) - else: - return await ApifyRequestQueueClientFull.open(id=id, name=name, configuration=configuration) + return await ApifyRequestQueueClientFull.open(id=id, name=name, configuration=configuration) raise TypeError( f'Expected "configuration" to be an instance of "apify.Configuration", ' diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index a4b46cec..d4a4bd9d 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -1,15 +1,16 @@ from __future__ import annotations import asyncio +from datetime import datetime, timezone from typing import TYPE_CHECKING import pytest -from apify_shared.consts import ApifyEnvVars -from crawlee import Request +from apify_shared.consts import ApifyEnvVars +from crawlee import Request, service_locator -from apify import Actor from ._utils import generate_unique_resource_name +from apify import Actor if TYPE_CHECKING: from apify_client import ApifyClientAsync @@ -1072,29 +1073,47 @@ async def test_request_queue_not_had_multiple_clients( assert api_response['hadMultipleClients'] is False -async def test_cache_initialization( - apify_token: str, monkeypatch: pytest.MonkeyPatch, apify_client_async: ApifyClientAsync -) -> None: - """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" +async def test_cache_initialization(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> None: + """Test that Apify based `RequestQueue` initializes cache correctly to reduce unnecessary API calls.""" - """Create an instance of the Apify request queue on the platform and drop it when the test is 
finished.""" + # Create an instance of the Apify request queue on the platform and drop it when the test is finished. request_queue_name = generate_unique_resource_name('request_queue') monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) + requests = [Request.from_url(f'http://example.com/{i}', handled_at=datetime.now(timezone.utc)) for i in range(10)] + async with Actor: rq = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) - yield rq - await rq.drop() - - - await request_queue_force_cloud.fetch_next_request() - await request_queue_force_cloud.fetch_next_request() - - # Check that it is correctly in the RequestQueueClient metadata - assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False - - # Check that it is correctly in the API - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id) - api_response = await api_client.get() - assert api_response - assert api_response['hadMultipleClients'] is False + try: + await rq.add_requests(requests) + + # Check that it is correctly in the API + await asyncio.sleep(10) # Wait to be sure that metadata are updated + + # Get raw client, because stats are not exposed in `RequestQueue` class, but are available in raw client + rq_client = Actor.apify_client.request_queue(request_queue_id=rq.id) + _rq = await rq_client.get() + assert _rq + stats_before = _rq.get('stats', {}) + Actor.log.info(stats_before) + + # Clear service locator cache to simulate creating RQ instance from scratch + service_locator.storage_instance_manager.clear_cache() + + # Try to enqueue same requests again. 
It should be deduplicated from local cache created on initialization + rq = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + await rq.add_requests(requests) + + await asyncio.sleep(10) # Wait to be sure that metadata are updated + _rq = await rq_client.get() + assert _rq + stats_after = _rq.get('stats', {}) + Actor.log.info(stats_after) + + # Cache was actually initialized, readCount increased + assert (stats_after['readCount'] - stats_before['readCount']) == len(requests) + # Deduplication happened locally, writeCount should be the same + assert stats_after['writeCount'] == stats_before['writeCount'] + + finally: + await rq.drop() From b5110117388036b3dee0906d96acd8b113a22d4d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 19 Sep 2025 10:31:09 +0200 Subject: [PATCH 10/26] Finalize change and add few more tests --- docs/04_upgrading/upgrading_to_v3.md | 21 +- .../storage_clients/_apify/_storage_client.py | 6 +- tests/integration/conftest.py | 15 +- tests/integration/test_actor_request_queue.py | 11 +- tests/integration/test_request_queue.py | 189 ++++++++++++------ 5 files changed, 153 insertions(+), 89 deletions(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index 6f9096a5..46216d99 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -58,26 +58,15 @@ async def main(): - The default client for working with Apify platform based `RequestQueue` is now optimized and simplified client which has significantly lower amount of API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster and is suitable for the majority of the use cases. - The full client is still available, but it has to be explicitly requested via `simple_request_queue=False` argument when using the `ApifyStorageClient`. -**Before (v2.x):** - -```python -from apify import Actor - -async def main(): - async with Actor: - ... 
-``` - **Now (v3.0):** ```python -from apify import Actor -from crawlee import service_locator +from apify.storages import RequestQueue from apify.storage_clients import ApifyStorageClient async def main(): - # Use the full-featured RequestQueue client only if you really need it. - service_locator.set_storage_client(ApifyStorageClient(simple_request_queue=False)) - async with Actor: - ... + # Full client + rq_full = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=False)) + # Default optimized client + rq_simple = await RequestQueue.open(storage_client=ApifyStorageClient()) ``` diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index c8901444..5ebc015e 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -11,7 +11,7 @@ from ._request_queue_client_full import ApifyRequestQueueClientFull from ._request_queue_client_simple import ApifyRequestQueueClientSimple from ._utils import hash_api_base_url_and_token -from apify import Configuration as ApifyConfiguration +from apify._configuration import Configuration as ApifyConfiguration from apify._utils import docs_group if TYPE_CHECKING: @@ -45,6 +45,8 @@ def __init__(self, *, simple_request_queue: bool = True) -> None: @override def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable: if isinstance(configuration, ApifyConfiguration): + # Current design does not support opening exactly same queue with full and simple client at the same time, + # due to default and unnamed storages. Whichever client variation gets used first, wins. 
return hash_api_base_url_and_token(configuration) config_class = type(configuration) @@ -94,7 +96,7 @@ async def create_rq_client( configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): client: type[ApifyRequestQueueClient] = ( - ApifyRequestQueueClientSimple if (self._simple_request_queue) else ApifyRequestQueueClientFull + ApifyRequestQueueClientSimple if self._simple_request_queue else ApifyRequestQueueClientFull ) return await client.open(id=id, name=name, alias=alias, configuration=configuration) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 52bee53a..a1657b92 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -20,14 +20,15 @@ from ._utils import generate_unique_resource_name from apify import Actor from apify._models import ActorRun +from apify.storage_clients import ApifyStorageClient from apify.storage_clients._apify._utils import AliasResolver +from apify.storages import RequestQueue if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Coroutine, Iterator, Mapping from decimal import Decimal from apify_client.clients.resource_clients import ActorClientAsync - from crawlee.storages import RequestQueue _TOKEN_ENV_VAR = 'APIFY_TEST_USER_API_TOKEN' _API_URL_ENV_VAR = 'APIFY_INTEGRATION_TESTS_API_URL' @@ -50,6 +51,9 @@ def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callabl """ def _prepare_test_env() -> None: + # Reset the Actor class state. + apify._actor.Actor.__wrapped__.__class__._is_any_instance_initialized = False # type: ignore[attr-defined] + apify._actor.Actor.__wrapped__.__class__._is_rebooting = False # type: ignore[attr-defined] delattr(apify._actor.Actor, '__wrapped__') # Set the environment variable for the local storage directory to the temporary path. 
@@ -103,14 +107,15 @@ def apify_client_async(apify_token: str) -> ApifyClientAsync: return ApifyClientAsync(apify_token, api_url=api_url) -@pytest.fixture -async def request_queue_force_cloud(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> AsyncGenerator[RequestQueue]: +@pytest.fixture(params=[False, True]) +async def default_request_queue_apify( + apify_token: str, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest +) -> AsyncGenerator[RequestQueue]: """Create an instance of the Apify request queue on the platform and drop it when the test is finished.""" - request_queue_name = generate_unique_resource_name('request_queue') monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: - rq = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + rq = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=request.param)) yield rq await rq.drop() diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 9521234b..51763fa1 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -12,26 +12,29 @@ from ._utils import generate_unique_resource_name from apify import Actor, Request from apify._models import ActorRun +from apify.storage_clients import ApifyStorageClient +from apify.storages import RequestQueue if TYPE_CHECKING: from collections.abc import AsyncGenerator from apify_client import ApifyClientAsync - from crawlee.storages import RequestQueue from .conftest import MakeActorFunction, RunActorFunction -@pytest.fixture +@pytest.fixture(params=[False, True]) async def apify_named_rq( - apify_client_async: ApifyClientAsync, monkeypatch: pytest.MonkeyPatch + apify_client_async: ApifyClientAsync, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest ) -> AsyncGenerator[RequestQueue]: assert apify_client_async.token monkeypatch.setenv(ApifyEnvVars.TOKEN, 
apify_client_async.token) request_queue_name = generate_unique_resource_name('request_queue') async with Actor: - request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True) + request_queue = await RequestQueue.open( + name=request_queue_name, storage_client=ApifyStorageClient(simple_request_queue=request.param) + ) yield request_queue await request_queue.drop() diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index d4a4bd9d..437ab208 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -8,23 +8,26 @@ from apify_shared.consts import ApifyEnvVars from crawlee import Request, service_locator +from crawlee.crawlers import BasicCrawler from ._utils import generate_unique_resource_name from apify import Actor +from apify.storage_clients import ApifyStorageClient +from apify.storages import RequestQueue if TYPE_CHECKING: from apify_client import ApifyClientAsync - from crawlee.storages import RequestQueue + from crawlee._types import BasicCrawlingContext from .conftest import MakeActorFunction, RunActorFunction -async def test_add_and_fetch_requests(request_queue_force_cloud: RequestQueue) -> None: +async def test_add_and_fetch_requests(default_request_queue_apify: RequestQueue) -> None: """Test basic functionality of adding and fetching requests.""" desired_request_count = 100 Actor.log.info('Opening request queue...') - rq = request_queue_force_cloud + rq = default_request_queue_apify # Add some requests for i in range(desired_request_count): @@ -50,11 +53,11 @@ async def test_add_and_fetch_requests(request_queue_force_cloud: RequestQueue) - assert is_finished is True, f'is_finished={is_finished}' -async def test_add_requests_in_batches(request_queue_force_cloud: RequestQueue) -> None: +async def test_add_requests_in_batches(default_request_queue_apify: RequestQueue) -> None: """Test adding multiple requests in a single batch operation.""" 
desired_request_count = 100 - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add some requests @@ -81,11 +84,11 @@ async def test_add_requests_in_batches(request_queue_force_cloud: RequestQueue) assert is_finished is True, f'is_finished={is_finished}' -async def test_add_non_unique_requests_in_batch(request_queue_force_cloud: RequestQueue) -> None: +async def test_add_non_unique_requests_in_batch(default_request_queue_apify: RequestQueue) -> None: """Test adding requests with duplicate unique keys in batch.""" desired_request_count = 100 - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add some requests @@ -118,10 +121,10 @@ async def test_add_non_unique_requests_in_batch(request_queue_force_cloud: Reque assert is_finished is True, f'is_finished={is_finished}' -async def test_forefront_requests_ordering(request_queue_force_cloud: RequestQueue) -> None: +async def test_forefront_requests_ordering(default_request_queue_apify: RequestQueue) -> None: """Test that forefront requests are processed before regular requests.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add regular requests @@ -157,10 +160,10 @@ async def test_forefront_requests_ordering(request_queue_force_cloud: RequestQue ) -async def test_request_unique_key_behavior(request_queue_force_cloud: RequestQueue) -> None: +async def test_request_unique_key_behavior(default_request_queue_apify: RequestQueue) -> None: """Test behavior of custom unique keys.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add requests with custom unique keys @@ -201,10 +204,10 @@ async def test_request_unique_key_behavior(request_queue_force_cloud: RequestQue ) -async def test_request_reclaim_functionality(request_queue_force_cloud: RequestQueue) -> None: +async def 
test_request_reclaim_functionality(default_request_queue_apify: RequestQueue) -> None: """Test request reclaiming for failed processing.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add a test request @@ -239,10 +242,10 @@ async def test_request_reclaim_functionality(request_queue_force_cloud: RequestQ assert is_finished is True, f'is_finished={is_finished}' -async def test_request_reclaim_with_forefront(request_queue_force_cloud: RequestQueue) -> None: +async def test_request_reclaim_with_forefront(default_request_queue_apify: RequestQueue) -> None: """Test reclaiming requests to the front of the queue.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add multiple requests @@ -280,10 +283,10 @@ async def test_request_reclaim_with_forefront(request_queue_force_cloud: Request Actor.log.info(f'Test completed - processed {remaining_count} additional requests') -async def test_complex_request_objects(request_queue_force_cloud: RequestQueue) -> None: +async def test_complex_request_objects(default_request_queue_apify: RequestQueue) -> None: """Test handling complex Request objects with various properties.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Create request with various properties @@ -323,10 +326,10 @@ async def test_complex_request_objects(request_queue_force_cloud: RequestQueue) Actor.log.info('Complex request test completed') -async def test_get_request_by_unique_key(request_queue_force_cloud: RequestQueue) -> None: +async def test_get_request_by_unique_key(default_request_queue_apify: RequestQueue) -> None: """Test retrieving specific requests by their unique_key.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add a request and get its unique_key @@ -347,10 +350,10 @@ async def 
test_get_request_by_unique_key(request_queue_force_cloud: RequestQueue Actor.log.info('Non-existent unique_key correctly returned None') -async def test_metadata_tracking(request_queue_force_cloud: RequestQueue) -> None: +async def test_metadata_tracking(default_request_queue_apify: RequestQueue) -> None: """Test request queue metadata and counts.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Check initial state @@ -387,10 +390,10 @@ async def test_metadata_tracking(request_queue_force_cloud: RequestQueue) -> Non assert final_handled == 3, f'final_handled={final_handled}' -async def test_batch_operations_performance(request_queue_force_cloud: RequestQueue) -> None: +async def test_batch_operations_performance(default_request_queue_apify: RequestQueue) -> None: """Test batch operations vs individual operations.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Test batch add vs individual adds @@ -423,10 +426,10 @@ async def test_batch_operations_performance(request_queue_force_cloud: RequestQu assert is_finished is True, f'is_finished={is_finished}' -async def test_state_consistency(request_queue_force_cloud: RequestQueue) -> None: +async def test_state_consistency(default_request_queue_apify: RequestQueue) -> None: """Test queue state consistency during concurrent operations.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add initial requests @@ -478,10 +481,10 @@ async def test_state_consistency(request_queue_force_cloud: RequestQueue) -> Non assert is_finished is True, f'is_finished={is_finished}' -async def test_empty_rq_behavior(request_queue_force_cloud: RequestQueue) -> None: +async def test_empty_rq_behavior(default_request_queue_apify: RequestQueue) -> None: """Test behavior with empty queues.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify 
Actor.log.info('Request queue opened') # Test empty queue operations @@ -509,10 +512,10 @@ async def test_empty_rq_behavior(request_queue_force_cloud: RequestQueue) -> Non assert metadata.pending_request_count == 0, f'metadata.pending_request_count={metadata.pending_request_count}' -async def test_large_batch_operations(request_queue_force_cloud: RequestQueue) -> None: +async def test_large_batch_operations(default_request_queue_apify: RequestQueue) -> None: """Test handling large batches of requests.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Create a large batch of requests @@ -546,10 +549,10 @@ async def test_large_batch_operations(request_queue_force_cloud: RequestQueue) - assert is_finished is True, f'is_finished={is_finished}' -async def test_mixed_string_and_request_objects(request_queue_force_cloud: RequestQueue) -> None: +async def test_mixed_string_and_request_objects(default_request_queue_apify: RequestQueue) -> None: """Test adding both string URLs and Request objects.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add string URLs @@ -687,11 +690,11 @@ async def worker() -> int: assert run_result.status == 'SUCCEEDED' -async def test_persistence_across_operations(request_queue_force_cloud: RequestQueue) -> None: +async def test_persistence_across_operations(default_request_queue_apify: RequestQueue) -> None: """Test that queue state persists across different operations.""" # Open queue and add some requests - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add initial batch @@ -749,9 +752,9 @@ async def test_persistence_across_operations(request_queue_force_cloud: RequestQ assert final_handled == 15, f'final_handled={final_handled}' -async def test_request_deduplication_edge_cases(request_queue_force_cloud: RequestQueue) -> None: +async def 
test_request_deduplication_edge_cases(default_request_queue_apify: RequestQueue) -> None: """Test edge cases in request deduplication.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Test URL normalization and deduplication with expected results @@ -799,10 +802,10 @@ async def test_request_deduplication_edge_cases(request_queue_force_cloud: Reque ) -async def test_request_ordering_with_mixed_operations(request_queue_force_cloud: RequestQueue) -> None: +async def test_request_ordering_with_mixed_operations(default_request_queue_apify: RequestQueue) -> None: """Test request ordering with mixed add/reclaim operations.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Add initial requests @@ -894,10 +897,10 @@ async def main() -> None: assert run_result.status == 'SUCCEEDED' -async def test_finished_state_accuracy(request_queue_force_cloud: RequestQueue) -> None: +async def test_finished_state_accuracy(default_request_queue_apify: RequestQueue) -> None: """Test accuracy of is_finished() method in various scenarios.""" - rq = request_queue_force_cloud + rq = default_request_queue_apify Actor.log.info('Request queue opened') # Initially should be finished @@ -936,10 +939,10 @@ async def test_finished_state_accuracy(request_queue_force_cloud: RequestQueue) assert final_finished is True, f'final_finished={final_finished}' -async def test_operations_performance_pattern(request_queue_force_cloud: RequestQueue) -> None: +async def test_operations_performance_pattern(default_request_queue_apify: RequestQueue) -> None: """Test a common performance pattern: producer-consumer.""" Actor.log.info('Request queue opened') - rq = request_queue_force_cloud + rq = default_request_queue_apify # Producer: Add requests in background async def producer() -> None: @@ -994,59 +997,61 @@ async def consumer() -> int: async def test_request_queue_enhanced_metadata( - 
request_queue_force_cloud: RequestQueue, + default_request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: """Test metadata tracking. Multiple clients scenarios are not guaranteed to give correct results without delay. But at least multiple clients, single producer, should be reliable on the producer side.""" - + rq = default_request_queue_apify for i in range(1, 10): - await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) + await rq.add_request(Request.from_url(f'http://example.com/{i}')) # Reliable information as the API response is enhanced with local metadata estimation. - assert (await request_queue_force_cloud.get_metadata()).total_request_count == i + assert (await rq.get_metadata()).total_request_count == i # Accessed with client created explicitly with `client_key=None` should appear as distinct client - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + api_client = apify_client_async.request_queue(request_queue_id=rq.id, client_key=None) await api_client.list_head() # The presence of another non-producing client should not affect the metadata for i in range(10, 20): - await request_queue_force_cloud.add_request(Request.from_url(f'http://example.com/{i}')) + await rq.add_request(Request.from_url(f'http://example.com/{i}')) # Reliable information as the API response is enhanced with local metadata estimation. - assert (await request_queue_force_cloud.get_metadata()).total_request_count == i + assert (await rq.get_metadata()).total_request_count == i async def test_request_queue_metadata_another_client( - request_queue_force_cloud: RequestQueue, + default_request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: """Test metadata tracking. 
The delayed metadata should be reliable even when changed by another client.""" - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + rq = default_request_queue_apify + api_client = apify_client_async.request_queue(request_queue_id=rq.id, client_key=None) await api_client.add_request(Request.from_url('http://example.com/1').model_dump(by_alias=True, exclude={'id'})) # Wait to be sure that the API has updated the global metadata await asyncio.sleep(10) - assert (await request_queue_force_cloud.get_metadata()).total_request_count == 1 + assert (await rq.get_metadata()).total_request_count == 1 async def test_request_queue_had_multiple_clients( - request_queue_force_cloud: RequestQueue, + default_request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: """Test that `RequestQueue` correctly detects multiple clients. Clients created with different `client_key` should appear as distinct clients.""" - await request_queue_force_cloud.fetch_next_request() + rq = default_request_queue_apify + await rq.fetch_next_request() # Accessed with client created explicitly with `client_key=None` should appear as distinct client - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id, client_key=None) + api_client = apify_client_async.request_queue(request_queue_id=default_request_queue_apify.id, client_key=None) await api_client.list_head() # Check that it is correctly in the RequestQueueClient metadata - assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is True + assert (await rq.get_metadata()).had_multiple_clients is True # Check that it is correctly in the API api_response = await api_client.get() @@ -1055,26 +1060,85 @@ async def test_request_queue_had_multiple_clients( async def test_request_queue_not_had_multiple_clients( - request_queue_force_cloud: RequestQueue, apify_client_async: ApifyClientAsync + 
default_request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync ) -> None: """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" - + rq = default_request_queue_apify # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True - await request_queue_force_cloud.fetch_next_request() - await request_queue_force_cloud.fetch_next_request() + await rq.fetch_next_request() + await rq.fetch_next_request() # Check that it is correctly in the RequestQueueClient metadata - assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False + assert (await rq.get_metadata()).had_multiple_clients is False # Check that it is correctly in the API - api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id) + api_client = apify_client_async.request_queue(request_queue_id=rq.id) api_response = await api_client.get() assert api_response assert api_response['hadMultipleClients'] is False +async def test_request_queue_simple_and_full_at_the_same_time( + apify_token: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Test using two variants of the ApifyStorageClient on the same queue resolves to the first client used.""" + monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) + + async with Actor: + rq_simple = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=True)) + rq_full = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=False)) + # Opening same queue again with different ApifyStorageClient will resolve to the first client used. 
+        assert rq_simple is rq_full
+        await rq_simple.drop()
+
+
+@pytest.mark.parametrize(
+    ('simple_request_queue', 'expected_write_count_per_request'),
+    [pytest.param(True, 2, id='Simple rq client'), pytest.param(False, 3, id='Full rq client')],
+)
+async def test_crawler_run_request_queue_variant_stats(
+    *,
+    apify_token: str,
+    monkeypatch: pytest.MonkeyPatch,
+    simple_request_queue: bool,
+    expected_write_count_per_request: int,
+) -> None:
+    """Check the main difference in the simple vs full request queue client - writeCount per request.
+
+    The simple client also has lower readCount, but the costs of read are order of magnitude cheaper than writes, so we
+    do not test that.
+    """
+    monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token)
+    async with Actor:
+        requests = 5
+        rq = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=simple_request_queue))
+        crawler = BasicCrawler(request_manager=rq)
+
+        @crawler.router.default_handler
+        async def default_handler(context: BasicCrawlingContext) -> None:
+            context.log.info(f'Processing {context.request.url} ...')
+
+        await crawler.run([Request.from_url(f'https://example.com/{i}') for i in range(requests)])
+
+        # Make sure all requests were handled.
+ assert crawler.statistics.state.requests_finished == requests + + # Check the request queue stats + await asyncio.sleep(10) # Wait to be sure that metadata are updated + + # Get raw client, because stats are not exposed in `RequestQueue` class, but are available in raw client + # https://github.com/apify/apify-sdk-python/pull/574 + rq_client = Actor.apify_client.request_queue(request_queue_id=rq.id) + _rq = await rq_client.get() + assert _rq + request_queue_stats = _rq.get('stats', {}) + Actor.log.info(f'{request_queue_stats=}') + assert request_queue_stats['writeCount'] == requests * expected_write_count_per_request + await rq.drop() + + async def test_cache_initialization(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> None: - """Test that Apify based `RequestQueue` initializes cache correctly to reduce unnecessary API calls.""" + """Test that Apify based simple `RequestQueue` initializes cache correctly to reduce unnecessary API calls.""" # Create an instance of the Apify request queue on the platform and drop it when the test is finished. 
request_queue_name = generate_unique_resource_name('request_queue') @@ -1091,6 +1155,7 @@ async def test_cache_initialization(apify_token: str, monkeypatch: pytest.Monkey await asyncio.sleep(10) # Wait to be sure that metadata are updated # Get raw client, because stats are not exposed in `RequestQueue` class, but are available in raw client + # https://github.com/apify/apify-sdk-python/pull/574 rq_client = Actor.apify_client.request_queue(request_queue_id=rq.id) _rq = await rq_client.get() assert _rq From 7ec13efc72555dab0962148134c281e576ae430d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 19 Sep 2025 11:10:35 +0200 Subject: [PATCH 11/26] Remove unnecesary methods from the specialized client --- .../_apify/_request_queue_client_full.py | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client_full.py b/src/apify/storage_clients/_apify/_request_queue_client_full.py index 6e8a57de..400a4f1c 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client_full.py +++ b/src/apify/storage_clients/_apify/_request_queue_client_full.py @@ -77,41 +77,6 @@ async def _get_metadata_estimate(self) -> RequestQueueMetadata: # Get local estimation (will not include changes done bo another client) return self._metadata - @override - async def get_metadata(self) -> RequestQueueMetadata: - """Get metadata about the request queue. - - Returns: - Metadata from the API, merged with local estimation, because in some cases, the data from the API can - be delayed. - """ - response = await self._api_client.get() - if response is None: - raise ValueError('Failed to fetch request queue metadata from the API.') - # Enhance API response by local estimations (API can be delayed few seconds, while local estimation not.) 
- return RequestQueueMetadata( - id=response['id'], - name=response['name'], - total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count), - handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count), - pending_request_count=response['pendingRequestCount'], - created_at=min(response['createdAt'], self._metadata.created_at), - modified_at=max(response['modifiedAt'], self._metadata.modified_at), - accessed_at=max(response['accessedAt'], self._metadata.accessed_at), - had_multiple_clients=response['hadMultipleClients'] or self._metadata.had_multiple_clients, - ) - - @override - async def purge(self) -> None: - raise NotImplementedError( - 'Purging the request queue is not supported in the Apify platform. ' - 'Use the `drop` method to delete the request queue instead.' - ) - - @override - async def drop(self) -> None: - await self._api_client.delete() - @override async def add_batch_of_requests( self, From 77124104e764ee6f8c6d85508f62c5e65cb0e1e6 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 19 Sep 2025 16:42:59 +0200 Subject: [PATCH 12/26] Rename default_request_queue_apify --- tests/integration/conftest.py | 2 +- tests/integration/test_request_queue.py | 98 ++++++++++++------------- 2 files changed, 50 insertions(+), 50 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index a1657b92..3529f7ff 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -108,7 +108,7 @@ def apify_client_async(apify_token: str) -> ApifyClientAsync: @pytest.fixture(params=[False, True]) -async def default_request_queue_apify( +async def request_queue_apify( apify_token: str, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest ) -> AsyncGenerator[RequestQueue]: """Create an instance of the Apify request queue on the platform and drop it when the test is finished.""" diff --git a/tests/integration/test_request_queue.py 
b/tests/integration/test_request_queue.py index 8c265a40..66577655 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -23,12 +23,12 @@ from apify.storage_clients._apify._models import ApifyRequestQueueMetadata -async def test_add_and_fetch_requests(default_request_queue_apify: RequestQueue) -> None: +async def test_add_and_fetch_requests(request_queue_apify: RequestQueue) -> None: """Test basic functionality of adding and fetching requests.""" desired_request_count = 100 Actor.log.info('Opening request queue...') - rq = default_request_queue_apify + rq = request_queue_apify # Add some requests for i in range(desired_request_count): @@ -54,11 +54,11 @@ async def test_add_and_fetch_requests(default_request_queue_apify: RequestQueue) assert is_finished is True, f'is_finished={is_finished}' -async def test_add_requests_in_batches(default_request_queue_apify: RequestQueue) -> None: +async def test_add_requests_in_batches(request_queue_apify: RequestQueue) -> None: """Test adding multiple requests in a single batch operation.""" desired_request_count = 100 - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add some requests @@ -85,11 +85,11 @@ async def test_add_requests_in_batches(default_request_queue_apify: RequestQueue assert is_finished is True, f'is_finished={is_finished}' -async def test_add_non_unique_requests_in_batch(default_request_queue_apify: RequestQueue) -> None: +async def test_add_non_unique_requests_in_batch(request_queue_apify: RequestQueue) -> None: """Test adding requests with duplicate unique keys in batch.""" desired_request_count = 100 - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add some requests @@ -122,10 +122,10 @@ async def test_add_non_unique_requests_in_batch(default_request_queue_apify: Req assert is_finished is True, f'is_finished={is_finished}' -async def 
test_forefront_requests_ordering(default_request_queue_apify: RequestQueue) -> None: +async def test_forefront_requests_ordering(request_queue_apify: RequestQueue) -> None: """Test that forefront requests are processed before regular requests.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add regular requests @@ -161,10 +161,10 @@ async def test_forefront_requests_ordering(default_request_queue_apify: RequestQ ) -async def test_request_unique_key_behavior(default_request_queue_apify: RequestQueue) -> None: +async def test_request_unique_key_behavior(request_queue_apify: RequestQueue) -> None: """Test behavior of custom unique keys.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add requests with custom unique keys @@ -205,10 +205,10 @@ async def test_request_unique_key_behavior(default_request_queue_apify: RequestQ ) -async def test_request_reclaim_functionality(default_request_queue_apify: RequestQueue) -> None: +async def test_request_reclaim_functionality(request_queue_apify: RequestQueue) -> None: """Test request reclaiming for failed processing.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add a test request @@ -243,10 +243,10 @@ async def test_request_reclaim_functionality(default_request_queue_apify: Reques assert is_finished is True, f'is_finished={is_finished}' -async def test_request_reclaim_with_forefront(default_request_queue_apify: RequestQueue) -> None: +async def test_request_reclaim_with_forefront(request_queue_apify: RequestQueue) -> None: """Test reclaiming requests to the front of the queue.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add multiple requests @@ -284,10 +284,10 @@ async def test_request_reclaim_with_forefront(default_request_queue_apify: Reque Actor.log.info(f'Test completed - processed 
{remaining_count} additional requests') -async def test_complex_request_objects(default_request_queue_apify: RequestQueue) -> None: +async def test_complex_request_objects(request_queue_apify: RequestQueue) -> None: """Test handling complex Request objects with various properties.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Create request with various properties @@ -327,10 +327,10 @@ async def test_complex_request_objects(default_request_queue_apify: RequestQueue Actor.log.info('Complex request test completed') -async def test_get_request_by_unique_key(default_request_queue_apify: RequestQueue) -> None: +async def test_get_request_by_unique_key(request_queue_apify: RequestQueue) -> None: """Test retrieving specific requests by their unique_key.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add a request and get its unique_key @@ -351,10 +351,10 @@ async def test_get_request_by_unique_key(default_request_queue_apify: RequestQue Actor.log.info('Non-existent unique_key correctly returned None') -async def test_metadata_tracking(default_request_queue_apify: RequestQueue) -> None: +async def test_metadata_tracking(request_queue_apify: RequestQueue) -> None: """Test request queue metadata and counts.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Check initial state @@ -391,10 +391,10 @@ async def test_metadata_tracking(default_request_queue_apify: RequestQueue) -> N assert final_handled == 3, f'final_handled={final_handled}' -async def test_batch_operations_performance(default_request_queue_apify: RequestQueue) -> None: +async def test_batch_operations_performance(request_queue_apify: RequestQueue) -> None: """Test batch operations vs individual operations.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Test batch add vs individual 
adds @@ -427,10 +427,10 @@ async def test_batch_operations_performance(default_request_queue_apify: Request assert is_finished is True, f'is_finished={is_finished}' -async def test_state_consistency(default_request_queue_apify: RequestQueue) -> None: +async def test_state_consistency(request_queue_apify: RequestQueue) -> None: """Test queue state consistency during concurrent operations.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add initial requests @@ -482,10 +482,10 @@ async def test_state_consistency(default_request_queue_apify: RequestQueue) -> N assert is_finished is True, f'is_finished={is_finished}' -async def test_empty_rq_behavior(default_request_queue_apify: RequestQueue) -> None: +async def test_empty_rq_behavior(request_queue_apify: RequestQueue) -> None: """Test behavior with empty queues.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Test empty queue operations @@ -513,10 +513,10 @@ async def test_empty_rq_behavior(default_request_queue_apify: RequestQueue) -> N assert metadata.pending_request_count == 0, f'metadata.pending_request_count={metadata.pending_request_count}' -async def test_large_batch_operations(default_request_queue_apify: RequestQueue) -> None: +async def test_large_batch_operations(request_queue_apify: RequestQueue) -> None: """Test handling large batches of requests.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Create a large batch of requests @@ -550,10 +550,10 @@ async def test_large_batch_operations(default_request_queue_apify: RequestQueue) assert is_finished is True, f'is_finished={is_finished}' -async def test_mixed_string_and_request_objects(default_request_queue_apify: RequestQueue) -> None: +async def test_mixed_string_and_request_objects(request_queue_apify: RequestQueue) -> None: """Test adding both string URLs and Request objects.""" - rq = 
default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add string URLs @@ -691,11 +691,11 @@ async def worker() -> int: assert run_result.status == 'SUCCEEDED' -async def test_persistence_across_operations(default_request_queue_apify: RequestQueue) -> None: +async def test_persistence_across_operations(request_queue_apify: RequestQueue) -> None: """Test that queue state persists across different operations.""" # Open queue and add some requests - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add initial batch @@ -753,9 +753,9 @@ async def test_persistence_across_operations(default_request_queue_apify: Reques assert final_handled == 15, f'final_handled={final_handled}' -async def test_request_deduplication_edge_cases(default_request_queue_apify: RequestQueue) -> None: +async def test_request_deduplication_edge_cases(request_queue_apify: RequestQueue) -> None: """Test edge cases in request deduplication.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Test URL normalization and deduplication with expected results @@ -803,10 +803,10 @@ async def test_request_deduplication_edge_cases(default_request_queue_apify: Req ) -async def test_request_ordering_with_mixed_operations(default_request_queue_apify: RequestQueue) -> None: +async def test_request_ordering_with_mixed_operations(request_queue_apify: RequestQueue) -> None: """Test request ordering with mixed add/reclaim operations.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Add initial requests @@ -898,10 +898,10 @@ async def main() -> None: assert run_result.status == 'SUCCEEDED' -async def test_finished_state_accuracy(default_request_queue_apify: RequestQueue) -> None: +async def test_finished_state_accuracy(request_queue_apify: RequestQueue) -> None: """Test accuracy of is_finished() method in various 
scenarios.""" - rq = default_request_queue_apify + rq = request_queue_apify Actor.log.info('Request queue opened') # Initially should be finished @@ -940,10 +940,10 @@ async def test_finished_state_accuracy(default_request_queue_apify: RequestQueue assert final_finished is True, f'final_finished={final_finished}' -async def test_operations_performance_pattern(default_request_queue_apify: RequestQueue) -> None: +async def test_operations_performance_pattern(request_queue_apify: RequestQueue) -> None: """Test a common performance pattern: producer-consumer.""" Actor.log.info('Request queue opened') - rq = default_request_queue_apify + rq = request_queue_apify # Producer: Add requests in background async def producer() -> None: @@ -998,14 +998,14 @@ async def consumer() -> int: async def test_request_queue_enhanced_metadata( - default_request_queue_apify: RequestQueue, + request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: """Test metadata tracking. Multiple clients scenarios are not guaranteed to give correct results without delay. But at least multiple clients, single producer, should be reliable on the producer side.""" - rq = default_request_queue_apify + rq = request_queue_apify for i in range(1, 10): await rq.add_request(Request.from_url(f'http://example.com/{i}')) # Reliable information as the API response is enhanced with local metadata estimation. @@ -1023,11 +1023,11 @@ async def test_request_queue_enhanced_metadata( async def test_request_queue_metadata_another_client( - default_request_queue_apify: RequestQueue, + request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: """Test metadata tracking. 
The delayed metadata should be reliable even when changed by another client.""" - rq = default_request_queue_apify + rq = request_queue_apify api_client = apify_client_async.request_queue(request_queue_id=rq.id, client_key=None) await api_client.add_request(Request.from_url('http://example.com/1').model_dump(by_alias=True, exclude={'id'})) @@ -1038,17 +1038,17 @@ async def test_request_queue_metadata_another_client( async def test_request_queue_had_multiple_clients( - default_request_queue_apify: RequestQueue, + request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync, ) -> None: """Test that `RequestQueue` correctly detects multiple clients. Clients created with different `client_key` should appear as distinct clients.""" - rq = default_request_queue_apify + rq = request_queue_apify await rq.fetch_next_request() # Accessed with client created explicitly with `client_key=None` should appear as distinct client - api_client = apify_client_async.request_queue(request_queue_id=default_request_queue_apify.id, client_key=None) + api_client = apify_client_async.request_queue(request_queue_id=request_queue_apify.id, client_key=None) await api_client.list_head() # Check that it is correctly in the RequestQueueClient metadata @@ -1061,10 +1061,10 @@ async def test_request_queue_had_multiple_clients( async def test_request_queue_not_had_multiple_clients( - default_request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync + request_queue_apify: RequestQueue, apify_client_async: ApifyClientAsync ) -> None: """Test that same `RequestQueue` created from Actor does not act as multiple clients.""" - rq = default_request_queue_apify + rq = request_queue_apify # Two calls to API to create situation where different `client_key` can set `had_multiple_clients` to True await rq.fetch_next_request() await rq.fetch_next_request() From e63f546aa8d6f380c2d6a2c1beabde6fe4cb8bf3 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 19 Sep 2025 16:55:40 +0200 
Subject: [PATCH 13/26] Use single and shared literals and rename the RQ client classes --- ...ull.py => _request_queue_shared_client.py} | 2 +- ...ple.py => _request_queue_single_client.py} | 2 +- .../storage_clients/_apify/_storage_client.py | 20 +++++++++---------- tests/integration/conftest.py | 4 ++-- tests/integration/test_actor_request_queue.py | 4 ++-- tests/integration/test_request_queue.py | 14 ++++++------- 6 files changed, 23 insertions(+), 23 deletions(-) rename src/apify/storage_clients/_apify/{_request_queue_client_full.py => _request_queue_shared_client.py} (99%) rename src/apify/storage_clients/_apify/{_request_queue_client_simple.py => _request_queue_single_client.py} (99%) diff --git a/src/apify/storage_clients/_apify/_request_queue_client_full.py b/src/apify/storage_clients/_apify/_request_queue_shared_client.py similarity index 99% rename from src/apify/storage_clients/_apify/_request_queue_client_full.py rename to src/apify/storage_clients/_apify/_request_queue_shared_client.py index 400a4f1c..56c14c84 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client_full.py +++ b/src/apify/storage_clients/_apify/_request_queue_shared_client.py @@ -25,7 +25,7 @@ logger = getLogger(__name__) -class ApifyRequestQueueClientFull(ApifyRequestQueueClient): +class ApifyRequestQueueSharedClient(ApifyRequestQueueClient): """An Apify platform implementation of the request queue client. This implementation supports multiple producers and multiple consumers scenario. 
diff --git a/src/apify/storage_clients/_apify/_request_queue_client_simple.py b/src/apify/storage_clients/_apify/_request_queue_single_client.py similarity index 99% rename from src/apify/storage_clients/_apify/_request_queue_client_simple.py rename to src/apify/storage_clients/_apify/_request_queue_single_client.py index f7b46d8b..53446a5e 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client_simple.py +++ b/src/apify/storage_clients/_apify/_request_queue_single_client.py @@ -23,7 +23,7 @@ logger = getLogger(__name__) -class ApifyRequestQueueClientSimple(ApifyRequestQueueClient): +class ApifyRequestQueueSingleClient(ApifyRequestQueueClient): """An Apify platform implementation of the request queue client with limited capability. This client is designed to use as little resources as possible, but has to be used in constrained context. diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 5ebc015e..a63123df 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from typing_extensions import override @@ -8,8 +8,8 @@ from ._dataset_client import ApifyDatasetClient from ._key_value_store_client import ApifyKeyValueStoreClient -from ._request_queue_client_full import ApifyRequestQueueClientFull -from ._request_queue_client_simple import ApifyRequestQueueClientSimple +from ._request_queue_shared_client import ApifyRequestQueueSharedClient +from ._request_queue_single_client import ApifyRequestQueueSingleClient from ._utils import hash_api_base_url_and_token from apify._configuration import Configuration as ApifyConfiguration from apify._utils import docs_group @@ -26,16 +26,16 @@ class ApifyStorageClient(StorageClient): """Apify storage client.""" - def __init__(self, *, simple_request_queue: 
bool = True) -> None: + def __init__(self, *, access: Literal['single', 'shared'] = 'single') -> None: """Initialize the Apify storage client. Args: - simple_request_queue: If True, the `create_rq_client` will always return `ApifyRequestQueueClientSimple`, - if false it will return `ApifyRequestQueueClientFull`. Simple client is suitable for single consumer - scenarios and makes less API calls. Full client is suitable for multiple consumers scenarios at the - cost of higher API usage + access: If 'single', the `create_rq_client` will return `ApifyRequestQueueSingleClient`, if 'shared' it + will return `ApifyRequestQueueSharedClient`. + - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster. + - 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage. """ - self._simple_request_queue = simple_request_queue + self._access = access # This class breaches Liskov Substitution Principle. It requires specialized Configuration compared to its parent. 
_lsp_violation_error_message_template = ( @@ -96,7 +96,7 @@ async def create_rq_client( configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): client: type[ApifyRequestQueueClient] = ( - ApifyRequestQueueClientSimple if self._simple_request_queue else ApifyRequestQueueClientFull + ApifyRequestQueueSingleClient if self._access == 'single' else ApifyRequestQueueSharedClient ) return await client.open(id=id, name=name, alias=alias, configuration=configuration) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 3529f7ff..29bd2351 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -107,7 +107,7 @@ def apify_client_async(apify_token: str) -> ApifyClientAsync: return ApifyClientAsync(apify_token, api_url=api_url) -@pytest.fixture(params=[False, True]) +@pytest.fixture(params=[['single', 'shared']]) async def request_queue_apify( apify_token: str, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest ) -> AsyncGenerator[RequestQueue]: @@ -115,7 +115,7 @@ async def request_queue_apify( monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: - rq = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=request.param)) + rq = await RequestQueue.open(storage_client=ApifyStorageClient(access=request.param)) yield rq await rq.drop() diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 51763fa1..1f7d4fa5 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -23,7 +23,7 @@ from .conftest import MakeActorFunction, RunActorFunction -@pytest.fixture(params=[False, True]) +@pytest.fixture(params=['single', 'shared']) async def apify_named_rq( apify_client_async: ApifyClientAsync, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest ) -> AsyncGenerator[RequestQueue]: @@ -33,7 
+33,7 @@ async def apify_named_rq( async with Actor: request_queue = await RequestQueue.open( - name=request_queue_name, storage_client=ApifyStorageClient(simple_request_queue=request.param) + name=request_queue_name, storage_client=ApifyStorageClient(access=request.param) ) yield request_queue await request_queue.drop() diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index 66577655..f9efd880 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -2,7 +2,7 @@ import asyncio from datetime import datetime, timezone -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, cast import pytest @@ -1086,22 +1086,22 @@ async def test_request_queue_simple_and_full_at_the_same_time( monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: - rq_simple = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=True)) - rq_full = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=False)) + rq_simple = await RequestQueue.open(storage_client=ApifyStorageClient(access='single')) + rq_full = await RequestQueue.open(storage_client=ApifyStorageClient(access='shared')) # Opening same queue again with different ApifyStorageClient will resolve to the first client used. 
assert rq_simple is rq_full await rq_simple.drop() @pytest.mark.parametrize( - ('simple_request_queue', 'expected_write_count_per_request'), - [pytest.param(True, 2, id='Simple rq client'), pytest.param(False, 3, id='Full rq client')], + ('access', 'expected_write_count_per_request'), + [pytest.param('single', 2, id='Simple rq client'), pytest.param('shared', 3, id='Full rq client')], ) async def test_crawler_run_request_queue_variant_stats( *, apify_token: str, monkeypatch: pytest.MonkeyPatch, - simple_request_queue: bool, + access: Literal['single', 'shared'], expected_write_count_per_request: int, ) -> None: """Check the main difference in the simple vs full request queue client - writeCount per request. @@ -1112,7 +1112,7 @@ async def test_crawler_run_request_queue_variant_stats( monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: requests = 5 - rq = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=simple_request_queue)) + rq = await RequestQueue.open(storage_client=ApifyStorageClient(access=access)) crawler = BasicCrawler(request_manager=rq) @crawler.router.default_handler From e5bdff276e496f6523eb14138b4e290c6267b66e Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 22 Sep 2025 10:10:13 +0200 Subject: [PATCH 14/26] Update tests --- tests/integration/conftest.py | 2 +- tests/integration/test_request_queue.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 29bd2351..4da5c4a2 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -107,7 +107,7 @@ def apify_client_async(apify_token: str) -> ApifyClientAsync: return ApifyClientAsync(apify_token, api_url=api_url) -@pytest.fixture(params=[['single', 'shared']]) +@pytest.fixture(params=['single', 'shared']) async def request_queue_apify( apify_token: str, monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest ) -> 
AsyncGenerator[RequestQueue]: diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index f9efd880..36db5ac9 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -1174,24 +1174,18 @@ async def test_cache_initialization(apify_token: str, monkeypatch: pytest.Monkey await rq.drop() -async def test_request_queue_has_stats(request_queue_force_cloud: RequestQueue) -> None: +async def test_request_queue_has_stats(request_queue_apify: RequestQueue) -> None: """Test that Apify based request queue has stats in metadata.""" - + rq = request_queue_apify add_request_count = 3 - read_request_count = 2 - await request_queue_force_cloud.add_requests( - [Request.from_url(f'http://example.com/{i}') for i in range(add_request_count)] - ) - for _ in range(read_request_count): - await request_queue_force_cloud.get_request(Request.from_url('http://example.com/1').unique_key) + await rq.add_requests([Request.from_url(f'http://example.com/{i}') for i in range(add_request_count)]) # Wait for stats to become stable await asyncio.sleep(10) - metadata = await request_queue_force_cloud.get_metadata() + metadata = await rq.get_metadata() assert hasattr(metadata, 'stats') apify_metadata = cast('ApifyRequestQueueMetadata', metadata) - assert apify_metadata.stats.read_count == read_request_count assert apify_metadata.stats.write_count == add_request_count From 79c02f5f729f78be7315f222ce1df5b488d091aa Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 22 Sep 2025 14:17:51 +0200 Subject: [PATCH 15/26] Update upgrading guide --- docs/04_upgrading/upgrading_to_v3.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index 46216d99..7d5de045 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -55,8 +55,8 @@ async def main(): ## The default use of optimized 
ApifyRequestQueueClient -- The default client for working with Apify platform based `RequestQueue` is now optimized and simplified client which has significantly lower amount of API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster and is suitable for the majority of the use cases. -- The full client is still available, but it has to be explicitly requested via `simple_request_queue=False` argument when using the `ApifyStorageClient`. +- The default client for working with Apify platform based `RequestQueue` is now optimized and simplified client which does significantly lower amount of API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster and is suitable for the majority of the use cases. +- The full client is still available, but it has to be explicitly requested via `access="shared"` argument when using the `ApifyStorageClient`. **Now (v3.0):** @@ -65,8 +65,8 @@ from apify.storages import RequestQueue from apify.storage_clients import ApifyStorageClient async def main(): - # Full client - rq_full = await RequestQueue.open(storage_client=ApifyStorageClient(simple_request_queue=False)) - # Default optimized client - rq_simple = await RequestQueue.open(storage_client=ApifyStorageClient()) + # Full client that supports multiple consumers of the Apify Request Queue + rq_shared = await RequestQueue.open(storage_client=ApifyStorageClient(access="shared")) + # Default optimized client that expects only single consumer of the Apify Request Queue + rq_single = await RequestQueue.open(storage_client=ApifyStorageClient()) ``` From d29a534777b5854bf6d281cba58076d5ea1e831d Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 24 Sep 2025 07:50:58 +0200 Subject: [PATCH 16/26] Extract storage related complexity from Actor to dedicated storage client --- docs/04_upgrading/upgrading_to_v3.md | 31 ++- pyproject.toml | 2 +- src/apify/_actor.py | 104 +++----- 
src/apify/storage_clients/__init__.py | 2 + .../storage_clients/_hybrid_apify/__init__.py | 1 + .../_hybrid_apify/_storage_client.py | 226 ++++++++++++++++++ .../actor_source_base/requirements.txt | 2 +- tests/integration/test_apify_storages.py | 93 ++++++- tests/unit/actor/test_configuration.py | 6 +- uv.lock | 10 +- 10 files changed, 389 insertions(+), 88 deletions(-) create mode 100644 src/apify/storage_clients/_hybrid_apify/__init__.py create mode 100644 src/apify/storage_clients/_hybrid_apify/_storage_client.py diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index 7d5de045..51af53e0 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -53,6 +53,28 @@ async def main(): +## Explicit control over storage clients used in Actor +- It is now possible to have full control over which storage clients are used by the `Actor`. To make development of Actors convenient, the `Actor` has two storage clients. One that is used when running on Apify platform or when opening storages with `force_cloud=True` and the other client that is used when running outside the Apify platform. The `Actor` has reasonable defaults and for the majority of use-cases there is no need to change it. However, if you need to use a different storage client, you can set it up before entering `Actor` context through `service_locator`. 
+ +**Now (v3.0):** +```python +from crawlee import service_locator +from apify.storage_clients import ApifyStorageClient, ApifyHybridStorageClient, MemoryStorageClient +from apify import Actor + + +async def main(): + service_locator.set_storage_client( + ApifyHybridStorageClient( + cloud_storage_client=ApifyStorageClient(access="single"), + local_storage_client=MemoryStorageClient() + ) + ) + async with Actor: + rq = await Actor.open_request_queue() +``` + + ## The default use of optimized ApifyRequestQueueClient - The default client for working with Apify platform based `RequestQueue` is now optimized and simplified client which does significantly lower amount of API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster and is suitable for the majority of the use cases. @@ -61,12 +83,13 @@ async def main(): **Now (v3.0):** ```python -from apify.storages import RequestQueue +from crawlee import service_locator from apify.storage_clients import ApifyStorageClient +from apify import Actor async def main(): # Full client that supports multiple consumers of the Apify Request Queue - rq_shared = await RequestQueue.open(storage_client=ApifyStorageClient(access="shared")) - # Default optimized client that expects only single consumer of the Apify Request Queue - rq_single = await RequestQueue.open(storage_client=ApifyStorageClient()) + service_locator.set_storage_client(ApifyStorageClient(access="shared")) + async with Actor: + rq = await Actor.open_request_queue() ``` diff --git a/pyproject.toml b/pyproject.toml index 50e348d2..7049074d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ keywords = [ dependencies = [ "apify-client>=2.0.0,<3.0.0", "apify-shared>=2.0.0,<3.0.0", - "crawlee==0.6.13b42", + "crawlee @ git+https://github.com/apify/crawlee-python.git@include-storag-client-in-additional-cache-key", "cachetools>=5.5.0", "cryptography>=42.0.0", "impit>=0.6.1", diff --git a/src/apify/_actor.py 
b/src/apify/_actor.py index 133089b2..863f841b 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -38,7 +38,7 @@ from apify.log import _configure_logging, logger from apify.storage_clients import ApifyStorageClient from apify.storage_clients._file_system import ApifyFileSystemStorageClient -from apify.storages import Dataset, KeyValueStore, RequestQueue +from apify.storage_clients._hybrid_apify._storage_client import ApifyHybridStorageClient if TYPE_CHECKING: import logging @@ -48,9 +48,9 @@ from typing_extensions import Self from crawlee.proxy_configuration import _NewUrlFunction - from crawlee.storage_clients import StorageClient from apify._models import Webhook + from apify.storages import Dataset, KeyValueStore, RequestQueue MainReturnType = TypeVar('MainReturnType') @@ -131,7 +131,6 @@ def __init__( self._configuration = configuration self._configure_logging = configure_logging self._apify_client: ApifyClientAsync | None = None - self._local_storage_client: StorageClient | None = None self._is_initialized = False @@ -234,45 +233,49 @@ def log(self) -> logging.Logger: """The logging.Logger instance the Actor uses.""" return logger - def _get_local_storage_client(self) -> StorageClient: - """Get the local storage client the Actor instance uses.""" - if self._local_storage_client: - return self._local_storage_client + def _raise_if_not_initialized(self) -> None: + if not self._is_initialized: + raise RuntimeError('The Actor was not initialized!') + + @cached_property + def _storage_client(self) -> ApifyHybridStorageClient: + """Storage client used by the actor. + Depending on the initialization of the service locator the client can be created in different ways. + """ try: - # Set implicit default local storage client, unless local storage client was already set. - implicit_storage_client = ApifyFileSystemStorageClient() + # Nothing was set by the user.
+ implicit_storage_client = ApifyHybridStorageClient( + local_storage_client=ApifyFileSystemStorageClient(), cloud_storage_client=ApifyStorageClient() + ) service_locator.set_storage_client(implicit_storage_client) - self._local_storage_client = implicit_storage_client except ServiceConflictError: self.log.debug( 'Storage client in service locator was set explicitly before Actor.init was called.' 'Using the existing storage client as implicit storage client for the Actor.' ) + else: + return implicit_storage_client - self._local_storage_client = service_locator.get_storage_client() - if type(self._local_storage_client) is FileSystemStorageClient: + # User set something in the service locator. + storage_client = service_locator.get_storage_client() + if isinstance(storage_client, ApifyHybridStorageClient): + # The client was manually set to the right type in the service locator. This is the explicit way. + return storage_client + + if isinstance(storage_client, ApifyStorageClient): + # The cloud storage client was manually set in the service locator. + return ApifyHybridStorageClient(cloud_storage_client=storage_client) + + # The local storage client was manually set in the service locator + if type(storage_client) is FileSystemStorageClient: self.log.warning( f'Using {FileSystemStorageClient.__module__}.{FileSystemStorageClient.__name__} in Actor context is not' f' recommended and can lead to problems with reading the input file. Use ' f'`apify.storage_clients.FileSystemStorageClient` instead.' 
) - return self._local_storage_client - - def _raise_if_not_initialized(self) -> None: - if not self._is_initialized: - raise RuntimeError('The Actor was not initialized!') - - def _raise_if_cloud_requested_but_not_configured(self, *, force_cloud: bool) -> None: - if not force_cloud: - return - - if not self.is_at_home() and self.configuration.token is None: - raise RuntimeError( - 'In order to use the Apify cloud storage from your computer, ' - 'you need to provide an Apify token using the APIFY_TOKEN environment variable.' - ) + return ApifyHybridStorageClient(cloud_storage_client=ApifyStorageClient(), local_storage_client=storage_client) async def init(self) -> None: """Initialize the Actor instance. @@ -298,22 +301,13 @@ async def init(self) -> None: if _ActorType._is_any_instance_initialized: self.log.warning('Repeated Actor initialization detected - this is non-standard usage, proceed with care') - # Create an instance of the cloud storage client, the local storage client is obtained - # from the service locator - self._cloud_storage_client = ApifyStorageClient() - # Make sure that the currently initialized instance is also available through the global `Actor` proxy cast('Proxy', Actor).__wrapped__ = self self._is_exiting = False self._was_final_persist_state_emitted = False - # If the Actor is running on the Apify platform, we set the cloud storage client. - if self.is_at_home(): - service_locator.set_storage_client(self._cloud_storage_client) - self._local_storage_client = self._cloud_storage_client - else: - self._get_local_storage_client() + self.log.debug(f'Storage client set to {self._storage_client}') service_locator.set_event_manager(self.event_manager) @@ -470,17 +464,7 @@ async def open_dataset( An instance of the `Dataset` class for the given ID or name. 
""" self._raise_if_not_initialized() - self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud) - - storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client() - - return await Dataset.open( - id=id, - alias=alias, - name=name, - configuration=self.configuration, - storage_client=storage_client, - ) + return await self._storage_client.open_dataset(id=id, name=name, alias=alias, force_cloud=force_cloud) async def open_key_value_store( self, @@ -509,17 +493,7 @@ async def open_key_value_store( An instance of the `KeyValueStore` class for the given ID or name. """ self._raise_if_not_initialized() - self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud) - - storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client() - - return await KeyValueStore.open( - id=id, - alias=alias, - name=name, - configuration=self.configuration, - storage_client=storage_client, - ) + return await self._storage_client.open_key_value_store(id=id, name=name, alias=alias, force_cloud=force_cloud) async def open_request_queue( self, @@ -550,17 +524,7 @@ async def open_request_queue( An instance of the `RequestQueue` class for the given ID or name. """ self._raise_if_not_initialized() - self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud) - - storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client() - - return await RequestQueue.open( - id=id, - alias=alias, - name=name, - configuration=self.configuration, - storage_client=storage_client, - ) + return await self._storage_client.open_request_queue(id=id, name=name, alias=alias, force_cloud=force_cloud) @overload async def push_data(self, data: dict | list[dict]) -> None: ... 
diff --git a/src/apify/storage_clients/__init__.py b/src/apify/storage_clients/__init__.py index f3e5298c..2391951f 100644 --- a/src/apify/storage_clients/__init__.py +++ b/src/apify/storage_clients/__init__.py @@ -2,8 +2,10 @@ from ._apify import ApifyStorageClient from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient +from ._hybrid_apify import ApifyHybridStorageClient __all__ = [ + 'ApifyHybridStorageClient', 'ApifyStorageClient', 'FileSystemStorageClient', 'MemoryStorageClient', diff --git a/src/apify/storage_clients/_hybrid_apify/__init__.py b/src/apify/storage_clients/_hybrid_apify/__init__.py new file mode 100644 index 00000000..8cbef292 --- /dev/null +++ b/src/apify/storage_clients/_hybrid_apify/__init__.py @@ -0,0 +1 @@ +from ._storage_client import ApifyHybridStorageClient diff --git a/src/apify/storage_clients/_hybrid_apify/_storage_client.py b/src/apify/storage_clients/_hybrid_apify/_storage_client.py new file mode 100644 index 00000000..f98f535f --- /dev/null +++ b/src/apify/storage_clients/_hybrid_apify/_storage_client.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +from functools import cached_property +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient, StorageClient +from crawlee.storages import RequestQueue + +from apify._configuration import Configuration as ApifyConfiguration +from apify._utils import docs_group +from apify.storage_clients import ApifyStorageClient +from apify.storage_clients._file_system import ApifyFileSystemStorageClient +from apify.storages import Dataset, KeyValueStore + +if TYPE_CHECKING: + from collections.abc import Hashable + + from crawlee.configuration import Configuration as CrawleeConfiguration + + +@docs_group('Storage clients') +class ApifyHybridStorageClient(StorageClient): + """ApifyHybridStorageClient that delegates to cloud_storage_client or 
local_storage_client. + + When running on Apify platform use cloud_storage_client, else use local_storage_client. It has additional wrapper + methods with `force_cloud` parameter to force using cloud_storage_client when opening specific storages even when + not running on the Apify platform. This storage client is designed to work specifically in Actor context. + """ + + def __init__( + self, + *, + cloud_storage_client: ApifyStorageClient | None = None, + local_storage_client: StorageClient | None = None, + ) -> None: + """Initialize the Apify storage client. + + Args: + cloud_storage_client: Client used to communicate with the Apify platform storage. Either through + `force_cloud` argument when opening storages or automatically when running on the Apify platform. + local_storage_client: Client used to communicate with the storage when not running on the Apify + platform and not using `force_cloud` argument when opening storages. + """ + self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(access='single') + self._local_storage_client = local_storage_client or ApifyFileSystemStorageClient() + + def _get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageClient: + if self._is_at_home: + return self._cloud_storage_client + + configuration = ApifyConfiguration.get_global_configuration() + if force_cloud: + if configuration.token is None: + raise RuntimeError( + 'In order to use the Apify cloud storage from your computer, ' + 'you need to provide an Apify token using the APIFY_TOKEN environment variable.' 
+            )
+            return self._cloud_storage_client
+
+        return self._local_storage_client
+
+    @override
+    def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
+        if self._is_at_home:
+            if isinstance(configuration, ApifyConfiguration):
+                return self._cloud_storage_client.get_additional_cache_key(configuration)
+            raise TypeError('Expecting ApifyConfiguration')
+
+        return self._local_storage_client.get_additional_cache_key(configuration)
+
+    @override
+    async def create_dataset_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: CrawleeConfiguration | None = None,
+    ) -> DatasetClient:
+        return await self._get_suitable_storage_client().create_dataset_client(
+            id=id, name=name, alias=alias, configuration=configuration
+        )
+
+    @override
+    async def create_kvs_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: CrawleeConfiguration | None = None,
+    ) -> KeyValueStoreClient:
+        return await self._get_suitable_storage_client().create_kvs_client(
+            id=id, name=name, alias=alias, configuration=configuration
+        )
+
+    @override
+    async def create_rq_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: CrawleeConfiguration | None = None,
+    ) -> RequestQueueClient:
+        return await self._get_suitable_storage_client().create_rq_client(
+            id=id, name=name, alias=alias, configuration=configuration
+        )
+
+    async def open_dataset(
+        self,
+        *,
+        id: str | None = None,
+        alias: str | None = None,
+        name: str | None = None,
+        configuration: ApifyConfiguration | None = None,
+        force_cloud: bool = False,
+    ) -> Dataset:
+        """Open a dataset.
+
+        Datasets are used to store structured data where each object stored has the same attributes, such as online
+        store products or real estate offers. The actual data is stored either on the local filesystem or in
+        the Apify cloud.
+ + Args: + id: The ID of the dataset to open. If provided, searches for existing dataset by ID. + Mutually exclusive with name and alias. + name: The name of the dataset to open (global scope, persists across runs). + Mutually exclusive with id and alias. + alias: The alias of the dataset to open (run scope, creates unnamed storage). + Mutually exclusive with id and name. + configuration: Configuration used to open the dataset. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible + to combine local and cloud storage. + + Returns: + An instance of the `Dataset` class for the given ID or name. + """ + return await Dataset.open( + id=id, + alias=alias, + name=name, + configuration=configuration, + storage_client=self._get_suitable_storage_client(force_cloud=force_cloud), + ) + + async def open_key_value_store( + self, + *, + id: str | None = None, + alias: str | None = None, + name: str | None = None, + configuration: ApifyConfiguration | None = None, + force_cloud: bool = False, + ) -> KeyValueStore: + """Open a key-value store. + + Key-value stores are used to store records or files, along with their MIME content type. The records are stored + and retrieved using a unique key. The actual data is stored either on a local filesystem or in the Apify cloud. + + Args: + id: The ID of the KVS to open. If provided, searches for existing KVS by ID. + Mutually exclusive with name and alias. + name: The name of the KVS to open (global scope, persists across runs). + Mutually exclusive with id and alias. + alias: The alias of the KVS to open (run scope, creates unnamed storage). + Mutually exclusive with id and name. + configuration: Configuration used to open the key value store. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible + to combine local and cloud storage. + + Returns: + An instance of the `KeyValueStore` class for the given ID or name. 
+ """ + return await KeyValueStore.open( + id=id, + alias=alias, + name=name, + configuration=configuration, + storage_client=self._get_suitable_storage_client(force_cloud=force_cloud), + ) + + async def open_request_queue( + self, + *, + id: str | None = None, + alias: str | None = None, + name: str | None = None, + configuration: ApifyConfiguration | None = None, + force_cloud: bool = False, + ) -> RequestQueue: + """Open a request queue. + + Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in + the Apify cloud. The queue is used for deep crawling of websites, where you start with several URLs and then + recursively follow links to other pages. The data structure supports both breadth-first and depth-first + crawling orders. + + Args: + id: The ID of the RQ to open. If provided, searches for existing RQ by ID. + Mutually exclusive with name and alias. + name: The name of the RQ to open (global scope, persists across runs). + Mutually exclusive with id and alias. + alias: The alias of the RQ to open (run scope, creates unnamed storage). + Mutually exclusive with id and name. + configuration: Configuration used to open the request queue. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible + to combine local and cloud storage. + + Returns: + An instance of the `RequestQueue` class for the given ID or name. 
+ """ + return await RequestQueue.open( + id=id, + alias=alias, + name=name, + configuration=configuration, + storage_client=self._get_suitable_storage_client(force_cloud=force_cloud), + ) + + @cached_property + def _is_at_home(self) -> bool: + configuration = ApifyConfiguration.get_global_configuration() + return configuration.is_at_home diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index e9bc6dad..a7bc9105 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,4 +1,4 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER uvicorn[standard] -crawlee[parsel]==0.6.13b42 +crawlee[parsel] @ git+https://github.com/apify/crawlee-python.git@include-storag-client-in-additional-cache-key diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py index 0cf0c9af..45409b82 100644 --- a/tests/integration/test_apify_storages.py +++ b/tests/integration/test_apify_storages.py @@ -5,8 +5,9 @@ from crawlee import service_locator from crawlee.storages import Dataset, KeyValueStore, RequestQueue -from apify import Configuration -from apify.storage_clients import ApifyStorageClient +from .conftest import MakeActorFunction, RunActorFunction +from apify import Actor, Configuration +from apify.storage_clients import ApifyHybridStorageClient, ApifyStorageClient, MemoryStorageClient @pytest.mark.parametrize( @@ -32,3 +33,91 @@ async def test_alias_concurrent_creation_local( except AssertionError: for storage in storages: await storage.drop() + + +async def test_actor_full_explicit_storage_init(apify_token: str) -> None: + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client( + ApifyHybridStorageClient( + local_storage_client=MemoryStorageClient(), + cloud_storage_client=ApifyStorageClient(access='shared'), + 
) + ) + async with Actor(): + # If service locator was already set with ApifyHybridStorageClient, the actor will use it. + # Storages should be different when force_cloud is used outside the Apify platform + assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) + assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) + assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) + + +async def test_actor_full_explicit_storage_init_same_client(apify_token: str) -> None: + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client( + ApifyHybridStorageClient( + local_storage_client=ApifyStorageClient(access='shared'), + cloud_storage_client=ApifyStorageClient(access='shared'), + ) + ) + async with Actor(): + # If service locator was already set with ApifyHybridStorageClient, the actor will use it. + # Storages should be same as the equivalent storage client is for both local and cloud storage client + assert await Actor.open_dataset() is await Actor.open_dataset(force_cloud=True) + assert await Actor.open_key_value_store() is await Actor.open_key_value_store(force_cloud=True) + assert await Actor.open_request_queue() is await Actor.open_request_queue(force_cloud=True) + + +async def test_actor_partial_explicit_cloud_storage_init(apify_token: str) -> None: + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client(ApifyStorageClient(access='shared')) + async with Actor(): + # If service locator was already set with ApifyStorageClient, the actor will use it as cloud_storage_client of + # ApifyHybridStorageClient + assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) + assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) + assert await Actor.open_request_queue() is not await 
Actor.open_request_queue(force_cloud=True) + + +async def test_actor_partial_explicit_local_storage_init(apify_token: str) -> None: + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client(MemoryStorageClient()) + async with Actor(): + # If service locator was already set with non-ApifyStorageClient, the actor will use it as local_storage_client + # of ApifyHybridStorageClient + assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) + assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) + assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) + + +async def test_actor_implicit_storage_init(apify_token: str) -> None: + service_locator.set_configuration(Configuration(token=apify_token)) + async with Actor(): + assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) + assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) + assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) + + +async def test_actor_full_explicit_storage_init_on_platform( + make_actor: MakeActorFunction, run_actor: RunActorFunction +) -> None: + async def main() -> None: + from crawlee import service_locator + + from apify.storage_clients import ApifyHybridStorageClient, ApifyStorageClient, MemoryStorageClient + + service_locator.set_storage_client( + ApifyHybridStorageClient( + local_storage_client=MemoryStorageClient(), + cloud_storage_client=ApifyStorageClient(access='shared'), + ) + ) + async with Actor(): + # Storages should be same as the cloud client is used on the platform + assert await Actor.open_dataset() is await Actor.open_dataset(force_cloud=True) + assert await Actor.open_key_value_store() is await Actor.open_key_value_store(force_cloud=True) + assert await Actor.open_request_queue() is await 
Actor.open_request_queue(force_cloud=True) + + actor = await make_actor(label='explicit_storage_init', main_func=main) + run_result = await run_actor(actor) + + assert run_result.status == 'SUCCEEDED' diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index 7212f5e3..83e9eb2e 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -11,7 +11,7 @@ from apify import Actor from apify import Configuration as ApifyConfiguration -from apify.storage_clients import FileSystemStorageClient as ApifyFileSystemStorageClient +from apify.storage_clients._hybrid_apify._storage_client import ApifyHybridStorageClient @pytest.mark.parametrize( @@ -111,8 +111,8 @@ async def test_crawler_implicit_local_storage() -> None: async with Actor(): crawler = BasicCrawler() - assert isinstance(service_locator.get_storage_client(), ApifyFileSystemStorageClient) - assert isinstance(crawler._service_locator.get_storage_client(), ApifyFileSystemStorageClient) + assert isinstance(service_locator.get_storage_client(), ApifyHybridStorageClient) + assert isinstance(crawler._service_locator.get_storage_client(), ApifyHybridStorageClient) async def test_crawlers_own_configuration(tmp_path: Path) -> None: diff --git a/uv.lock b/uv.lock index 6a71bfa1..d1eee088 100644 --- a/uv.lock +++ b/uv.lock @@ -76,7 +76,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", specifier = "==0.6.13b42" }, + { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=include-storag-client-in-additional-cache-key" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "impit", specifier = ">=0.6.1" }, { name = "lazy-object-proxy", specifier = ">=1.11.0" }, @@ -516,8 +516,8 @@ toml = [ [[package]] name = "crawlee" -version = "0.6.13b42" -source = { registry = 
"https://pypi.org/simple" } +version = "0.6.13" +source = { git = "https://github.com/apify/crawlee-python.git?rev=include-storag-client-in-additional-cache-key#182b872c9b18ebdc54a442e2331ea2e3172ab8e6" } dependencies = [ { name = "cachetools" }, { name = "colorama" }, @@ -532,10 +532,6 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/98/8e/8c5bf3cd84335aeb157f95ecaadc5cb61b9bb0f1ffa28a50f9a2485c38a6/crawlee-0.6.13b42.tar.gz", hash = "sha256:5a8c7bcf6abf77c6b7be3323e3cfa017a9717f0b5e5275bbb7ad8de589c851af", size = 24842767, upload-time = "2025-09-17T15:19:26.706Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/eb/6a048e5916a78c30ea1b550452a6ede24facf5cafd564bbb1bc5e8ba6fea/crawlee-0.6.13b42-py3-none-any.whl", hash = "sha256:e9c258d49c8d4269d41a1dd9babfc262d241c62c9549d4dd54d1cad0ddbf9569", size = 279764, upload-time = "2025-09-17T15:19:23.817Z" }, -] [package.optional-dependencies] parsel = [ From 1cc80bbb6c195b75c741780682a3560bc6b93053 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 24 Sep 2025 08:46:54 +0200 Subject: [PATCH 17/26] Update log test --- src/apify/_actor.py | 2 +- tests/unit/actor/test_actor_log.py | 66 ++++++++++++++++-------------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 863f841b..b192bdbe 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -288,6 +288,7 @@ async def init(self) -> None: This method should be called immediately before performing any additional Actor actions, and it should be called only once. 
""" + self.log.info('Initializing Actor...') if self._configuration: # Set explicitly the configuration in the service locator service_locator.set_configuration(self.configuration) @@ -315,7 +316,6 @@ async def init(self) -> None: if self._configure_logging: _configure_logging() - self.log.info('Initializing Actor...') self.log.info('System info', extra=get_system_info()) await self.event_manager.__aenter__() diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index 356f8bb3..ecb90ab6 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -37,7 +37,7 @@ async def test_actor_logs_messages_correctly(caplog: pytest.LogCaptureFixture) - raise RuntimeError('Dummy RuntimeError') # Updated expected number of log records (an extra record is now captured) - assert len(caplog.records) == 14 + assert len(caplog.records) == 15 # Record 0: Extra Pytest context log assert caplog.records[0].levelno == logging.DEBUG @@ -51,54 +51,58 @@ async def test_actor_logs_messages_correctly(caplog: pytest.LogCaptureFixture) - assert caplog.records[2].levelno == logging.INFO assert caplog.records[2].message == 'Initializing Actor...' + # Record 2: Initializing Actor... 
+ assert caplog.records[3].levelno == logging.DEBUG + assert caplog.records[3].message.startswith('Storage client set to') + # Record 3: System info - assert caplog.records[3].levelno == logging.INFO - assert caplog.records[3].message == 'System info' + assert caplog.records[4].levelno == logging.INFO + assert caplog.records[4].message == 'System info' # Record 4: Event manager initialized - assert caplog.records[4].levelno == logging.DEBUG - assert caplog.records[4].message == 'Event manager initialized' + assert caplog.records[5].levelno == logging.DEBUG + assert caplog.records[5].message == 'Event manager initialized' # Record 5: Charging manager initialized - assert caplog.records[5].levelno == logging.DEBUG - assert caplog.records[5].message == 'Charging manager initialized' + assert caplog.records[6].levelno == logging.DEBUG + assert caplog.records[6].message == 'Charging manager initialized' # Record 6: Debug message - assert caplog.records[6].levelno == logging.DEBUG - assert caplog.records[6].message == 'Debug message' + assert caplog.records[7].levelno == logging.DEBUG + assert caplog.records[7].message == 'Debug message' # Record 7: Info message - assert caplog.records[7].levelno == logging.INFO - assert caplog.records[7].message == 'Info message' + assert caplog.records[8].levelno == logging.INFO + assert caplog.records[8].message == 'Info message' # Record 8: Warning message - assert caplog.records[8].levelno == logging.WARNING - assert caplog.records[8].message == 'Warning message' + assert caplog.records[9].levelno == logging.WARNING + assert caplog.records[9].message == 'Warning message' # Record 9: Error message - assert caplog.records[9].levelno == logging.ERROR - assert caplog.records[9].message == 'Error message' + assert caplog.records[10].levelno == logging.ERROR + assert caplog.records[10].message == 'Error message' # Record 10: Exception message with traceback (ValueError) - assert caplog.records[10].levelno == logging.ERROR - assert 
caplog.records[10].message == 'Exception message' - assert caplog.records[10].exc_info is not None - assert caplog.records[10].exc_info[0] is ValueError - assert isinstance(caplog.records[10].exc_info[1], ValueError) - assert str(caplog.records[10].exc_info[1]) == 'Dummy ValueError' + assert caplog.records[11].levelno == logging.ERROR + assert caplog.records[11].message == 'Exception message' + assert caplog.records[11].exc_info is not None + assert caplog.records[11].exc_info[0] is ValueError + assert isinstance(caplog.records[11].exc_info[1], ValueError) + assert str(caplog.records[11].exc_info[1]) == 'Dummy ValueError' # Record 11: Multiline log message - assert caplog.records[11].levelno == logging.INFO - assert caplog.records[11].message == 'Multi\nline\nlog\nmessage' + assert caplog.records[12].levelno == logging.INFO + assert caplog.records[12].message == 'Multi\nline\nlog\nmessage' # Record 12: Actor failed with an exception (RuntimeError) - assert caplog.records[12].levelno == logging.ERROR - assert caplog.records[12].message == 'Actor failed with an exception' - assert caplog.records[12].exc_info is not None - assert caplog.records[12].exc_info[0] is RuntimeError - assert isinstance(caplog.records[12].exc_info[1], RuntimeError) - assert str(caplog.records[12].exc_info[1]) == 'Dummy RuntimeError' + assert caplog.records[13].levelno == logging.ERROR + assert caplog.records[13].message == 'Actor failed with an exception' + assert caplog.records[13].exc_info is not None + assert caplog.records[13].exc_info[0] is RuntimeError + assert isinstance(caplog.records[13].exc_info[1], RuntimeError) + assert str(caplog.records[13].exc_info[1]) == 'Dummy RuntimeError' # Record 13: Exiting Actor - assert caplog.records[13].levelno == logging.INFO - assert caplog.records[13].message == 'Exiting Actor' + assert caplog.records[14].levelno == logging.INFO + assert caplog.records[14].message == 'Exiting Actor' From 860b0eca811036e3c55c14892c619b2fd12ce98c Mon Sep 17 00:00:00 
2001 From: Josef Prochazka Date: Wed, 24 Sep 2025 09:29:49 +0200 Subject: [PATCH 18/26] Rename access to request_queue_access --- docs/04_upgrading/upgrading_to_v3.md | 10 ++++++---- .../storage_clients/_apify/_storage_client.py | 16 +++++++++------- .../_hybrid_apify/_storage_client.py | 2 +- tests/integration/conftest.py | 2 +- tests/integration/test_actor_request_queue.py | 2 +- tests/integration/test_apify_storages.py | 10 +++++----- tests/integration/test_request_queue.py | 6 +++--- 7 files changed, 26 insertions(+), 22 deletions(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index f83d503d..5954a628 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -111,6 +111,7 @@ async def main(): - It is now possible to have full control over which storage clients are used by the `Actor`. To make development of Actors convenient, the `Actor` has two storage clients. One that is used when running on Apify platform or when opening storages with `force_cloud=True` and the other client that is used when running outside the Apify platform. The `Actor` has reasonable defaults and for the majority of use-cases there is no need to change it. However, if you need to use a different storage client, you can set it up before entering `Actor` context through `service_locator`. 
**Now (v3.0):** + ```python from crawlee import service_locator from apify.storage_clients import ApifyStorageClient, ApifyHybridStorageClient, MemoryStorageClient @@ -120,7 +121,7 @@ from apify import Actor async def main(): service_locator.set_storage_client( ApifyHybridStorageClient( - cloud_storage_client=ApifyStorageClient(access="single"), + cloud_storage_client=ApifyStorageClient(request_queue_access="single"), local_storage_client=MemoryStorageClient() ) ) @@ -132,7 +133,7 @@ async def main(): ## The default use of optimized ApifyRequestQueueClient - The default client for working with Apify platform based `RequestQueue` is now optimized and simplified client which does significantly lower amount of API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster and is suitable for the majority of the use cases. -- The full client is still available, but it has to be explicitly requested via `access="shared"` argument when using the `ApifyStorageClient`. +- The full client is still available, but it has to be explicitly requested via `request_queue_access="shared"` argument when using the `ApifyStorageClient`. 
**Now (v3.0):** @@ -141,9 +142,10 @@ from crawlee import service_locator from apify.storage_clients import ApifyStorageClient from apify import Actor + async def main(): - # Full client that supports multiple consumers of the Apify Request Queue - service_locator.set_storage_client(ApifyStorageClient(access="shared")) + # Full client that supports multiple consumers of the Apify Request Queue + service_locator.set_storage_client(ApifyStorageClient(request_queue_access="shared")) async with Actor: rq = await Actor.open_request_queue() ``` diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index a63123df..c3a09399 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -26,16 +26,16 @@ class ApifyStorageClient(StorageClient): """Apify storage client.""" - def __init__(self, *, access: Literal['single', 'shared'] = 'single') -> None: + def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'single') -> None: """Initialize the Apify storage client. Args: - access: If 'single', the `create_rq_client` will return `ApifyRequestQueueSingleClient`, if 'shared' it - will return `ApifyRequestQueueSharedClient`. - - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster. - - 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage. + request_queue_access: If 'single', the `create_rq_client` will return `ApifyRequestQueueSingleClient`, if + 'shared' it will return `ApifyRequestQueueSharedClient`. + - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster. + - 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage. """ - self._access = access + self._request_queue_access = request_queue_access # This class breaches Liskov Substitution Principle. 
It requires specialized Configuration compared to its parent. _lsp_violation_error_message_template = ( @@ -96,7 +96,9 @@ async def create_rq_client( configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): client: type[ApifyRequestQueueClient] = ( - ApifyRequestQueueSingleClient if self._access == 'single' else ApifyRequestQueueSharedClient + ApifyRequestQueueSingleClient + if self._request_queue_access == 'single' + else ApifyRequestQueueSharedClient ) return await client.open(id=id, name=name, alias=alias, configuration=configuration) diff --git a/src/apify/storage_clients/_hybrid_apify/_storage_client.py b/src/apify/storage_clients/_hybrid_apify/_storage_client.py index f98f535f..971fe091 100644 --- a/src/apify/storage_clients/_hybrid_apify/_storage_client.py +++ b/src/apify/storage_clients/_hybrid_apify/_storage_client.py @@ -43,7 +43,7 @@ def __init__( local_storage_client: Client used to communicate with the storage when not running on the Apify platform and not using `force_cloud` argument when opening storages. 
""" - self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(access='single') + self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(request_queue_access='single') self._local_storage_client = local_storage_client or ApifyFileSystemStorageClient() def _get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageClient: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4da5c4a2..aea770db 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -115,7 +115,7 @@ async def request_queue_apify( monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: - rq = await RequestQueue.open(storage_client=ApifyStorageClient(access=request.param)) + rq = await RequestQueue.open(storage_client=ApifyStorageClient(request_queue_access=request.param)) yield rq await rq.drop() diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py index 1f7d4fa5..3a9053c7 100644 --- a/tests/integration/test_actor_request_queue.py +++ b/tests/integration/test_actor_request_queue.py @@ -33,7 +33,7 @@ async def apify_named_rq( async with Actor: request_queue = await RequestQueue.open( - name=request_queue_name, storage_client=ApifyStorageClient(access=request.param) + name=request_queue_name, storage_client=ApifyStorageClient(request_queue_access=request.param) ) yield request_queue await request_queue.drop() diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py index 45409b82..c9658935 100644 --- a/tests/integration/test_apify_storages.py +++ b/tests/integration/test_apify_storages.py @@ -40,7 +40,7 @@ async def test_actor_full_explicit_storage_init(apify_token: str) -> None: service_locator.set_storage_client( ApifyHybridStorageClient( local_storage_client=MemoryStorageClient(), - cloud_storage_client=ApifyStorageClient(access='shared'), + 
cloud_storage_client=ApifyStorageClient(request_queue_access='shared'), ) ) async with Actor(): @@ -55,8 +55,8 @@ async def test_actor_full_explicit_storage_init_same_client(apify_token: str) -> service_locator.set_configuration(Configuration(token=apify_token)) service_locator.set_storage_client( ApifyHybridStorageClient( - local_storage_client=ApifyStorageClient(access='shared'), - cloud_storage_client=ApifyStorageClient(access='shared'), + local_storage_client=ApifyStorageClient(request_queue_access='shared'), + cloud_storage_client=ApifyStorageClient(request_queue_access='shared'), ) ) async with Actor(): @@ -69,7 +69,7 @@ async def test_actor_full_explicit_storage_init_same_client(apify_token: str) -> async def test_actor_partial_explicit_cloud_storage_init(apify_token: str) -> None: service_locator.set_configuration(Configuration(token=apify_token)) - service_locator.set_storage_client(ApifyStorageClient(access='shared')) + service_locator.set_storage_client(ApifyStorageClient(request_queue_access='shared')) async with Actor(): # If service locator was already set with ApifyStorageClient, the actor will use it as cloud_storage_client of # ApifyHybridStorageClient @@ -108,7 +108,7 @@ async def main() -> None: service_locator.set_storage_client( ApifyHybridStorageClient( local_storage_client=MemoryStorageClient(), - cloud_storage_client=ApifyStorageClient(access='shared'), + cloud_storage_client=ApifyStorageClient(request_queue_access='shared'), ) ) async with Actor(): diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py index 36db5ac9..fbbdfb74 100644 --- a/tests/integration/test_request_queue.py +++ b/tests/integration/test_request_queue.py @@ -1086,8 +1086,8 @@ async def test_request_queue_simple_and_full_at_the_same_time( monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: - rq_simple = await RequestQueue.open(storage_client=ApifyStorageClient(access='single')) - rq_full = await 
RequestQueue.open(storage_client=ApifyStorageClient(access='shared')) + rq_simple = await RequestQueue.open(storage_client=ApifyStorageClient(request_queue_access='single')) + rq_full = await RequestQueue.open(storage_client=ApifyStorageClient(request_queue_access='shared')) # Opening same queue again with different ApifyStorageClient will resolve to the first client used. assert rq_simple is rq_full await rq_simple.drop() @@ -1112,7 +1112,7 @@ async def test_crawler_run_request_queue_variant_stats( monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token) async with Actor: requests = 5 - rq = await RequestQueue.open(storage_client=ApifyStorageClient(access=access)) + rq = await RequestQueue.open(storage_client=ApifyStorageClient(request_queue_access=access)) crawler = BasicCrawler(request_manager=rq) @crawler.router.default_handler From e6c6fc5af21801c10f34e57d149b7bc07a1baa80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Wed, 24 Sep 2025 13:53:07 +0200 Subject: [PATCH 19/26] Update src/apify/_actor.py Co-authored-by: Jan Buchar --- src/apify/_actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index b192bdbe..04dc1cd3 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -244,7 +244,7 @@ def _storage_client(self) -> ApifyHybridStorageClient: Depending on the initialization of the service locator the client can be created in different ways. """ try: - # Notning was set by the user. + # Nothing was set by the user. 
implicit_storage_client = ApifyHybridStorageClient( local_storage_client=ApifyFileSystemStorageClient(), cloud_storage_client=ApifyStorageClient() ) From da2f5df9245506e42cf84a13f338700052ee7977 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 24 Sep 2025 14:17:32 +0200 Subject: [PATCH 20/26] Review comments --- src/apify/_actor.py | 35 ++- src/apify/storage_clients/__init__.py | 4 +- .../storage_clients/_hybrid_apify/__init__.py | 1 - .../_hybrid_apify/_storage_client.py | 226 ------------------ .../storage_clients/_smart_apify/__init__.py | 1 + .../_smart_apify/_storage_client.py | 124 ++++++++++ tests/integration/test_apify_storages.py | 30 +-- tests/unit/actor/test_configuration.py | 6 +- 8 files changed, 170 insertions(+), 257 deletions(-) delete mode 100644 src/apify/storage_clients/_hybrid_apify/__init__.py delete mode 100644 src/apify/storage_clients/_hybrid_apify/_storage_client.py create mode 100644 src/apify/storage_clients/_smart_apify/__init__.py create mode 100644 src/apify/storage_clients/_smart_apify/_storage_client.py diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 04dc1cd3..08635f6e 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -38,7 +38,8 @@ from apify.log import _configure_logging, logger from apify.storage_clients import ApifyStorageClient from apify.storage_clients._file_system import ApifyFileSystemStorageClient -from apify.storage_clients._hybrid_apify._storage_client import ApifyHybridStorageClient +from apify.storage_clients._smart_apify._storage_client import SmartApifyStorageClient +from apify.storages import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: import logging @@ -50,7 +51,6 @@ from crawlee.proxy_configuration import _NewUrlFunction from apify._models import Webhook - from apify.storages import Dataset, KeyValueStore, RequestQueue MainReturnType = TypeVar('MainReturnType') @@ -238,14 +238,14 @@ def _raise_if_not_initialized(self) -> None: raise RuntimeError('The Actor was not 
initialized!') @cached_property - def _storage_client(self) -> ApifyHybridStorageClient: + def _storage_client(self) -> SmartApifyStorageClient: """Storage client used by the actor. Depending on the initialization of the service locator the client can be created in different ways. """ try: # Nothing was set by the user. - implicit_storage_client = ApifyHybridStorageClient( + implicit_storage_client = SmartApifyStorageClient( local_storage_client=ApifyFileSystemStorageClient(), cloud_storage_client=ApifyStorageClient() ) service_locator.set_storage_client(implicit_storage_client) @@ -259,13 +259,13 @@ def _storage_client(self) -> ApifyHybridStorageClient: # User set something in the service locator. storage_client = service_locator.get_storage_client() - if isinstance(storage_client, ApifyHybridStorageClient): + if isinstance(storage_client, SmartApifyStorageClient): # The client was manually set to the right type in the service locator. This is the explicit way. return storage_client if isinstance(storage_client, ApifyStorageClient): # The cloud storage client was manually set in the service locator. - return ApifyHybridStorageClient(cloud_storage_client=storage_client) + return SmartApifyStorageClient(cloud_storage_client=storage_client) # The local storage client was manually set in the service locator if type(storage_client) is FileSystemStorageClient: @@ -275,7 +275,7 @@ def _storage_client(self) -> ApifyHybridStorageClient: f'`apify.storage_clients.FileSystemStorageClient` instead.' ) - return ApifyHybridStorageClient(cloud_storage_client=ApifyStorageClient(), local_storage_client=storage_client) + return SmartApifyStorageClient(cloud_storage_client=ApifyStorageClient(), local_storage_client=storage_client) async def init(self) -> None: """Initialize the Actor instance. @@ -464,7 +464,12 @@ async def open_dataset( An instance of the `Dataset` class for the given ID or name. 
""" self._raise_if_not_initialized() - return await self._storage_client.open_dataset(id=id, name=name, alias=alias, force_cloud=force_cloud) + return await Dataset.open( + id=id, + name=name, + alias=alias, + storage_client=self._storage_client.get_suitable_storage_client(force_cloud=force_cloud), + ) async def open_key_value_store( self, @@ -493,7 +498,12 @@ async def open_key_value_store( An instance of the `KeyValueStore` class for the given ID or name. """ self._raise_if_not_initialized() - return await self._storage_client.open_key_value_store(id=id, name=name, alias=alias, force_cloud=force_cloud) + return await KeyValueStore.open( + id=id, + name=name, + alias=alias, + storage_client=self._storage_client.get_suitable_storage_client(force_cloud=force_cloud), + ) async def open_request_queue( self, @@ -524,7 +534,12 @@ async def open_request_queue( An instance of the `RequestQueue` class for the given ID or name. """ self._raise_if_not_initialized() - return await self._storage_client.open_request_queue(id=id, name=name, alias=alias, force_cloud=force_cloud) + return await RequestQueue.open( + id=id, + name=name, + alias=alias, + storage_client=self._storage_client.get_suitable_storage_client(force_cloud=force_cloud), + ) @overload async def push_data(self, data: dict | list[dict]) -> None: ... 
diff --git a/src/apify/storage_clients/__init__.py b/src/apify/storage_clients/__init__.py index 2391951f..8a62e3dc 100644 --- a/src/apify/storage_clients/__init__.py +++ b/src/apify/storage_clients/__init__.py @@ -2,11 +2,11 @@ from ._apify import ApifyStorageClient from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient -from ._hybrid_apify import ApifyHybridStorageClient +from ._smart_apify import SmartApifyStorageClient __all__ = [ - 'ApifyHybridStorageClient', 'ApifyStorageClient', 'FileSystemStorageClient', 'MemoryStorageClient', + 'SmartApifyStorageClient', ] diff --git a/src/apify/storage_clients/_hybrid_apify/__init__.py b/src/apify/storage_clients/_hybrid_apify/__init__.py deleted file mode 100644 index 8cbef292..00000000 --- a/src/apify/storage_clients/_hybrid_apify/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from ._storage_client import ApifyHybridStorageClient diff --git a/src/apify/storage_clients/_hybrid_apify/_storage_client.py b/src/apify/storage_clients/_hybrid_apify/_storage_client.py deleted file mode 100644 index 971fe091..00000000 --- a/src/apify/storage_clients/_hybrid_apify/_storage_client.py +++ /dev/null @@ -1,226 +0,0 @@ -from __future__ import annotations - -from functools import cached_property -from typing import TYPE_CHECKING - -from typing_extensions import override - -from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient, StorageClient -from crawlee.storages import RequestQueue - -from apify._configuration import Configuration as ApifyConfiguration -from apify._utils import docs_group -from apify.storage_clients import ApifyStorageClient -from apify.storage_clients._file_system import ApifyFileSystemStorageClient -from apify.storages import Dataset, KeyValueStore - -if TYPE_CHECKING: - from collections.abc import Hashable - - from crawlee.configuration import Configuration as CrawleeConfiguration - - -@docs_group('Storage clients') -class 
ApifyHybridStorageClient(StorageClient): - """ApifyHybridStorageClient that delegates to cloud_storage_client or local_storage_client. - - When running on Apify platform use cloud_storage_client, else use local_storage_client. It has additional wrapper - methods with `force_cloud` parameter to force using cloud_storage_client when opening specific storages even when - not running on the Apify platform. This storage client is designed to work specifically in Actor context. - """ - - def __init__( - self, - *, - cloud_storage_client: ApifyStorageClient | None = None, - local_storage_client: StorageClient | None = None, - ) -> None: - """Initialize the Apify storage client. - - Args: - cloud_storage_client: Client used to communicate with the Apify platform storage. Either through - `force_cloud` argument when opening storages or automatically when running on the Apify platform. - local_storage_client: Client used to communicate with the storage when not running on the Apify - platform and not using `force_cloud` argument when opening storages. - """ - self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(request_queue_access='single') - self._local_storage_client = local_storage_client or ApifyFileSystemStorageClient() - - def _get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageClient: - if self._is_at_home: - return self._cloud_storage_client - - configuration = ApifyConfiguration.get_global_configuration() - if force_cloud: - if configuration.token is None: - raise RuntimeError( - 'In order to use the Apify cloud storage from your computer, ' - 'you need to provide an Apify token using the APIFY_TOKEN environment variable.' 
- ) - return self._cloud_storage_client - - return self._local_storage_client - - @override - def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable: - if self._is_at_home: - if isinstance(configuration, ApifyConfiguration): - return self._cloud_storage_client.get_additional_cache_key(configuration) - raise TypeError('Expecting ApifyConfiguration') - - return self._local_storage_client.get_additional_cache_key(configuration) - - @override - async def create_dataset_client( - self, - *, - id: str | None = None, - name: str | None = None, - alias: str | None = None, - configuration: CrawleeConfiguration | None = None, - ) -> DatasetClient: - return await self._get_suitable_storage_client().create_dataset_client( - id=id, name=id, alias=alias, configuration=configuration - ) - - @override - async def create_kvs_client( - self, - *, - id: str | None = None, - name: str | None = None, - alias: str | None = None, - configuration: CrawleeConfiguration | None = None, - ) -> KeyValueStoreClient: - return await self._get_suitable_storage_client().create_kvs_client( - id=id, name=id, alias=alias, configuration=configuration - ) - - @override - async def create_rq_client( - self, - *, - id: str | None = None, - name: str | None = None, - alias: str | None = None, - configuration: CrawleeConfiguration | None = None, - ) -> RequestQueueClient: - return await self._get_suitable_storage_client().create_rq_client( - id=id, name=id, alias=alias, configuration=configuration - ) - - async def open_dataset( - self, - *, - id: str | None = None, - alias: str | None = None, - name: str | None = None, - configuration: ApifyConfiguration | None = None, - force_cloud: bool = False, - ) -> Dataset: - """Open a dataset. - - Datasets are used to store structured data where each object stored has the same attributes, such as online - store products or real estate offers. The actual data is stored either on the local filesystem or in - the Apify cloud. 
- - Args: - id: The ID of the dataset to open. If provided, searches for existing dataset by ID. - Mutually exclusive with name and alias. - name: The name of the dataset to open (global scope, persists across runs). - Mutually exclusive with id and alias. - alias: The alias of the dataset to open (run scope, creates unnamed storage). - Mutually exclusive with id and name. - configuration: Configuration used to open the dataset. - force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible - to combine local and cloud storage. - - Returns: - An instance of the `Dataset` class for the given ID or name. - """ - return await Dataset.open( - id=id, - alias=alias, - name=name, - configuration=configuration, - storage_client=self._get_suitable_storage_client(force_cloud=force_cloud), - ) - - async def open_key_value_store( - self, - *, - id: str | None = None, - alias: str | None = None, - name: str | None = None, - configuration: ApifyConfiguration | None = None, - force_cloud: bool = False, - ) -> KeyValueStore: - """Open a key-value store. - - Key-value stores are used to store records or files, along with their MIME content type. The records are stored - and retrieved using a unique key. The actual data is stored either on a local filesystem or in the Apify cloud. - - Args: - id: The ID of the KVS to open. If provided, searches for existing KVS by ID. - Mutually exclusive with name and alias. - name: The name of the KVS to open (global scope, persists across runs). - Mutually exclusive with id and alias. - alias: The alias of the KVS to open (run scope, creates unnamed storage). - Mutually exclusive with id and name. - configuration: Configuration used to open the key value store. - force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible - to combine local and cloud storage. - - Returns: - An instance of the `KeyValueStore` class for the given ID or name. 
- """ - return await KeyValueStore.open( - id=id, - alias=alias, - name=name, - configuration=configuration, - storage_client=self._get_suitable_storage_client(force_cloud=force_cloud), - ) - - async def open_request_queue( - self, - *, - id: str | None = None, - alias: str | None = None, - name: str | None = None, - configuration: ApifyConfiguration | None = None, - force_cloud: bool = False, - ) -> RequestQueue: - """Open a request queue. - - Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in - the Apify cloud. The queue is used for deep crawling of websites, where you start with several URLs and then - recursively follow links to other pages. The data structure supports both breadth-first and depth-first - crawling orders. - - Args: - id: The ID of the RQ to open. If provided, searches for existing RQ by ID. - Mutually exclusive with name and alias. - name: The name of the RQ to open (global scope, persists across runs). - Mutually exclusive with id and alias. - alias: The alias of the RQ to open (run scope, creates unnamed storage). - Mutually exclusive with id and name. - configuration: Configuration used to open the request queue. - force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible - to combine local and cloud storage. - - Returns: - An instance of the `RequestQueue` class for the given ID or name. 
- """ - return await RequestQueue.open( - id=id, - alias=alias, - name=name, - configuration=configuration, - storage_client=self._get_suitable_storage_client(force_cloud=force_cloud), - ) - - @cached_property - def _is_at_home(self) -> bool: - configuration = ApifyConfiguration.get_global_configuration() - return configuration.is_at_home diff --git a/src/apify/storage_clients/_smart_apify/__init__.py b/src/apify/storage_clients/_smart_apify/__init__.py new file mode 100644 index 00000000..605be630 --- /dev/null +++ b/src/apify/storage_clients/_smart_apify/__init__.py @@ -0,0 +1 @@ +from ._storage_client import SmartApifyStorageClient diff --git a/src/apify/storage_clients/_smart_apify/_storage_client.py b/src/apify/storage_clients/_smart_apify/_storage_client.py new file mode 100644 index 00000000..85ebf1a7 --- /dev/null +++ b/src/apify/storage_clients/_smart_apify/_storage_client.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +from functools import cached_property +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient, StorageClient + +from apify._configuration import Configuration as ApifyConfiguration +from apify._utils import docs_group +from apify.storage_clients import ApifyStorageClient +from apify.storage_clients._file_system import ApifyFileSystemStorageClient + +if TYPE_CHECKING: + from collections.abc import Hashable + + from crawlee.configuration import Configuration as CrawleeConfiguration + + +@docs_group('Storage clients') +class SmartApifyStorageClient(StorageClient): + """SmartApifyStorageClient that delegates to cloud_storage_client or local_storage_client. + + When running on Apify platform use cloud_storage_client, else use local_storage_client. 
It has additional wrapper + methods with `force_cloud` parameter to force using cloud_storage_client when opening specific storages even when + not running on the Apify platform. This storage client is designed to work specifically in Actor context. + """ + + def __init__( + self, + *, + cloud_storage_client: ApifyStorageClient | None = None, + local_storage_client: StorageClient | None = None, + ) -> None: + """Initialize the Apify storage client. + + Args: + cloud_storage_client: Client used to communicate with the Apify platform storage. Either through + `force_cloud` argument when opening storages or automatically when running on the Apify platform. + local_storage_client: Client used to communicate with the storage when not running on the Apify + platform and not using `force_cloud` argument when opening storages. + """ + self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(request_queue_access='single') + self._local_storage_client = local_storage_client or ApifyFileSystemStorageClient() + + def __str__(self) -> str: + return ( + f'{self.__class__.__name__}(cloud_storage_client={self._cloud_storage_client.__class__.__name__},' + f' local_storage_client={self._local_storage_client.__class__.__name__})' + ) + + def get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageClient: + """Get a suitable storage client based on the global configuration and the value of the force_cloud flag. + + Args: + force_cloud: If True, return `cloud_storage_client`. + """ + if self._is_at_home: + return self._cloud_storage_client + + configuration = ApifyConfiguration.get_global_configuration() + if force_cloud: + if configuration.token is None: + raise RuntimeError( + 'In order to use the Apify cloud storage from your computer, ' + 'you need to provide an Apify token using the APIFY_TOKEN environment variable.' 
+                )
+            return self._cloud_storage_client
+
+        return self._local_storage_client
+
+    @override
+    def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
+        if self._is_at_home:
+            if isinstance(configuration, ApifyConfiguration):
+                return self._cloud_storage_client.get_additional_cache_key(configuration)
+            raise TypeError('Expecting ApifyConfiguration')
+
+        return self._local_storage_client.get_additional_cache_key(configuration)
+
+    @override
+    async def create_dataset_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: CrawleeConfiguration | None = None,
+    ) -> DatasetClient:
+        return await self.get_suitable_storage_client().create_dataset_client(
+            id=id, name=name, alias=alias, configuration=configuration
+        )
+
+    @override
+    async def create_kvs_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: CrawleeConfiguration | None = None,
+    ) -> KeyValueStoreClient:
+        return await self.get_suitable_storage_client().create_kvs_client(
+            id=id, name=name, alias=alias, configuration=configuration
+        )
+
+    @override
+    async def create_rq_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: CrawleeConfiguration | None = None,
+    ) -> RequestQueueClient:
+        return await self.get_suitable_storage_client().create_rq_client(
+            id=id, name=id, alias=alias, configuration=configuration
+        )
+
+    @cached_property
+    def _is_at_home(self) -> bool:
+        configuration = ApifyConfiguration.get_global_configuration()
+        return configuration.is_at_home
diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py
index c9658935..af827bc4 100644
--- a/tests/integration/test_apify_storages.py
+++ b/tests/integration/test_apify_storages.py
@@ -7,7 +7,7 @@
 
 from .conftest import MakeActorFunction, RunActorFunction
 
 from apify import Actor, Configuration
-from apify.storage_clients import ApifyHybridStorageClient, ApifyStorageClient, MemoryStorageClient +from apify.storage_clients import ApifyStorageClient, MemoryStorageClient, SmartApifyStorageClient @pytest.mark.parametrize( @@ -38,13 +38,13 @@ async def test_alias_concurrent_creation_local( async def test_actor_full_explicit_storage_init(apify_token: str) -> None: service_locator.set_configuration(Configuration(token=apify_token)) service_locator.set_storage_client( - ApifyHybridStorageClient( + SmartApifyStorageClient( local_storage_client=MemoryStorageClient(), cloud_storage_client=ApifyStorageClient(request_queue_access='shared'), ) ) - async with Actor(): - # If service locator was already set with ApifyHybridStorageClient, the actor will use it. + async with Actor: + # If service locator was already set with SmartApifyStorageClient, the actor will use it. # Storages should be different when force_cloud is used outside the Apify platform assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) @@ -54,13 +54,13 @@ async def test_actor_full_explicit_storage_init(apify_token: str) -> None: async def test_actor_full_explicit_storage_init_same_client(apify_token: str) -> None: service_locator.set_configuration(Configuration(token=apify_token)) service_locator.set_storage_client( - ApifyHybridStorageClient( + SmartApifyStorageClient( local_storage_client=ApifyStorageClient(request_queue_access='shared'), cloud_storage_client=ApifyStorageClient(request_queue_access='shared'), ) ) - async with Actor(): - # If service locator was already set with ApifyHybridStorageClient, the actor will use it. + async with Actor: + # If service locator was already set with SmartApifyStorageClient, the actor will use it. 
# Storages should be same as the equivalent storage client is for both local and cloud storage client assert await Actor.open_dataset() is await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is await Actor.open_key_value_store(force_cloud=True) @@ -70,9 +70,9 @@ async def test_actor_full_explicit_storage_init_same_client(apify_token: str) -> async def test_actor_partial_explicit_cloud_storage_init(apify_token: str) -> None: service_locator.set_configuration(Configuration(token=apify_token)) service_locator.set_storage_client(ApifyStorageClient(request_queue_access='shared')) - async with Actor(): + async with Actor: # If service locator was already set with ApifyStorageClient, the actor will use it as cloud_storage_client of - # ApifyHybridStorageClient + # SmartApifyStorageClient assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) @@ -81,9 +81,9 @@ async def test_actor_partial_explicit_cloud_storage_init(apify_token: str) -> No async def test_actor_partial_explicit_local_storage_init(apify_token: str) -> None: service_locator.set_configuration(Configuration(token=apify_token)) service_locator.set_storage_client(MemoryStorageClient()) - async with Actor(): + async with Actor: # If service locator was already set with non-ApifyStorageClient, the actor will use it as local_storage_client - # of ApifyHybridStorageClient + # of SmartApifyStorageClient assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) @@ -91,7 +91,7 @@ async def test_actor_partial_explicit_local_storage_init(apify_token: 
str) -> No async def test_actor_implicit_storage_init(apify_token: str) -> None: service_locator.set_configuration(Configuration(token=apify_token)) - async with Actor(): + async with Actor: assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) @@ -103,15 +103,15 @@ async def test_actor_full_explicit_storage_init_on_platform( async def main() -> None: from crawlee import service_locator - from apify.storage_clients import ApifyHybridStorageClient, ApifyStorageClient, MemoryStorageClient + from apify.storage_clients import ApifyStorageClient, MemoryStorageClient, SmartApifyStorageClient service_locator.set_storage_client( - ApifyHybridStorageClient( + SmartApifyStorageClient( local_storage_client=MemoryStorageClient(), cloud_storage_client=ApifyStorageClient(request_queue_access='shared'), ) ) - async with Actor(): + async with Actor: # Storages should be same as the cloud client is used on the platform assert await Actor.open_dataset() is await Actor.open_dataset(force_cloud=True) assert await Actor.open_key_value_store() is await Actor.open_key_value_store(force_cloud=True) diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index 83e9eb2e..bf6724ff 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -11,7 +11,7 @@ from apify import Actor from apify import Configuration as ApifyConfiguration -from apify.storage_clients._hybrid_apify._storage_client import ApifyHybridStorageClient +from apify.storage_clients._smart_apify._storage_client import SmartApifyStorageClient @pytest.mark.parametrize( @@ -111,8 +111,8 @@ async def test_crawler_implicit_local_storage() -> None: async with Actor(): crawler = BasicCrawler() - assert 
isinstance(service_locator.get_storage_client(), ApifyHybridStorageClient) - assert isinstance(crawler._service_locator.get_storage_client(), ApifyHybridStorageClient) + assert isinstance(service_locator.get_storage_client(), SmartApifyStorageClient) + assert isinstance(crawler._service_locator.get_storage_client(), SmartApifyStorageClient) async def test_crawlers_own_configuration(tmp_path: Path) -> None: From 1e8a834c2946ed1900276ab8ba4dacbc142ea27a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 24 Sep 2025 17:28:45 +0200 Subject: [PATCH 21/26] Review comments --- src/apify/_actor.py | 26 +++++++------------ src/apify/storage_clients/_apify/_utils.py | 2 +- .../_smart_apify/_storage_client.py | 10 ++----- tests/integration/test_apify_storages.py | 24 +++++------------ tests/unit/actor/test_configuration.py | 14 ---------- 5 files changed, 18 insertions(+), 58 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 08635f6e..965725e2 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -25,7 +25,6 @@ EventPersistStateData, EventSystemInfoData, ) -from crawlee.storage_clients import FileSystemStorageClient from apify._charging import ChargeResult, ChargingManager, ChargingManagerImplementation from apify._configuration import Configuration @@ -258,24 +257,17 @@ def _storage_client(self) -> SmartApifyStorageClient: return implicit_storage_client # User set something in the service locator. - storage_client = service_locator.get_storage_client() - if isinstance(storage_client, SmartApifyStorageClient): + explicit_storage_client = service_locator.get_storage_client() + if isinstance(explicit_storage_client, SmartApifyStorageClient): # The client was manually set to the right type in the service locator. This is the explicit way. - return storage_client - - if isinstance(storage_client, ApifyStorageClient): - # The cloud storage client was manually set in the service locator. 
- return SmartApifyStorageClient(cloud_storage_client=storage_client) - - # The local storage client was manually set in the service locator - if type(storage_client) is FileSystemStorageClient: - self.log.warning( - f'Using {FileSystemStorageClient.__module__}.{FileSystemStorageClient.__name__} in Actor context is not' - f' recommended and can lead to problems with reading the input file. Use ' - f'`apify.storage_clients.FileSystemStorageClient` instead.' - ) + return explicit_storage_client - return SmartApifyStorageClient(cloud_storage_client=ApifyStorageClient(), local_storage_client=storage_client) + raise RuntimeError( + 'The storage client in the service locator has to be instance of SmartApifyStorageClient. If you want to ' + 'set the storage client manually you have to call ' + '`service_locator.set_storage_client(SmartApifyStorageClient(...))` before entering Actor context or ' + 'awaiting `Actor.init`.' + ) async def init(self) -> None: """Initialize the Actor instance. diff --git a/src/apify/storage_clients/_apify/_utils.py b/src/apify/storage_clients/_apify/_utils.py index ebae80f7..8492da08 100644 --- a/src/apify/storage_clients/_apify/_utils.py +++ b/src/apify/storage_clients/_apify/_utils.py @@ -107,7 +107,7 @@ async def store_mapping(self, storage_id: str) -> None: # Update in-memory mapping (await self._get_alias_map())[self._storage_key] = storage_id if not Configuration.get_global_configuration().is_at_home: - logging.getLogger(__name__).warning( + logging.getLogger(__name__).debug( 'AliasResolver storage limited retention is only supported on Apify platform. Storage is not exported.' 
) return diff --git a/src/apify/storage_clients/_smart_apify/_storage_client.py b/src/apify/storage_clients/_smart_apify/_storage_client.py index 85ebf1a7..5a36598e 100644 --- a/src/apify/storage_clients/_smart_apify/_storage_client.py +++ b/src/apify/storage_clients/_smart_apify/_storage_client.py @@ -1,6 +1,5 @@ from __future__ import annotations -from functools import cached_property from typing import TYPE_CHECKING from typing_extensions import override @@ -56,7 +55,7 @@ def get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageCl Args: force_cloud: If True, return `cloud_storage_client`. """ - if self._is_at_home: + if ApifyConfiguration.get_global_configuration().is_at_home: return self._cloud_storage_client configuration = ApifyConfiguration.get_global_configuration() @@ -72,7 +71,7 @@ def get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageCl @override def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable: - if self._is_at_home: + if ApifyConfiguration.get_global_configuration().is_at_home: if isinstance(configuration, ApifyConfiguration): return self._cloud_storage_client.get_additional_cache_key(configuration) raise TypeError('Expecting ApifyConfiguration') @@ -117,8 +116,3 @@ async def create_rq_client( return await self.get_suitable_storage_client().create_rq_client( id=id, name=id, alias=alias, configuration=configuration ) - - @cached_property - def _is_at_home(self) -> bool: - configuration = ApifyConfiguration.get_global_configuration() - return configuration.is_at_home diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py index d605c34c..32cb5061 100644 --- a/tests/integration/test_apify_storages.py +++ b/tests/integration/test_apify_storages.py @@ -67,7 +67,6 @@ async def test_aliases_not_stored_on_platform_when_local( ) -> None: """Test that default Apify storage used locally is not persisting aliases to Apify based default KVS.""" 
service_locator.set_configuration(Configuration(token=apify_token)) - service_locator.set_storage_client(ApifyStorageClient()) async with Actor(configure_logging=False): await storage_type.open(alias='test') default_kvs = await Actor.open_key_value_store(force_cloud=True) @@ -111,23 +110,12 @@ async def test_actor_full_explicit_storage_init_same_client(apify_token: str) -> async def test_actor_partial_explicit_cloud_storage_init(apify_token: str) -> None: service_locator.set_configuration(Configuration(token=apify_token)) service_locator.set_storage_client(ApifyStorageClient(request_queue_access='shared')) - async with Actor: - # If service locator was already set with ApifyStorageClient, the actor will use it as cloud_storage_client of - # SmartApifyStorageClient - assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) - assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) - assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) - - -async def test_actor_partial_explicit_local_storage_init(apify_token: str) -> None: - service_locator.set_configuration(Configuration(token=apify_token)) - service_locator.set_storage_client(MemoryStorageClient()) - async with Actor: - # If service locator was already set with non-ApifyStorageClient, the actor will use it as local_storage_client - # of SmartApifyStorageClient - assert await Actor.open_dataset() is not await Actor.open_dataset(force_cloud=True) - assert await Actor.open_key_value_store() is not await Actor.open_key_value_store(force_cloud=True) - assert await Actor.open_request_queue() is not await Actor.open_request_queue(force_cloud=True) + with pytest.raises( + RuntimeError, match=r'^The storage client in the service locator has to be instance of SmartApifyStorageClient' + ): + async with Actor: + # If service locator was explicitly set to something different than SmartApifyStorageClient, raise an 
error. + ... async def test_actor_implicit_storage_init(apify_token: str) -> None: diff --git a/tests/unit/actor/test_configuration.py b/tests/unit/actor/test_configuration.py index bf6724ff..97500eab 100644 --- a/tests/unit/actor/test_configuration.py +++ b/tests/unit/actor/test_configuration.py @@ -7,7 +7,6 @@ from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.crawlers import BasicCrawler from crawlee.errors import ServiceConflictError -from crawlee.storage_clients import FileSystemStorageClient from apify import Actor from apify import Configuration as ApifyConfiguration @@ -242,16 +241,3 @@ def test_apify_configuration_is_always_used(caplog: pytest.LogCaptureFixture) -> 'It is recommended to set `apify.Configuration` explicitly as early as possible by using ' 'service_locator.set_configuration' ) in caplog.messages - - -async def test_file_system_storage_client_warning(caplog: pytest.LogCaptureFixture) -> None: - service_locator.set_storage_client(FileSystemStorageClient()) - caplog.set_level('WARNING') - async with Actor(): - ... - - assert ( - 'Using crawlee.storage_clients._file_system._storage_client.FileSystemStorageClient in Actor context is not ' - 'recommended and can lead to problems with reading the input file. Use ' - '`apify.storage_clients.FileSystemStorageClient` instead.' 
- ) in caplog.messages From de941d4344846e822f727982e6fdc1d6b7d8b5af Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 25 Sep 2025 10:58:50 +0200 Subject: [PATCH 22/26] Update based on Crawlee update --- pyproject.toml | 2 +- src/apify/storage_clients/_apify/_storage_client.py | 4 ++-- .../storage_clients/_smart_apify/_storage_client.py | 6 +++--- tests/integration/actor_source_base/requirements.txt | 2 +- uv.lock | 10 +++++++--- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7049074d..a4b4fe4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ keywords = [ dependencies = [ "apify-client>=2.0.0,<3.0.0", "apify-shared>=2.0.0,<3.0.0", - "crawlee @ git+https://github.com/apify/crawlee-python.git@include-storag-client-in-additional-cache-key", + "crawlee==0.6.13b46", "cachetools>=5.5.0", "cryptography>=42.0.0", "impit>=0.6.1", diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index c3a09399..7c241b07 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -43,11 +43,11 @@ def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'singl ) @override - def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable: + def get_storage_client_cache_key(self, configuration: CrawleeConfiguration) -> Hashable: if isinstance(configuration, ApifyConfiguration): # Current design does not support opening exactly same queue with full and simple client at the same time, # due to default and unnamed storages. Whichever client variation gets used first, wins. 
- return hash_api_base_url_and_token(configuration) + return super().get_storage_client_cache_key(configuration), hash_api_base_url_and_token(configuration) config_class = type(configuration) raise TypeError( diff --git a/src/apify/storage_clients/_smart_apify/_storage_client.py b/src/apify/storage_clients/_smart_apify/_storage_client.py index 5a36598e..a5b20b9f 100644 --- a/src/apify/storage_clients/_smart_apify/_storage_client.py +++ b/src/apify/storage_clients/_smart_apify/_storage_client.py @@ -70,13 +70,13 @@ def get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageCl return self._local_storage_client @override - def get_additional_cache_key(self, configuration: CrawleeConfiguration) -> Hashable: + def get_storage_client_cache_key(self, configuration: CrawleeConfiguration) -> Hashable: if ApifyConfiguration.get_global_configuration().is_at_home: if isinstance(configuration, ApifyConfiguration): - return self._cloud_storage_client.get_additional_cache_key(configuration) + return self._cloud_storage_client.get_storage_client_cache_key(configuration) raise TypeError('Expecting ApifyConfiguration') - return self._local_storage_client.get_additional_cache_key(configuration) + return self._local_storage_client.get_storage_client_cache_key(configuration) @override async def create_dataset_client( diff --git a/tests/integration/actor_source_base/requirements.txt b/tests/integration/actor_source_base/requirements.txt index a7bc9105..7f4f8246 100644 --- a/tests/integration/actor_source_base/requirements.txt +++ b/tests/integration/actor_source_base/requirements.txt @@ -1,4 +1,4 @@ # The test fixture will put the Apify SDK wheel path on the next line APIFY_SDK_WHEEL_PLACEHOLDER uvicorn[standard] -crawlee[parsel] @ git+https://github.com/apify/crawlee-python.git@include-storag-client-in-additional-cache-key +crawlee[parsel]==0.6.13b46 diff --git a/uv.lock b/uv.lock index d1eee088..5f826e54 100644 --- a/uv.lock +++ b/uv.lock @@ -76,7 +76,7 @@ 
requires-dist = [ { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", git = "https://github.com/apify/crawlee-python.git?rev=include-storag-client-in-additional-cache-key" }, + { name = "crawlee", specifier = "==0.6.13b46" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "impit", specifier = ">=0.6.1" }, { name = "lazy-object-proxy", specifier = ">=1.11.0" }, @@ -516,8 +516,8 @@ toml = [ [[package]] name = "crawlee" -version = "0.6.13" -source = { git = "https://github.com/apify/crawlee-python.git?rev=include-storag-client-in-additional-cache-key#182b872c9b18ebdc54a442e2331ea2e3172ab8e6" } +version = "0.6.13b46" +source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cachetools" }, { name = "colorama" }, @@ -532,6 +532,10 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/94/5d/f42c7684b0120eaaf0fd0b7667e6222ab0e0bed2c197a348ad6b534061e8/crawlee-0.6.13b46.tar.gz", hash = "sha256:a1ba1fd649c5673801b85c7b3035c8288a8f783a25e6980961d87d08d55701d4", size = 24846309, upload-time = "2025-09-25T06:40:30.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/a8/1210d96728108c0ff1d1cd935fb41998768265a9e1a03265f2476c2d734f/crawlee-0.6.13b46-py3-none-any.whl", hash = "sha256:48788749a861024fa21eba114c1c209e34a321bc5475e0ce38c493ada333f785", size = 280069, upload-time = "2025-09-25T06:40:28.272Z" }, +] [package.optional-dependencies] parsel = [ From c5968bc7d77d8dffa17a05498a2ca25d8978dd34 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 25 Sep 2025 13:25:12 +0200 Subject: [PATCH 23/26] Use composition instead of inheritance --- .../_apify/_request_queue_client.py | 162 ++++++++++++++---- .../_apify/_request_queue_shared_client.py | 45 +++-- .../_apify/_request_queue_single_client.py | 31 ++-- 
.../storage_clients/_apify/_storage_client.py | 12 +- src/apify/storage_clients/_apify/_utils.py | 26 +++ .../test_apify_request_queue_client.py | 2 +- 6 files changed, 191 insertions(+), 87 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 746a8989..85d74fbd 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -1,53 +1,32 @@ from __future__ import annotations -import re -from base64 import b64encode -from hashlib import sha256 from logging import getLogger -from typing import TYPE_CHECKING, Final +from typing import TYPE_CHECKING, Final, Literal from typing_extensions import override from apify_client import ApifyClientAsync from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients._base import RequestQueueClient -from crawlee.storage_clients.models import RequestQueueMetadata +from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata from crawlee.storages import RequestQueue from ._models import ApifyRequestQueueMetadata, RequestQueueStats +from ._request_queue_shared_client import _ApifyRequestQueueSharedClient +from ._request_queue_single_client import _ApifyRequestQueueSingleClient from ._utils import AliasResolver if TYPE_CHECKING: + from collections.abc import Sequence + from apify_client.clients import RequestQueueClientAsync + from crawlee import Request from apify import Configuration logger = getLogger(__name__) -def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str: - """Generate a deterministic request ID based on a unique key. - - Args: - unique_key: The unique key to convert into a request ID. - request_id_length: The length of the request ID. - - Returns: - A URL-safe, truncated request ID based on the unique key. 
- """ - # Encode the unique key and compute its SHA-256 hash - hashed_key = sha256(unique_key.encode('utf-8')).digest() - - # Encode the hash in base64 and decode it to get a string - base64_encoded = b64encode(hashed_key).decode('utf-8') - - # Remove characters that are not URL-safe ('+', '/', or '=') - url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded) - - # Truncate the key to the desired length - return url_safe_key[:request_id_length] - - class ApifyRequestQueueClient(RequestQueueClient): """Base class for Apify platform implementations of the request queue client.""" @@ -59,6 +38,7 @@ def __init__( *, api_client: RequestQueueClientAsync, metadata: RequestQueueMetadata, + access: Literal['single', 'shared'] = 'single', ) -> None: """Initialize a new instance. @@ -67,8 +47,112 @@ def __init__( self._api_client = api_client """The Apify request queue client for API operations.""" - self._metadata = metadata - """Additional data related to the RequestQueue.""" + self._implementation: _ApifyRequestQueueSingleClient | _ApifyRequestQueueSharedClient + """Internal implementation used to communicate with the Apify platform based Request Queue.""" + if access == 'single': + self._implementation = _ApifyRequestQueueSingleClient( + api_client=self._api_client, metadata=metadata, cache_size=self._MAX_CACHED_REQUESTS + ) + elif access == 'shared': + self._implementation = _ApifyRequestQueueSharedClient( + api_client=self._api_client, + metadata=metadata, + cache_size=self._MAX_CACHED_REQUESTS, + metadata_getter=self.get_metadata, + ) + else: + raise RuntimeError(f"Unsupported access type: {access}. Allowed values are 'single' or 'shared'.") + + @property + def _metadata(self) -> RequestQueueMetadata: + return self._implementation.metadata + + @override + async def add_batch_of_requests( + self, + requests: Sequence[Request], + *, + forefront: bool = False, + ) -> AddRequestsResponse: + """Add a batch of requests to the queue. + + Args: + requests: The requests to add. 
+ forefront: Whether to add the requests to the beginning of the queue. + + Returns: + Response containing information about the added requests. + """ + return await self._implementation.add_batch_of_requests(requests, forefront=forefront) + + @override + async def fetch_next_request(self) -> Request | None: + """Return the next request in the queue to be processed. + + Once you successfully finish processing of the request, you need to call `mark_request_as_handled` + to mark the request as handled in the queue. If there was some error in processing the request, call + `reclaim_request` instead, so that the queue will give the request to some other consumer + in another call to the `fetch_next_request` method. + + Returns: + The request or `None` if there are no more pending requests. + """ + return await self._implementation.fetch_next_request() + + @override + async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: + """Mark a request as handled after successful processing. + + Handled requests will never again be returned by the `fetch_next_request` method. + + Args: + request: The request to mark as handled. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. + """ + return await self._implementation.mark_request_as_handled(request) + + @override + async def get_request(self, unique_key: str) -> Request | None: + """Get a request by unique key. + + Args: + unique_key: Unique key of the request to get. + + Returns: + The request or None if not found. + """ + return await self._implementation.get_request(unique_key) + + @override + async def reclaim_request( + self, + request: Request, + *, + forefront: bool = False, + ) -> ProcessedRequest | None: + """Reclaim a failed request back to the queue. + + The request will be returned for processing later again by another call to `fetch_next_request`. + + Args: + request: The request to return to the queue. 
+ forefront: Whether to add the request to the head or the end of the queue. + + Returns: + Information about the queue operation. `None` if the given request was not in progress. + """ + return await self._implementation.reclaim_request(request, forefront=forefront) + + @override + async def is_empty(self) -> bool: + """Check if the queue is empty. + + Returns: + True if the queue is empty, False otherwise. + """ + return await self._implementation.is_empty() @override async def get_metadata(self) -> ApifyRequestQueueMetadata: @@ -103,6 +187,7 @@ async def open( name: str | None, alias: str | None, configuration: Configuration, + access: Literal['single', 'shared'] = 'single', ) -> ApifyRequestQueueClient: """Open an Apify request queue client. @@ -120,6 +205,18 @@ async def open( configuration: The configuration object containing API credentials and settings. Must include a valid `token` and `api_base_url`. May also contain a `default_request_queue_id` for fallback when neither `id`, `name`, nor `alias` is provided. + access: Controls the implementation of the request queue client based on expected scenario: + - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster. + - 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage. + + Detailed constraints for the 'single' access type: + - Only one client is consuming the request queue at the time. + - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to + be handled so quickly as this client does not aggressively fetch the forefront and relies on local + head estimation. + - Requests are only added to the queue, never deleted by other clients. (Marking as handled is ok.) + - Other producers can add new requests, but not modify existing ones. + (Modifications would not be included in local cache) Returns: An instance for the opened or created storage client. 
@@ -217,10 +314,7 @@ async def open( metadata_model = RequestQueueMetadata.model_validate(metadata) - return cls( - api_client=apify_rq_client, - metadata=metadata_model, - ) + return cls(api_client=apify_rq_client, metadata=metadata_model, access=access) @override async def purge(self) -> None: diff --git a/src/apify/storage_clients/_apify/_request_queue_shared_client.py b/src/apify/storage_clients/_apify/_request_queue_shared_client.py index 56c14c84..162cf71e 100644 --- a/src/apify/storage_clients/_apify/_request_queue_shared_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_shared_client.py @@ -4,20 +4,18 @@ from collections import deque from datetime import datetime, timedelta, timezone from logging import getLogger -from typing import TYPE_CHECKING, Final +from typing import TYPE_CHECKING, Any, Final from cachetools import LRUCache -from typing_extensions import override from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata -from . import ApifyRequestQueueClient -from ._models import CachedRequest, RequestQueueHead -from ._request_queue_client import unique_key_to_request_id +from ._models import ApifyRequestQueueMetadata, CachedRequest, RequestQueueHead +from ._utils import unique_key_to_request_id from apify import Request if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Coroutine, Sequence from apify_client.clients import RequestQueueClientAsync @@ -25,7 +23,7 @@ logger = getLogger(__name__) -class ApifyRequestQueueSharedClient(ApifyRequestQueueClient): +class _ApifyRequestQueueSharedClient: """An Apify platform implementation of the request queue client. This implementation supports multiple producers and multiple consumers scenario. 
@@ -39,21 +37,26 @@ def __init__( *, api_client: RequestQueueClientAsync, metadata: RequestQueueMetadata, + cache_size: int, + metadata_getter: Callable[[], Coroutine[Any, Any, ApifyRequestQueueMetadata]], ) -> None: """Initialize a new instance. Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. """ + self.metadata = metadata + """Additional data related to the RequestQueue.""" + + self._metadata_getter = metadata_getter + """Async function to get metadata from API.""" + self._api_client = api_client """The Apify request queue client for API operations.""" - self._metadata = metadata - """Additional data related to the RequestQueue.""" - self._queue_head = deque[str]() """A deque to store request unique keys in the queue head.""" - self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=cache_size) """A cache to store request objects. Request unique key is used as the cache key.""" self._queue_has_locked_requests: bool | None = None @@ -72,12 +75,11 @@ async def _get_metadata_estimate(self) -> RequestQueueMetadata: Local estimation of metadata is without delay, unlike metadata from API. In situation where there is only one client, it is the better choice. 
""" - if self._metadata.had_multiple_clients: - return await self.get_metadata() + if self.metadata.had_multiple_clients: + return await self._metadata_getter() # Get local estimation (will not include changes done bo another client) - return self._metadata + return self.metadata - @override async def add_batch_of_requests( self, requests: Sequence[Request], @@ -167,11 +169,10 @@ async def add_batch_of_requests( if not processed_request.was_already_present and not processed_request.was_already_handled: new_request_count += 1 - self._metadata.total_request_count += new_request_count + self.metadata.total_request_count += new_request_count return api_response - @override async def get_request(self, unique_key: str) -> Request | None: """Get a request by unique key. @@ -188,7 +189,6 @@ async def get_request(self, unique_key: str) -> Request | None: return Request.model_validate(response) - @override async def fetch_next_request(self) -> Request | None: """Return the next request in the queue to be processed. @@ -240,7 +240,6 @@ async def fetch_next_request(self) -> Request | None: return request - @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: """Mark a request as handled after successful processing. @@ -265,7 +264,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | # Update assumed handled count if this wasn't already handled if not processed_request.was_already_handled: - self._metadata.handled_request_count += 1 + self.metadata.handled_request_count += 1 # Update the cache with the handled request cache_key = request.unique_key @@ -280,7 +279,6 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | else: return processed_request - @override async def reclaim_request( self, request: Request, @@ -313,7 +311,7 @@ async def reclaim_request( # If the request was previously handled, decrement our handled count since # we're putting it back for processing. 
if request.was_already_handled and not processed_request.was_already_handled: - self._metadata.handled_request_count -= 1 + self.metadata.handled_request_count -= 1 # Update the cache cache_key = request.unique_key @@ -334,7 +332,6 @@ async def reclaim_request( else: return processed_request - @override async def is_empty(self) -> bool: """Check if the queue is empty. @@ -472,7 +469,7 @@ async def _list_head( # Update the queue head cache self._queue_has_locked_requests = response.get('queueHasLockedRequests', False) # Check if there is another client working with the RequestQueue - self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) + self.metadata.had_multiple_clients = response.get('hadMultipleClients', False) for request_data in response.get('items', []): request = Request.model_validate(request_data) diff --git a/src/apify/storage_clients/_apify/_request_queue_single_client.py b/src/apify/storage_clients/_apify/_request_queue_single_client.py index 53446a5e..d1814bb4 100644 --- a/src/apify/storage_clients/_apify/_request_queue_single_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_single_client.py @@ -6,13 +6,11 @@ from typing import TYPE_CHECKING, Final from cachetools import LRUCache -from typing_extensions import override from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata from apify import Request -from apify.storage_clients._apify import ApifyRequestQueueClient -from apify.storage_clients._apify._request_queue_client import unique_key_to_request_id +from apify.storage_clients._apify._utils import unique_key_to_request_id if TYPE_CHECKING: from collections.abc import Sequence @@ -23,7 +21,7 @@ logger = getLogger(__name__) -class ApifyRequestQueueSingleClient(ApifyRequestQueueClient): +class _ApifyRequestQueueSingleClient: """An Apify platform implementation of the request queue client with limited capability. 
This client is designed to use as little resources as possible, but has to be used in constrained context. @@ -45,18 +43,19 @@ def __init__( *, api_client: RequestQueueClientAsync, metadata: RequestQueueMetadata, + cache_size: int, ) -> None: """Initialize a new instance. Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance. """ + self.metadata = metadata + """Additional data related to the RequestQueue.""" + self._api_client = api_client """The Apify request queue client for API operations.""" - self._metadata = metadata - """Additional data related to the RequestQueue.""" - - self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=self._MAX_CACHED_REQUESTS) + self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=cache_size) """A cache to store request objects. Request unique key is used as the cache key.""" self._head_requests: deque[str] = deque() @@ -82,7 +81,6 @@ def __init__( Initialization is done lazily only if deduplication is needed (When calling add_batch_of_requests). """ - @override async def add_batch_of_requests( self, requests: Sequence[Request], @@ -169,11 +167,10 @@ async def add_batch_of_requests( for processed_request in api_response.processed_requests: if not processed_request.was_already_present and not processed_request.was_already_handled: new_request_count += 1 - self._metadata.total_request_count += new_request_count + self.metadata.total_request_count += new_request_count return api_response - @override async def get_request(self, unique_key: str) -> Request | None: """Get a request by unique key. @@ -193,7 +190,6 @@ async def get_request(self, unique_key: str) -> Request | None: return Request.model_validate(response) - @override async def fetch_next_request(self) -> Request | None: """Return the next request in the queue to be processed. 
@@ -231,11 +227,11 @@ async def _list_head(self) -> None: # Update metadata # Check if there is another client working with the RequestQueue - self._metadata.had_multiple_clients = response.get('hadMultipleClients', False) + self.metadata.had_multiple_clients = response.get('hadMultipleClients', False) # Should warn once? This might be outside expected context if the other consumers consumes at the same time if modified_at := response.get('queueModifiedAt'): - self._metadata.modified_at = max(self._metadata.modified_at, modified_at) + self.metadata.modified_at = max(self.metadata.modified_at, modified_at) # Update the cached data for request_data in response.get('items', []): @@ -259,7 +255,6 @@ async def _list_head(self) -> None: if request.unique_key not in self._head_requests: self._head_requests.appendleft(request.unique_key) - @override async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: """Mark a request as handled after successful processing. @@ -275,7 +270,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | if request.handled_at is None: request.handled_at = datetime.now(tz=timezone.utc) - self._metadata.handled_request_count += 1 + self.metadata.handled_request_count += 1 if cached_request := self._requests_cache.get(request.unique_key): cached_request.handled_at = request.handled_at @@ -297,7 +292,6 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | else: return processed_request - @override async def reclaim_request( self, request: Request, @@ -338,7 +332,7 @@ async def reclaim_request( # If the request was previously handled, decrement our handled count since # we're putting it back for processing. 
if request.was_already_handled and not processed_request.was_already_handled: - self._metadata.handled_request_count -= 1 + self.metadata.handled_request_count -= 1 except Exception as exc: logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}') @@ -346,7 +340,6 @@ async def reclaim_request( else: return processed_request - @override async def is_empty(self) -> bool: """Check if the queue is empty. diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 7c241b07..0702e964 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -8,8 +8,7 @@ from ._dataset_client import ApifyDatasetClient from ._key_value_store_client import ApifyKeyValueStoreClient -from ._request_queue_shared_client import ApifyRequestQueueSharedClient -from ._request_queue_single_client import ApifyRequestQueueSingleClient +from ._request_queue_client import ApifyRequestQueueClient from ._utils import hash_api_base_url_and_token from apify._configuration import Configuration as ApifyConfiguration from apify._utils import docs_group @@ -19,8 +18,6 @@ from crawlee.configuration import Configuration as CrawleeConfiguration - from ._request_queue_client import ApifyRequestQueueClient - @docs_group('Storage clients') class ApifyStorageClient(StorageClient): @@ -95,11 +92,8 @@ async def create_rq_client( ) -> ApifyRequestQueueClient: configuration = configuration or ApifyConfiguration.get_global_configuration() if isinstance(configuration, ApifyConfiguration): - client: type[ApifyRequestQueueClient] = ( - ApifyRequestQueueSingleClient - if self._request_queue_access == 'single' - else ApifyRequestQueueSharedClient + return await ApifyRequestQueueClient.open( + id=id, name=name, alias=alias, configuration=configuration, access=self._request_queue_access ) - return await client.open(id=id, name=name, alias=alias, configuration=configuration) raise 
TypeError(self._lsp_violation_error_message_template.format(type(configuration).__name__)) diff --git a/src/apify/storage_clients/_apify/_utils.py b/src/apify/storage_clients/_apify/_utils.py index 8492da08..eee87367 100644 --- a/src/apify/storage_clients/_apify/_utils.py +++ b/src/apify/storage_clients/_apify/_utils.py @@ -1,7 +1,10 @@ from __future__ import annotations import logging +import re from asyncio import Lock +from base64 import b64encode +from hashlib import sha256 from logging import getLogger from typing import TYPE_CHECKING, ClassVar @@ -166,3 +169,26 @@ def hash_api_base_url_and_token(configuration: Configuration) -> str: if configuration.api_public_base_url is None or configuration.token is None: raise ValueError("'Configuration.api_public_base_url' and 'Configuration.token' must be set.") return compute_short_hash(f'{configuration.api_public_base_url}{configuration.token}'.encode()) + + +def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str: + """Generate a deterministic request ID based on a unique key. + + Args: + unique_key: The unique key to convert into a request ID. + request_id_length: The length of the request ID. + + Returns: + A URL-safe, truncated request ID based on the unique key. 
+ """ + # Encode the unique key and compute its SHA-256 hash + hashed_key = sha256(unique_key.encode('utf-8')).digest() + + # Encode the hash in base64 and decode it to get a string + base64_encoded = b64encode(hashed_key).decode('utf-8') + + # Remove characters that are not URL-safe ('+', '/', or '=') + url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded) + + # Truncate the key to the desired length + return url_safe_key[:request_id_length] diff --git a/tests/unit/storage_clients/test_apify_request_queue_client.py b/tests/unit/storage_clients/test_apify_request_queue_client.py index 019b2e0b..f00b2d3a 100644 --- a/tests/unit/storage_clients/test_apify_request_queue_client.py +++ b/tests/unit/storage_clients/test_apify_request_queue_client.py @@ -1,6 +1,6 @@ import pytest -from apify.storage_clients._apify._request_queue_client import unique_key_to_request_id +from apify.storage_clients._apify._utils import unique_key_to_request_id def test_unique_key_to_request_id_length() -> None: From 49c357e368725373252954e12b96a660859af78f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 25 Sep 2025 14:23:56 +0200 Subject: [PATCH 24/26] Polish some docs --- docs/04_upgrading/upgrading_to_v3.md | 12 ++++++++---- .../storage_clients/_smart_apify/_storage_client.py | 5 ++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index 596cbee7..803db6d8 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -118,13 +118,13 @@ async def main(): ```python from crawlee import service_locator -from apify.storage_clients import ApifyStorageClient, ApifyHybridStorageClient, MemoryStorageClient +from apify.storage_clients import ApifyStorageClient, SmartApifyStorageClient, MemoryStorageClient from apify import Actor async def main(): service_locator.set_storage_client( - ApifyHybridStorageClient( + SmartApifyStorageClient( 
cloud_storage_client=ApifyStorageClient(request_queue_access="single"), local_storage_client=MemoryStorageClient() ) @@ -143,13 +143,17 @@ async def main(): ```python from crawlee import service_locator -from apify.storage_clients import ApifyStorageClient +from apify.storage_clients import ApifyStorageClient, SmartApifyStorageClient from apify import Actor async def main(): # Full client that supports multiple consumers of the Apify Request Queue - service_locator.set_storage_client(ApifyStorageClient(request_queue_access="shared")) + service_locator.set_storage_client( + SmartApifyStorageClient( + cloud_storage_client=ApifyStorageClient(request_queue_access="shared"), + ) + ) async with Actor: rq = await Actor.open_request_queue() ``` diff --git a/src/apify/storage_clients/_smart_apify/_storage_client.py b/src/apify/storage_clients/_smart_apify/_storage_client.py index a5b20b9f..db1b8b5d 100644 --- a/src/apify/storage_clients/_smart_apify/_storage_client.py +++ b/src/apify/storage_clients/_smart_apify/_storage_client.py @@ -21,9 +21,8 @@ class SmartApifyStorageClient(StorageClient): """SmartApifyStorageClient that delegates to cloud_storage_client or local_storage_client. - When running on Apify platform use cloud_storage_client, else use local_storage_client. It has additional wrapper - methods with `force_cloud` parameter to force using cloud_storage_client when opening specific storages even when - not running on the Apify platform. This storage client is designed to work specifically in Actor context. + When running on Apify platform use cloud_storage_client, else use local_storage_client. This storage client is + designed to work specifically in Actor context. 
""" def __init__( From 6edb093ff5b2659979e07a35fa8e413a19915266 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 25 Sep 2025 16:27:41 +0200 Subject: [PATCH 25/26] More docs polishing --- .../_apify/_request_queue_client.py | 1 - .../storage_clients/_apify/_storage_client.py | 15 +++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 85d74fbd..1928f0ad 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -208,7 +208,6 @@ async def open( access: Controls the implementation of the request queue client based on expected scenario: - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster. - 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage. - Detailed constraints for the 'single' access type: - Only one client is consuming the request queue at the time. - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 0702e964..2bee6527 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -27,10 +27,17 @@ def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'singl """Initialize the Apify storage client. Args: - request_queue_access: If 'single', the `create_rq_client` will return `ApifyRequestQueueSingleClient`, if - 'shared' it will return `ApifyRequestQueueSharedClient`. + request_queue_access: Controls the implementation of the request queue client based on expected scenario: - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster. 
- 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage.
+                Detailed constraints for the 'single' access type:
+                - Only one client is consuming the request queue at a time.
+                - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to
+                  be handled as quickly, since this client does not aggressively fetch the forefront and relies on local
+                  head estimation.
+                - Requests are only added to the queue, never deleted by other clients. (Marking as handled is ok.)
+                - Other producers can add new requests, but not modify existing ones.
+                  (Modifications would not be included in the local cache.)
         """
         self._request_queue_access = request_queue_access
@@ -42,8 +49,8 @@ def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'singl
     @override
     def get_storage_client_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
         if isinstance(configuration, ApifyConfiguration):
-            # Current design does not support opening exactly same queue with full and simple client at the same time,
-            # due to default and unnamed storages. Whichever client variation gets used first, wins.
+            # It is not supported to open exactly the same queue with 'single' and 'shared' clients at the same time.
+            # Whichever client variation gets used first, wins.
return super().get_storage_client_cache_key(configuration), hash_api_base_url_and_token(configuration) config_class = type(configuration) From b17ebefa1a35de204b6184832bc835f67b441b4e Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 26 Sep 2025 09:58:16 +0200 Subject: [PATCH 26/26] Track pending_request_count in local metadata estimation --- .../storage_clients/_apify/_request_queue_shared_client.py | 3 +++ .../storage_clients/_apify/_request_queue_single_client.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/apify/storage_clients/_apify/_request_queue_shared_client.py b/src/apify/storage_clients/_apify/_request_queue_shared_client.py index 162cf71e..65ad8daa 100644 --- a/src/apify/storage_clients/_apify/_request_queue_shared_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_shared_client.py @@ -170,6 +170,7 @@ async def add_batch_of_requests( new_request_count += 1 self.metadata.total_request_count += new_request_count + self.metadata.pending_request_count += new_request_count return api_response @@ -265,6 +266,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | # Update assumed handled count if this wasn't already handled if not processed_request.was_already_handled: self.metadata.handled_request_count += 1 + self.metadata.pending_request_count -= 1 # Update the cache with the handled request cache_key = request.unique_key @@ -312,6 +314,7 @@ async def reclaim_request( # we're putting it back for processing. 
if request.was_already_handled and not processed_request.was_already_handled: self.metadata.handled_request_count -= 1 + self.metadata.pending_request_count += 1 # Update the cache cache_key = request.unique_key diff --git a/src/apify/storage_clients/_apify/_request_queue_single_client.py b/src/apify/storage_clients/_apify/_request_queue_single_client.py index d1814bb4..1b9502b0 100644 --- a/src/apify/storage_clients/_apify/_request_queue_single_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_single_client.py @@ -168,6 +168,7 @@ async def add_batch_of_requests( if not processed_request.was_already_present and not processed_request.was_already_handled: new_request_count += 1 self.metadata.total_request_count += new_request_count + self.metadata.pending_request_count += new_request_count return api_response @@ -271,6 +272,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | if request.handled_at is None: request.handled_at = datetime.now(tz=timezone.utc) self.metadata.handled_request_count += 1 + self.metadata.pending_request_count -= 1 if cached_request := self._requests_cache.get(request.unique_key): cached_request.handled_at = request.handled_at @@ -333,6 +335,7 @@ async def reclaim_request( # we're putting it back for processing. if request.was_already_handled and not processed_request.was_already_handled: self.metadata.handled_request_count -= 1 + self.metadata.pending_request_count += 1 except Exception as exc: logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}')