Draft, unique key enables better metadata handling

Pijukatel · Pijukatel · commit b159d3ed61d2 · 2025-08-16T13:32:09.000+02:00
diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py
@@ -10,6 +10,7 @@
 from typing_extensions import override
 
 from apify_client import ApifyClientAsync
+from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.requests import unique_key_to_request_id
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
@@ -40,10 +41,7 @@ def __init__(
         self,
         *,
         api_client: RequestQueueClientAsync,
-        id: str,
-        name: str | None,
-        total_request_count: int,
-        handled_request_count: int,
+        metadata: RequestQueueMetadata,
     ) -> None:
         """Initialize a new instance.
 
@@ -52,11 +50,8 @@ def __init__(
         self._api_client = api_client
         """The Apify request queue client for API operations."""
 
-        self._id = id
-        """The ID of the request queue."""
-
-        self._name = name
-        """The name of the request queue."""
+        self._metadata = metadata
+        """Additional data related to the RequestQueue."""
 
         self._queue_head = deque[str]()
         """A deque to store request IDs in the queue head."""
@@ -70,41 +65,55 @@ def __init__(
         self._should_check_for_forefront_requests = False
         """Whether to check for forefront requests in the next list_head call."""
 
-        self._had_multiple_clients = False
-        """Whether the request queue has been accessed by multiple clients."""
-
-        self._initial_total_count = total_request_count
-        """The initial total request count (from the API) when the queue was opened."""
-
-        self._initial_handled_count = handled_request_count
-        """The initial handled request count (from the API) when the queue was opened."""
-
-        self._assumed_total_count = 0
-        """The number of requests we assume are in the queue (tracked manually for this instance)."""
-
-        self._assumed_handled_count = 0
-        """The number of requests we assume have been handled (tracked manually for this instance)."""
-
         self._fetch_lock = asyncio.Lock()
         """Fetch lock to minimize race conditions when communicating with API."""
 
     @override
     async def get_metadata(self) -> RequestQueueMetadata:
-        total_count = self._initial_total_count + self._assumed_total_count
-        handled_count = self._initial_handled_count + self._assumed_handled_count
-        pending_count = total_count - handled_count
-
-        return RequestQueueMetadata(
-            id=self._id,
-            name=self._name,
-            total_request_count=total_count,
-            handled_request_count=handled_count,
-            pending_request_count=pending_count,
-            created_at=datetime.now(timezone.utc),
-            modified_at=datetime.now(timezone.utc),
-            accessed_at=datetime.now(timezone.utc),
-            had_multiple_clients=self._had_multiple_clients,
-        )
+        """Get metadata about the request queue."""
+        if self._metadata.had_multiple_clients:
+            # Enhanced from API (can be delayed few seconds)
+            response = await self._api_client.get()
+            if response is None:
+                raise ValueError('Failed to fetch request queue metadata from the API.')
+            return RequestQueueMetadata(
+                id=response['id'],
+                name=response['name'],
+                total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count),
+                handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count),
+                pending_request_count=response['pendingRequestCount'],
+                created_at=response['createdAt'],
+                modified_at=max(response['modifiedAt'], self._metadata.modified_at),
+                accessed_at=max(response['accessedAt'], self._metadata.accessed_at),
+                had_multiple_clients=response['hadMultipleClients'],
+            )
+           # Update local estimation?
+        # Get local estimation (will not include changes done bo another client)
+        return self._metadata
+
+
+    @override
+    async def get_metadata(self) -> RequestQueueMetadata:
+        """Get metadata about the request queue."""
+        if self._metadata.had_multiple_clients:
+            # Enhanced from API (can be delayed few seconds)
+            response = await self._api_client.get()
+            if response is None:
+                raise ValueError('Failed to fetch request queue metadata from the API.')
+            return RequestQueueMetadata(
+                id=response['id'],
+                name=response['name'],
+                total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count),
+                handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count),
+                pending_request_count=response['pendingRequestCount'],
+                created_at=response['createdAt'],
+                modified_at=max(response['modifiedAt'], self._metadata.modified_at),
+                accessed_at=max(response['accessedAt'], self._metadata.accessed_at),
+                had_multiple_clients=response['hadMultipleClients'],
+            )
+           # Update local estimation?
+        # Get local estimation (will not include changes done bo another client)
+        return self._metadata
 
     @classmethod
     async def open(
@@ -162,27 +171,33 @@ async def open(
         )
         apify_rqs_client = apify_client_async.request_queues()
 
-        # If both id and name are provided, raise an error.
-        if id and name:
-            raise ValueError('Only one of "id" or "name" can be specified, not both.')
-
-        # If id is provided, get the storage by ID.
-        if id and name is None:
-            apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
+        match (id, name):
+            case (None, None):
+                # If both id and name are None, try to get the default storage ID from environment variables.
+                # The default storage ID environment variable is set by the Apify platform. It also contains
+                # a new storage ID after Actor's reboot or migration.
+                id = configuration.default_request_queue_id
+            case (None, name):
+                # If name is provided, get or create the storage by name.
+                id = RequestQueueMetadata.model_validate(
+                    await apify_rqs_client.get_or_create(name=name),
+                ).id
+            case (_, None):
+                pass
+            case (_, _):
+                # If both id and name are provided, raise an error.
+                raise ValueError('Only one of "id" or "name" can be specified, not both.')
+        if id is None:
+            raise RuntimeError('Unreachable code')
 
-        # If name is provided, get or create the storage by name.
-        if name and id is None:
-            id = RequestQueueMetadata.model_validate(
-                await apify_rqs_client.get_or_create(name=name),
-            ).id
-            apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
+        # Use suitable client_key to make `hadMultipleClients` response of Apify API useful.
+        # It should persist across migrated Actor runs on the Apify platform.
+        _api_max_client_key_length = 32
+        client_key = (configuration.actor_run_id or crypto_random_object_id(length=_api_max_client_key_length))[
+            :_api_max_client_key_length
+        ]
 
-        # If both id and name are None, try to get the default storage ID from environment variables.
-        # The default storage ID environment variable is set by the Apify platform. It also contains
-        # a new storage ID after Actor's reboot or migration.
-        if id is None and name is None:
-            id = configuration.default_request_queue_id
-            apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
+        apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)
 
         # Fetch its metadata.
         metadata = await apify_rq_client.get()
@@ -192,27 +207,18 @@ async def open(
             id = RequestQueueMetadata.model_validate(
                 await apify_rqs_client.get_or_create(),
             ).id
-            apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
+            apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)
 
         # Verify that the storage exists by fetching its metadata again.
         metadata = await apify_rq_client.get()
         if metadata is None:
             raise ValueError(f'Opening request queue with id={id} and name={name} failed.')
 
-        metadata_model = RequestQueueMetadata.model_validate(
-            await apify_rqs_client.get_or_create(),
-        )
-
-        # Ensure we have a valid ID.
-        if id is None:
-            raise ValueError('Request queue ID cannot be None.')
+        metadata_model = RequestQueueMetadata.model_validate(metadata)
 
         return cls(
             api_client=apify_rq_client,
-            id=id,
-            name=name,
-            total_request_count=metadata_model.total_request_count,
-            handled_request_count=metadata_model.handled_request_count,
+            metadata=metadata_model,
         )
 
     @override
@@ -261,7 +267,7 @@ async def add_batch_of_requests(
             if not processed_request.was_already_present and not processed_request.was_already_handled:
                 new_request_count += 1
 
-        self._assumed_total_count += new_request_count
+        self._metadata.total_request_count += new_request_count
 
         return api_response
 
@@ -359,7 +365,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest |
 
             # Update assumed handled count if this wasn't already handled
             if not processed_request.was_already_handled:
-                self._assumed_handled_count += 1
+                self._metadata.handled_request_count += 1
 
             # Update the cache with the handled request
             cache_key = unique_key_to_request_id(request.unique_key)
@@ -407,7 +413,7 @@ async def reclaim_request(
                 # If the request was previously handled, decrement our handled count since
                 # we're putting it back for processing.
                 if request.was_already_handled and not processed_request.was_already_handled:
-                    self._assumed_handled_count -= 1
+                    self._metadata.handled_request_count -= 1
 
                 # Update the cache
                 cache_key = unique_key_to_request_id(request.unique_key)
@@ -591,6 +597,8 @@ async def _list_head(
 
         # Update the queue head cache
         self._queue_has_locked_requests = response.get('queueHasLockedRequests', False)
+        # Check if there is another client working with the RequestQueue
+        self._metadata.had_multiple_clients = response.get('hadMultipleClients', False)
 
         for request_data in response.get('items', []):
             request = Request.model_validate(request_data)
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -96,6 +96,11 @@ def apify_token() -> str:
     return api_token
 
 
+@pytest.fixture(autouse=True)
+def set_token(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token)
+
+
 @pytest.fixture
 def apify_client_async(apify_token: str) -> ApifyClientAsync:
     """Create an instance of the ApifyClientAsync.
diff --git a/tests/integration/test_actor_request_queue.py b/tests/integration/test_actor_request_queue.py