Add init cache test, update upgrading guide

Pijukatel · Pijukatel · commit 359c46e32c1b · 2025-09-12T15:33:04.000+02:00
diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md
@@ -16,3 +16,32 @@ Support for Python 3.9 has been dropped. The Apify Python SDK v3.x now requires
 ## Storage clients
 
 <!-- TODO -->
+
+## The default use of optimized ApifyRequestQueueClient
+
+- The default client for working with an Apify platform-based `RequestQueue` is now an optimized and simplified client that makes significantly fewer API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster, and it is suitable for the majority of use cases.
+- The full client is still available, but it has to be explicitly requested via the `simple_request_queue=False` argument when using the `ApifyStorageClient`.
+
+**Before (v2.x):**
+
+```python
+from apify import Actor
+
+async def main():
+    async with Actor:
+        ...
+```
+
+**Now (v3.0):**
+
+```python
+from apify import Actor
+from crawlee import service_locator
+from apify.storage_clients import ApifyStorageClient
+
+async def main():
+    # Use the full-featured RequestQueue client only if you really need it.
+    service_locator.set_storage_client(ApifyStorageClient(simple_request_queue=False))
+    async with Actor:
+        ...
+```
diff --git a/src/apify/storage_clients/_apify/_request_queue_client_simple.py b/src/apify/storage_clients/_apify/_request_queue_client_simple.py
@@ -388,6 +388,9 @@ async def _init_caches(self) -> None:
 
         This is mainly done to improve local deduplication capability. List request can return up to 10k requests, but
         their order is implementation detail and does not respect head order or insertion order.
+
+        Deduplication on the platform is expensive: it takes 1 API call per request and 1 write operation per request.
+        Local deduplication is cheaper: it takes 1 API call for the whole cache and 1 read operation per request.
         """
         response = await self._api_client.list_requests(limit=10_000)
         for request_data in response.get('items', []):
diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py
@@ -22,7 +22,7 @@
 class ApifyStorageClient(StorageClient):
     """Apify storage client."""
 
-    def __init__(self, simple_request_queue: bool = True) -> None:
+    def __init__(self, *, simple_request_queue: bool = True) -> None:
         """Initialize the Apify storage client.
 
         Args:
@@ -86,10 +86,9 @@ async def create_rq_client(
 
         configuration = configuration or ApifyConfiguration.get_global_configuration()
         if isinstance(configuration, ApifyConfiguration):
-            if not self._simple_request_queue:
+            if self._simple_request_queue:
                 return await ApifyRequestQueueClientSimple.open(id=id, name=name, configuration=configuration)
-            else:
-                return await ApifyRequestQueueClientFull.open(id=id, name=name, configuration=configuration)
+            return await ApifyRequestQueueClientFull.open(id=id, name=name, configuration=configuration)
 
         raise TypeError(
             f'Expected "configuration" to be an instance of "apify.Configuration", '
diff --git a/tests/integration/test_request_queue.py b/tests/integration/test_request_queue.py
@@ -1,15 +1,16 @@
 from __future__ import annotations
 
 import asyncio
+from datetime import datetime, timezone
 from typing import TYPE_CHECKING
 
 import pytest
-from apify_shared.consts import ApifyEnvVars
 
-from crawlee import Request
+from apify_shared.consts import ApifyEnvVars
+from crawlee import Request, service_locator
 
-from apify import Actor
 from ._utils import generate_unique_resource_name
+from apify import Actor
 
 if TYPE_CHECKING:
     from apify_client import ApifyClientAsync
@@ -1072,29 +1073,47 @@ async def test_request_queue_not_had_multiple_clients(
     assert api_response['hadMultipleClients'] is False
 
 
-async def test_cache_initialization(
-    apify_token: str, monkeypatch: pytest.MonkeyPatch, apify_client_async: ApifyClientAsync
-) -> None:
-    """Test that same `RequestQueue` created from Actor does not act as multiple clients."""
+async def test_cache_initialization(apify_token: str, monkeypatch: pytest.MonkeyPatch) -> None:
+    """Test that Apify based `RequestQueue` initializes cache correctly to reduce unnecessary API calls."""
 
-    """Create an instance of the Apify request queue on the platform and drop it when the test is finished."""
+    # Create an instance of the Apify request queue on the platform and drop it when the test is finished.
     request_queue_name = generate_unique_resource_name('request_queue')
     monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_token)
 
+    requests = [Request.from_url(f'http://example.com/{i}', handled_at=datetime.now(timezone.utc)) for i in range(10)]
+
     async with Actor:
         rq = await Actor.open_request_queue(name=request_queue_name, force_cloud=True)
-        yield rq
-        await rq.drop()
-
-
-    await request_queue_force_cloud.fetch_next_request()
-    await request_queue_force_cloud.fetch_next_request()
-
-    # Check that it is correctly in the RequestQueueClient metadata
-    assert (await request_queue_force_cloud.get_metadata()).had_multiple_clients is False
-
-    # Check that it is correctly in the API
-    api_client = apify_client_async.request_queue(request_queue_id=request_queue_force_cloud.id)
-    api_response = await api_client.get()
-    assert api_response
-    assert api_response['hadMultipleClients'] is False
+        try:
+            await rq.add_requests(requests)
+
+            # Check that it is correctly in the API
+            await asyncio.sleep(10)  # Wait to be sure that metadata are updated
+
+            # Get raw client, because stats are not exposed in `RequestQueue` class, but are available in raw client
+            rq_client = Actor.apify_client.request_queue(request_queue_id=rq.id)
+            _rq = await rq_client.get()
+            assert _rq
+            stats_before = _rq.get('stats', {})
+            Actor.log.info(stats_before)
+
+            # Clear service locator cache to simulate creating RQ instance from scratch
+            service_locator.storage_instance_manager.clear_cache()
+
+            # Enqueue the same requests again. They should be deduplicated by the local cache built on initialization
+            rq = await Actor.open_request_queue(name=request_queue_name, force_cloud=True)
+            await rq.add_requests(requests)
+
+            await asyncio.sleep(10)  # Wait to be sure that metadata are updated
+            _rq = await rq_client.get()
+            assert _rq
+            stats_after = _rq.get('stats', {})
+            Actor.log.info(stats_after)
+
+            # Cache was actually initialized, readCount increased
+            assert (stats_after['readCount'] - stats_before['readCount']) == len(requests)
+            # Deduplication happened locally, writeCount should be the same
+            assert stats_after['writeCount'] == stats_before['writeCount']
+
+        finally:
+            await rq.drop()