Skip to content

Commit 134e8be

Browse files
committed
Add with debug stuff
1 parent e3c9fc2 commit 134e8be

File tree

3 files changed: +103 −50 lines changed

src/apify/storage_clients/_apify/_request_queue_client.py

Lines changed: 61 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
logger = getLogger(__name__)
2828

29+
COUNTER = iter(range(10000))
2930

3031
class ApifyRequestQueueClient(RequestQueueClient):
3132
"""An Apify platform implementation of the request queue client."""
@@ -294,18 +295,25 @@ async def fetch_next_request(self) -> Request | None:
294295
Returns:
295296
The request or `None` if there are no more pending requests.
296297
"""
298+
call_time = next(COUNTER)
297299
# Ensure the queue head has requests if available. Fetching the head with lock to prevent race conditions.
300+
logger.debug(f'Before _fetch_lock, {call_time}')
298301
async with self._fetch_lock:
302+
logger.debug(f'Fetching, {call_time}')
299303
await self._ensure_head_is_non_empty()
300304

301305
# If queue head is empty after ensuring, there are no requests
302306
if not self._queue_head:
307+
logger.debug(f'Empty, {call_time}')
303308
return None
304309

305310
# Get the next request ID from the queue head
306311
next_request_id = self._queue_head.popleft()
312+
logger.debug(f'New request, {call_time}')
307313

314+
logger.debug(f'Before hydrate, {call_time}')
308315
request = await self._get_or_hydrate_request(next_request_id)
316+
logger.debug(f'After hydrate, {call_time}')
309317

310318
# Handle potential inconsistency where request might not be in the main table yet
311319
if request is None:
@@ -324,14 +332,15 @@ async def fetch_next_request(self) -> Request | None:
324332
return None
325333

326334
# Use get request to ensure we have the full request object.
327-
request = await self.get_request(request.id)
335+
#request = await self.get_request(request.id) This seems redundant
328336
if request is None:
329337
logger.debug(
330338
'Request fetched from the beginning of queue was not found in the RQ',
331339
extra={'nextRequestId': next_request_id},
332340
)
333341
return None
334342

343+
logger.debug(f'{request.retry_count=}, {call_time}')
335344
return request
336345

337346
@override
@@ -394,42 +403,48 @@ async def reclaim_request(
394403
"""
395404
# Check if the request was marked as handled and clear it. When reclaiming,
396405
# we want to put the request back for processing.
406+
call_time = next(COUNTER)
397407
if request.was_already_handled:
398408
request.handled_at = None
399409

400-
try:
401-
# Update the request in the API.
402-
processed_request = await self._update_request(request, forefront=forefront)
403-
processed_request.unique_key = request.unique_key
404-
405-
# If the request was previously handled, decrement our handled count since
406-
# we're putting it back for processing.
407-
if request.was_already_handled and not processed_request.was_already_handled:
408-
self._assumed_handled_count -= 1
409-
410-
# Update the cache
411-
cache_key = unique_key_to_request_id(request.unique_key)
412-
self._cache_request(
413-
cache_key,
414-
processed_request,
415-
hydrated_request=request,
416-
)
410+
async with self._fetch_lock:
411+
try:
412+
# Update the request in the API.
413+
logger.debug(f'Before _update_request reclaiming, {call_time}')
414+
processed_request = await self._update_request(request, forefront=forefront)
415+
logger.debug(f'After _update_request reclaiming, {call_time}')
416+
processed_request.unique_key = request.unique_key
417+
418+
# If the request was previously handled, decrement our handled count since
419+
# we're putting it back for processing.
420+
if request.was_already_handled and not processed_request.was_already_handled:
421+
self._assumed_handled_count -= 1
422+
423+
# Update the cache
424+
cache_key = unique_key_to_request_id(request.unique_key)
425+
self._cache_request(
426+
cache_key,
427+
processed_request,
428+
hydrated_request=request,
429+
)
417430

418-
# If we're adding to the forefront, we need to check for forefront requests
419-
# in the next list_head call
420-
if forefront:
421-
self._should_check_for_forefront_requests = True
431+
# If we're adding to the forefront, we need to check for forefront requests
432+
# in the next list_head call
433+
if forefront:
434+
self._should_check_for_forefront_requests = True
422435

423-
# Try to release the lock on the request
424-
try:
425-
await self._delete_request_lock(request.id, forefront=forefront)
426-
except Exception as err:
427-
logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err)
428-
except Exception as exc:
429-
logger.debug(f'Error reclaiming request {request.id}: {exc!s}')
430-
return None
431-
else:
432-
return processed_request
436+
# Try to release the lock on the request
437+
try:
438+
logger.debug(f'Before _delete_request_lock reclaiming, {call_time}')
439+
await self._delete_request_lock(request.id, forefront=forefront)
440+
logger.debug(f'After _delete_request_lock reclaiming, {call_time}')
441+
except Exception as err:
442+
logger.debug(f'Failed to delete request lock for request {request.id}', exc_info=err)
443+
except Exception as exc:
444+
logger.debug(f'Error reclaiming request {request.id}: {exc!s}')
445+
return None
446+
else:
447+
return processed_request
433448

434449
@override
435450
async def is_empty(self) -> bool:
@@ -438,9 +453,14 @@ async def is_empty(self) -> bool:
438453
Returns:
439454
True if the queue is empty, False otherwise.
440455
"""
441-
head = await self._list_head(limit=1, lock_time=None)
442-
443-
return len(head.items) == 0 and not self._queue_has_locked_requests
456+
call_time = next(COUNTER)
457+
logger.debug(f'Before _list_head is_empty, {call_time}')
458+
async with self._fetch_lock:
459+
logger.debug(f'During _list_head is_empty, {call_time}')
460+
head = await self._list_head(limit=1, lock_time=None)
461+
logger.debug(f'After _list_head is_empty, {call_time}')
462+
logger.debug(f'Finish _list_head is_empty, {call_time}')
463+
return len(head.items) == 0 and not self._queue_has_locked_requests
444464

445465
async def _ensure_head_is_non_empty(self) -> None:
446466
"""Ensure that the queue head has requests if they are available in the queue."""
@@ -551,8 +571,9 @@ async def _list_head(
551571
A collection of requests from the beginning of the queue.
552572
"""
553573
# Return from cache if available and we're not checking for new forefront requests
574+
call_time = next(COUNTER)
554575
if self._queue_head and not self._should_check_for_forefront_requests:
555-
logger.debug(f'Using cached queue head with {len(self._queue_head)} requests')
576+
logger.debug(f'Using cached queue head with {len(self._queue_head)} requests, {call_time}')
556577

557578
# Create a list of requests from the cached queue head
558579
items = []
@@ -571,7 +592,7 @@ async def _list_head(
571592
queue_has_locked_requests=self._queue_has_locked_requests,
572593
lock_time=lock_time,
573594
)
574-
595+
logger.debug(f'Updating cached queue head with {len(self._queue_head)} requests, {call_time}')
575596
leftover_buffer = list[str]()
576597
if self._should_check_for_forefront_requests:
577598
leftover_buffer = list(self._queue_head)
@@ -615,13 +636,14 @@ async def _list_head(
615636
),
616637
hydrated_request=request,
617638
)
618-
639+
logger.debug(f'Adding to head, {call_time}')
619640
self._queue_head.append(request.id)
641+
logger.debug(f'Cached queue head with {len(self._queue_head)} requests, {call_time}')
620642

621643
for leftover_request_id in leftover_buffer:
622644
# After adding new requests to the forefront, any existing leftover locked request is kept in the end.
623645
self._queue_head.append(leftover_request_id)
624-
646+
logger.debug(f'Cached queue head with {len(self._queue_head)} requests, {call_time}')
625647
return RequestQueueHead.model_validate(response)
626648

627649
async def _prolong_request_lock(

tests/integration/test_actor_request_queue.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,40 @@ async def test_request_queue_is_finished(
9898
request_queue_name = generate_unique_resource_name('request_queue')
9999

100100
async with Actor:
101-
request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True)
102-
await request_queue.add_request(Request.from_url('http://example.com'))
103-
assert not await request_queue.is_finished()
101+
try:
102+
request_queue = await Actor.open_request_queue(name=request_queue_name, force_cloud=True)
103+
await request_queue.add_request(Request.from_url('http://example.com'))
104+
assert not await request_queue.is_finished()
105+
106+
request = await request_queue.fetch_next_request()
107+
assert request is not None
108+
assert not await request_queue.is_finished(), (
109+
'RequestQueue should not be finished unless the request is marked as handled.'
110+
)
111+
112+
await request_queue.mark_request_as_handled(request)
113+
assert await request_queue.is_finished()
114+
finally:
115+
await request_queue.drop()
116+
104117

105-
request = await request_queue.fetch_next_request()
106-
assert request is not None
107-
assert not await request_queue.is_finished(), (
108-
'RequestQueue should not be finished unless the request is marked as handled.'
109-
)
118+
async def test_same_request_fetched_twice(
119+
apify_client_async: ApifyClientAsync,
120+
monkeypatch: pytest.MonkeyPatch):
121+
"""Test that the same request can be fetched twice from the request queue."""
122+
monkeypatch.setenv(ApifyEnvVars.TOKEN, apify_client_async.token)
110123

111-
await request_queue.mark_request_as_handled(request)
112-
assert await request_queue.is_finished()
124+
request_queue_name = generate_unique_resource_name('request_queue')
125+
async with Actor:
126+
try:
127+
request_queue = await Actor.open_request_queue(name='same-request-fetch', force_cloud=request_queue_name)
128+
129+
request = Request.from_url('http://example.com')
130+
await request_queue.add_request(request)
131+
132+
fetched_request_1 = await request_queue.fetch_next_request()
133+
assert fetched_request_1 is not None
134+
assert fetched_request_1.url == 'http://example.com'
135+
await request_queue.reclaim_request(fetched_request_1)
136+
finally:
137+
await request_queue.drop()

tests/integration/test_crawlers_with_storages.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from typing import TYPE_CHECKING
44

5+
import pytest
6+
57
if TYPE_CHECKING:
68
from .conftest import MakeActorFunction, RunActorFunction
79

@@ -76,19 +78,23 @@ async def default_handler(context: ParselCrawlingContext) -> None:
7678
assert run_result.status == 'SUCCEEDED'
7779

7880

79-
async def test_actor_on_platform_max_request_retries(
81+
@pytest.mark.parametrize('_', range(10))
82+
async def test_actor_on_platform_max_request_retries(_,
8083
make_actor: MakeActorFunction,
8184
run_actor: RunActorFunction,
8285
) -> None:
8386
"""Test that the actor respects max_request_retries."""
8487

8588
async def main() -> None:
8689
"""The crawler entry point."""
90+
import logging
91+
8792
from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext
8893

8994
from apify import Actor
9095

9196
async with Actor:
97+
logging.getLogger('apify.storage_clients._apify._request_queue_client').setLevel(logging.DEBUG)
9298
max_retries = 3
9399
crawler = ParselCrawler(max_request_retries=max_retries)
94100
failed_counter = 0

0 commit comments

Comments (0)