
Commit 249f8f5

Fix the caching problem.
Migrate most actor-based tests to plain force-cloud request queue tests (for future parametrization of the Apify clients)
1 parent eadab26 commit 249f8f5

4 files changed (+721, −953 lines)

src/apify/storage_clients/_apify/_request_queue_client_simple.py

Lines changed: 8 additions & 1 deletion
@@ -32,6 +32,7 @@ class ApifyRequestQueueClientSimple(ApifyRequestQueueClient):
     - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to be handled
       so quickly as this client does not aggressively fetch the forefront and relies on local head estimation.
     - Requests are only added to the queue, never deleted. (Marking as handled is ok.)
+    - Other producers can add new requests, but not modify existing ones (otherwise caching can miss the updates)

     If the constraints are not met, the client might work in an unpredictable way.
     """
@@ -247,7 +248,13 @@ async def _list_head(self) -> None:
                 # Do not cache fully handled requests, we do not need them. Just cache their unique_key.
                 self._requests_already_handled.add(request.unique_key)
             else:
-                self._requests_cache[request.unique_key] = request
+                # Only fetch the request if we do not know it yet.
+                if request.unique_key not in self._requests_cache:
+                    request = Request.model_validate(
+                        await self._api_client.get_request(unique_key_to_request_id(request.unique_key))
+                    )
+                self._requests_cache[request.unique_key] = request

             # Add new requests to the end of the head, unless already present in head
             if request.unique_key not in self._head_requests:
                 self._head_requests.appendleft(request.unique_key)
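
For readers skimming the hunk, here is a minimal, self-contained sketch of the caching rule it introduces, detached from the real client. fetch_request stands in for the get_request API call shown above; everything else is plain Python.

# A standalone sketch of the "fetch only on cache miss" rule from the hunk above.
from collections.abc import Awaitable, Callable
from typing import Any


async def cache_head_item(
    unique_key: str,
    was_already_handled: bool,
    requests_cache: dict[str, Any],
    requests_already_handled: set[str],
    fetch_request: Callable[[str], Awaitable[Any]],
) -> None:
    """Remember one request seen in the queue head, fetching its full data at most once."""
    if was_already_handled:
        # Fully handled requests are never processed again, so only their unique_key is kept.
        requests_already_handled.add(unique_key)
    elif unique_key not in requests_cache:
        # Cache miss: fetch the full request once; later head listings reuse the cached copy,
        # which is safe only while other producers never modify existing requests.
        requests_cache[unique_key] = await fetch_request(unique_key)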

tests/integration/test_crawlers_with_storages.py

Lines changed: 0 additions & 29 deletions
@@ -41,35 +41,6 @@ async def default_handler(context: ParselCrawlingContext) -> None:
     assert run_result.status == 'SUCCEEDED'


-async def test_actor_on_platform_max_crawl_depth(
-) -> None:
-    """Test that the actor respects max_crawl_depth."""
-
-    """The crawler entry point."""
-    import re
-
-    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
-
-    from apify import Actor
-
-    async with Actor:
-        rq = await Actor.open_request_queue(force_cloud=True)
-        crawler = ParselCrawler(max_crawl_depth=2, request_manager=rq)
-        finished = []
-        enqueue_pattern = re.compile(r'http://localhost:8080/2+$')
-
-        @crawler.router.default_handler
-        async def default_handler(context: ParselCrawlingContext) -> None:
-            """Default request handler."""
-            context.log.info(f'Processing {context.request.url} ...')
-            await context.enqueue_links(include=[enqueue_pattern])
-            finished.append(context.request.url)
-
-        await crawler.run(['http://localhost:8080/'])
-        assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
-
-
 async def test_actor_on_platform_max_requests_per_crawl(
     make_actor: MakeActorFunction,
     run_actor: RunActorFunction,
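
The commit message says the removed actor-based tests were migrated to plain force-cloud request queue tests. Those migrated tests live in files not shown in this excerpt; the following is only a guess at their general shape, with a made-up test name and assertions, assuming crawlee's standard RequestQueue methods (add_request, fetch_next_request, mark_request_as_handled, is_finished).

# Hypothetical shape of a "force-cloud RQ test"; not taken from this commit.
from apify import Actor


async def test_force_cloud_request_queue_roundtrip() -> None:
    async with Actor:
        # Exercise the platform request queue directly from the test process,
        # instead of deploying and running a whole actor.
        rq = await Actor.open_request_queue(force_cloud=True)

        await rq.add_request('http://localhost:8080/')
        request = await rq.fetch_next_request()
        assert request is not None

        await rq.mark_request_as_handled(request)
        assert await rq.is_finished()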
