+from crawlee._types import BasicCrawlingContext
 from tests.integration.conftest import MakeActorFunction, RunActorFunction
 
 
@@ -25,14 +26,12 @@ async def default_handler(context: ParselCrawlingContext) -> None:
                 """Default request handler."""
                 context.log.info(f'Processing {context.request.url} ...')
                 await context.enqueue_links(include=[enqueue_pattern])
-                await context.push_data({'Url': context.request.url})
                 finished.append(context.request.url)
 
             await crawler.run(['http://localhost:8080/'])
             assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
-            # assert some dataset
 
-    actor = await make_actor(label='parsel-crawler', main_func=main)
+    actor = await make_actor(label='crawler-max-depth', main_func=main)
     run_result = await run_actor(actor)
 
     assert run_result.status == 'SUCCEEDED'
@@ -62,14 +61,48 @@ async def default_handler(context: ParselCrawlingContext) -> None:
                 """Default request handler."""
                 context.log.info(f'Processing {context.request.url} ...')
                 await context.enqueue_links()
-                await context.push_data({'Url': context.request.url})
                 finished.append(context.request.url)
 
             await crawler.run(['http://localhost:8080/'])
             assert len(finished) == 3
-            # assert some dataset
 
-    actor = await make_actor(label='parsel-crawler', main_func=main)
+    actor = await make_actor(label='crawler-max-requests', main_func=main)
+    run_result = await run_actor(actor)
+
+    assert run_result.status == 'SUCCEEDED'
+
+
+async def test_actor_on_platform_max_request_retries(
+    make_actor: MakeActorFunction,
+    run_actor: RunActorFunction,
+) -> None:
+    """Test that the actor respects max_request_retries."""
+
+    async def main() -> None:
+        """The crawler entry point."""
+        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+        from apify import Actor
+
+        async with Actor:
+            max_retries = 2
+            crawler = ParselCrawler(max_request_retries=max_retries)
+            finished = []
+            failed = []
+
+            @crawler.failed_request_handler
+            async def failed_handler(context: BasicCrawlingContext, _: Exception) -> None:
+                failed.append(context.request.url)
+
+            @crawler.router.default_handler
+            async def default_handler(context: ParselCrawlingContext) -> None:
+                finished.append(context.request.url)
+
+            await crawler.run(['http://localhost:8080/non-existing-url'])
+            assert len(finished) == 0
+            assert len(failed) == max_retries + 1
+
+    actor = await make_actor(label='crawler-max-retries', main_func=main)
     run_result = await run_actor(actor)
 
     assert run_result.status == 'SUCCEEDED'
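For reference, the retry behaviour exercised by the new test can also be reproduced outside the actor test harness. Below is a minimal standalone sketch that uses only the crawlee APIs appearing in this diff (ParselCrawler, failed_request_handler, router.default_handler); the target URL is illustrative, and the script prints the outcome rather than asserting exact counts.

import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Give up on a request after two retries.
    crawler = ParselCrawler(max_request_retries=2)
    finished: list[str] = []
    failed: list[str] = []

    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, _: Exception) -> None:
        # Invoked for a request that has failed and will not be retried again.
        failed.append(context.request.url)

    @crawler.router.default_handler
    async def default_handler(context: ParselCrawlingContext) -> None:
        finished.append(context.request.url)

    # An unreachable URL forces the failure path.
    await crawler.run(['http://localhost:8080/non-existing-url'])
    print(f'finished={finished} failed={failed}')


asyncio.run(main())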