
Commit 7ca3716

add example "Run parallel crawlers"
1 parent e201147 commit 7ca3716


2 files changed: 113 additions, 0 deletions

Lines changed: 94 additions & 0 deletions — the new example script, `code_examples/run_parallel_crawlers.py`
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import (
    ParselCrawler,
    ParselCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)
from crawlee.sessions import SessionPool
from crawlee.storages import RequestQueue


async def main() -> None:
    # Open request queues for both crawlers with different aliases
    playwright_rq = await RequestQueue.open(alias='playwright-requests')
    parsel_rq = await RequestQueue.open(alias='parsel-requests')

    # Use a shared session pool between both crawlers
    async with SessionPool() as session_pool:
        playwright_crawler = PlaywrightCrawler(
            # Set the request queue for the Playwright crawler
            request_manager=playwright_rq,
            session_pool=session_pool,
            # Configure concurrency settings for the Playwright crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=5, desired_concurrency=5
            ),
            # Set `keep_alive` so that the crawler does not stop working when there are
            # no requests in the queue.
            keep_alive=True,
        )

        parsel_crawler = ParselCrawler(
            # Set the request queue for the Parsel crawler
            request_manager=parsel_rq,
            session_pool=session_pool,
            # Configure concurrency settings for the Parsel crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=10, desired_concurrency=10
            ),
            # Set the maximum number of requests per crawl for the Parsel crawler
            max_requests_per_crawl=50,
        )

        @playwright_crawler.router.default_handler
        async def handle_playwright(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Playwright Processing {context.request.url}...')

            title = await context.page.title()
            # Push the extracted data to the dataset for the Playwright crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'playwright'},
                dataset_name='playwright-data',
            )

        @parsel_crawler.router.default_handler
        async def handle_parsel(context: ParselCrawlingContext) -> None:
            context.log.info(f'Parsel Processing {context.request.url}...')

            title = context.parsed_content.css('title::text').get()
            # Push the extracted data to the dataset for the Parsel crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'parsel'},
                dataset_name='parsel-data',
            )

            # Enqueue blog links to the Playwright request queue
            await context.enqueue_links(
                selector='a[href*="/blog/"]', rq_alias='playwright-requests'
            )
            # Enqueue other links to the Parsel request queue
            await context.enqueue_links(selector='a:not([href*="/blog/"])')

        # Start the Playwright crawler in the background
        background_crawler_task = asyncio.create_task(playwright_crawler.run([]))

        # Run the Parsel crawler with the initial URL and wait for it to finish
        await parsel_crawler.run(['https://crawlee.dev/blog'])

        # Wait for the Playwright crawler to finish processing all requests
        while not await playwright_rq.is_empty():
            playwright_crawler.log.info('Waiting for Playwright crawler to finish...')
            await asyncio.sleep(5)

        # Stop the Playwright crawler after all requests are processed
        playwright_crawler.stop()

        # Wait for the background Playwright crawler task to complete
        await background_crawler_task


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 19 additions & 0 deletions — the new documentation page that embeds the example
---
id: run-parallel-crawlers
title: Run parallel crawlers
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RunParallelCrawlersExample from '!!raw-loader!roa-loader!./code_examples/run_parallel_crawlers.py';

This example demonstrates how to run two crawlers in parallel, where one crawler processes links discovered by the other.

In some situations, you may need different approaches for scraping data from a website. For example, you might use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> to navigate JavaScript-heavy pages and a faster, more lightweight <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to process static pages. One way to solve this is to use <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>; see the [Adaptive Playwright crawler example](./adaptive-playwright-crawler) to learn more.

The code below demonstrates an alternative approach using two separate crawlers. Links are passed between the crawlers via <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> aliases. The `keep_alive` option allows the Playwright crawler to run in the background and wait for incoming links instead of stopping when its queue is empty.
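In a nutshell, the hand-off relies on three pieces: an aliased request queue, `keep_alive=True` on the crawler that consumes it, and `enqueue_links(rq_alias=...)` in the other crawler's handler. The condensed sketch below shows only that mechanism, using the same calls as the full example embedded further down; the handler names, selectors, and start URL are illustrative.

```python
import asyncio

from crawlee.crawlers import (
    ParselCrawler,
    ParselCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)
from crawlee.storages import RequestQueue


async def main() -> None:
    # An aliased queue that only the Playwright crawler consumes.
    playwright_rq = await RequestQueue.open(alias='playwright-requests')

    # `keep_alive=True` keeps the crawler waiting for new requests
    # instead of finishing as soon as its queue is empty.
    playwright_crawler = PlaywrightCrawler(request_manager=playwright_rq, keep_alive=True)
    parsel_crawler = ParselCrawler(max_requests_per_crawl=50)

    @playwright_crawler.router.default_handler
    async def browser_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Rendering {context.request.url} in the browser...')

    @parsel_crawler.router.default_handler
    async def static_handler(context: ParselCrawlingContext) -> None:
        # Hand matching links over to the Playwright crawler via the queue alias.
        await context.enqueue_links(
            selector='a[href*="/blog/"]', rq_alias='playwright-requests'
        )
        # Keep crawling the remaining links with the Parsel crawler itself.
        await context.enqueue_links(selector='a:not([href*="/blog/"])')

    # Run the Playwright crawler in the background, feed it from the Parsel
    # crawler, and stop it once the Parsel crawl is done and its queue drains.
    background_task = asyncio.create_task(playwright_crawler.run([]))
    await parsel_crawler.run(['https://crawlee.dev/blog'])
    while not await playwright_rq.is_empty():
        await asyncio.sleep(5)
    playwright_crawler.stop()
    await background_task


if __name__ == '__main__':
    asyncio.run(main())
```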
<RunnableCodeBlock className="language-python" language="python">
    {RunParallelCrawlersExample}
</RunnableCodeBlock>
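Each crawler pushes its results into a named dataset (`playwright-data` and `parsel-data`). As a rough, illustrative sketch of how those items might be read back afterwards, assuming the `Dataset.open()` and `get_data()` helpers from `crawlee.storages`:

```python
import asyncio

from crawlee.storages import Dataset


async def print_results() -> None:
    # Open the named datasets the two crawlers pushed their items into
    # (assumes the crawl from the example above has already run).
    for dataset_name in ('playwright-data', 'parsel-data'):
        dataset = await Dataset.open(name=dataset_name)
        data = await dataset.get_data()
        print(f'{dataset_name}: {len(data.items)} items')
        for item in data.items:
            print(item['source'], item['url'], item['title'])


if __name__ == '__main__':
    asyncio.run(print_results())
```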
