Commit 597ce13

docs: Add example "Run parallel crawlers" (#1623)

### Description

- Add example "Run parallel crawlers".

1 parent 480a669 commit 597ce13

2 files changed, +113 -0 lines changed

code_examples/run_parallel_crawlers.py: 94 additions & 0 deletions

import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import (
    ParselCrawler,
    ParselCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
)
from crawlee.sessions import SessionPool
from crawlee.storages import RequestQueue


async def main() -> None:
    # Open request queues for both crawlers with different aliases
    playwright_rq = await RequestQueue.open(alias='playwright-requests')
    parsel_rq = await RequestQueue.open(alias='parsel-requests')

    # Use a shared session pool between both crawlers
    async with SessionPool() as session_pool:
        playwright_crawler = PlaywrightCrawler(
            # Set the request queue for the Playwright crawler
            request_manager=playwright_rq,
            session_pool=session_pool,
            # Configure concurrency settings for the Playwright crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=5, desired_concurrency=5
            ),
            # Set `keep_alive` so that the crawler does not stop working when there are
            # no requests in the queue.
            keep_alive=True,
        )

        parsel_crawler = ParselCrawler(
            # Set the request queue for the Parsel crawler
            request_manager=parsel_rq,
            session_pool=session_pool,
            # Configure concurrency settings for the Parsel crawler
            concurrency_settings=ConcurrencySettings(
                max_concurrency=10, desired_concurrency=10
            ),
            # Set the maximum number of requests per crawl for the Parsel crawler
            max_requests_per_crawl=50,
        )

        @playwright_crawler.router.default_handler
        async def handle_playwright(context: PlaywrightCrawlingContext) -> None:
            context.log.info(f'Playwright processing {context.request.url}...')

            title = await context.page.title()
            # Push the extracted data to the dataset for the Playwright crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'playwright'},
                dataset_name='playwright-data',
            )

        @parsel_crawler.router.default_handler
        async def handle_parsel(context: ParselCrawlingContext) -> None:
            context.log.info(f'Parsel processing {context.request.url}...')

            title = context.parsed_content.css('title::text').get()
            # Push the extracted data to the dataset for the Parsel crawler
            await context.push_data(
                {'title': title, 'url': context.request.url, 'source': 'parsel'},
                dataset_name='parsel-data',
            )

            # Enqueue links to the Playwright request queue for blog pages
            await context.enqueue_links(
                selector='a[href*="/blog/"]', rq_alias='playwright-requests'
            )
            # Enqueue other links to the Parsel request queue
            await context.enqueue_links(selector='a:not([href*="/blog/"])')

        # Start the Playwright crawler in the background
        background_crawler_task = asyncio.create_task(playwright_crawler.run([]))

        # Run the Parsel crawler with the initial URL and wait for it to finish
        await parsel_crawler.run(['https://crawlee.dev/blog'])

        # Wait for the Playwright crawler to finish processing all requests
        while not await playwright_rq.is_empty():
            playwright_crawler.log.info('Waiting for Playwright crawler to finish...')
            await asyncio.sleep(5)

        # Stop the Playwright crawler after all requests are processed
        playwright_crawler.stop()

        # Wait for the background Playwright crawler task to complete
        await background_crawler_task


if __name__ == '__main__':
    asyncio.run(main())
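
Once the run finishes, the two handlers have written their items into the named datasets `playwright-data` and `parsel-data`. Below is a minimal sketch (not part of this commit) of reading those items back afterwards, assuming `Dataset.open(name=...)` and `get_data()` behave as in other Crawlee storage examples:

import asyncio

from crawlee.storages import Dataset


async def print_results() -> None:
    # Open each named dataset the handlers pushed data into.
    for name in ('playwright-data', 'parsel-data'):
        dataset = await Dataset.open(name=name)
        # get_data() returns a page of stored items; print each record.
        data = await dataset.get_data()
        for item in data.items:
            print(name, item)


if __name__ == '__main__':
    asyncio.run(print_results())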

Documentation page (id: run-parallel-crawlers): 19 additions & 0 deletions

---
id: run-parallel-crawlers
title: Run parallel crawlers
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RunParallelCrawlersExample from '!!raw-loader!roa-loader!./code_examples/run_parallel_crawlers.py';

This example demonstrates how to run two crawlers in parallel, where one crawler processes links discovered by the other.

In some situations, you may need different approaches for scraping data from a website. For example, you might use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> to navigate JavaScript-heavy pages and a faster, more lightweight <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> to process static pages. One way to handle this is with <ApiLink to="class/AdaptivePlaywrightCrawler">`AdaptivePlaywrightCrawler`</ApiLink>; see the [Adaptive Playwright crawler example](./adaptive-playwright-crawler) to learn more.

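For comparison, a single adaptive crawler can cover both cases by itself. The following sketch is illustrative only (it is not part of this commit) and assumes the `AdaptivePlaywrightCrawler.with_parsel_static_parser()` constructor and the `AdaptivePlaywrightCrawlingContext` type exported from `crawlee.crawlers`; see the linked example for the authoritative usage.

import asyncio

from crawlee.crawlers import (
    AdaptivePlaywrightCrawler,
    AdaptivePlaywrightCrawlingContext,
)


async def main() -> None:
    # Assumed constructor: an adaptive crawler that renders pages with
    # Playwright only when needed and otherwise parses them statically.
    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
        max_requests_per_crawl=50,
    )

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        context.log.info(f'Adaptive processing {context.request.url}...')
        # Store the visited URL and follow any discovered links.
        await context.push_data({'url': context.request.url})
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev/blog'])


if __name__ == '__main__':
    asyncio.run(main())
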
The code below demonstrates an alternative approach using two separate crawlers. Links are passed between crawlers via <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> aliases. The `keep_alive` option allows the Playwright crawler to run in the background and wait for incoming links without stopping when its queue is empty. You can also use different storage clients for each crawler without losing the ability to pass links between queues. Learn more about available storage clients in this [guide](/python/docs/guides/storage-clients).

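To illustrate the storage-client point, here is a minimal sketch of opening each queue with its own storage client while keeping alias-based link passing. It assumes the `storage_client` keyword on `RequestQueue.open` and the `MemoryStorageClient`/`FileSystemStorageClient` classes from `crawlee.storage_clients`, so treat it as an assumption rather than part of the example below.

import asyncio

from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient
from crawlee.storages import RequestQueue


async def main() -> None:
    # Assumed API: each queue gets its own storage client, but both remain
    # addressable by alias, so enqueue_links(rq_alias=...) keeps working.
    playwright_rq = await RequestQueue.open(
        alias='playwright-requests',
        storage_client=MemoryStorageClient(),
    )
    parsel_rq = await RequestQueue.open(
        alias='parsel-requests',
        storage_client=FileSystemStorageClient(),
    )

    # Add a request to one queue and confirm the other stays empty.
    await playwright_rq.add_request('https://crawlee.dev/blog')
    print(await playwright_rq.is_empty(), await parsel_rq.is_empty())


if __name__ == '__main__':
    asyncio.run(main())
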
<RunnableCodeBlock className="language-python" language="python">
    {RunParallelCrawlersExample}
</RunnableCodeBlock>
