
Commit 704b76d

address the feedback
1 parent 4342563 commit 704b76d

8 files changed, +125 -110 lines


docs/03_guides/05_crawlee.mdx

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ The [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler) works
 
 ## Actor with PlaywrightCrawler
 
-The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content generation. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor.
+The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) is built for handling dynamic web pages that rely on JavaScript for content rendering. Using the [Playwright](https://playwright.dev/) library, it provides a browser-based automation environment to interact with complex websites. Below is an example of how to use [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) in an Apify Actor.
 
 <CodeBlock className="language-python">
     {CrawleePlaywrightExample}

docs/03_guides/code/01_beautifulsoup_httpx.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,4 @@
+import asyncio
 from urllib.parse import urljoin
 
 import httpx
@@ -81,3 +82,7 @@ async def main() -> None:
             finally:
                 # Mark the request as handled to ensure it is not processed again.
                 await request_queue.mark_request_as_handled(new_request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/03_guides/code/02_parsel_impit.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,4 @@
+import asyncio
 from urllib.parse import urljoin
 
 import impit
@@ -87,3 +88,7 @@ async def main() -> None:
             finally:
                 # Mark the request as handled to ensure it is not processed again.
                 await request_queue.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/03_guides/code/03_playwright.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,4 @@
+import asyncio
 from urllib.parse import urljoin
 
 from playwright.async_api import async_playwright
@@ -90,3 +91,7 @@ async def main() -> None:
                 await page.close()
                 # Mark the request as handled to ensure it is not processed again.
                 await request_queue.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/03_guides/code/04_selenium.py

Lines changed: 4 additions & 0 deletions
@@ -100,3 +100,7 @@ async def main() -> None:
                 await request_queue.mark_request_as_handled(request)
 
         driver.quit()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
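All four guide scripts gain the same module entrypoint, which also lets them be run directly with `python`. A minimal sketch of the pattern these hunks add; the crawling logic inside `main()` is elided, and the `async with Actor` block is assumed from the guides' standard layout:

import asyncio

from apify import Actor


async def main() -> None:
    # Enter the Actor context; the guide's existing crawling logic goes here.
    async with Actor:
        ...


if __name__ == '__main__':
    # Run the async entrypoint when the script is executed directly.
    asyncio.run(main())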
Lines changed: 34 additions & 32 deletions
@@ -1,7 +1,36 @@
+import asyncio
+
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 from apify import Actor
 
+# Create a crawler.
+crawler = BeautifulSoupCrawler(
+    # Limit the crawl to max requests. Remove or increase it for crawling all links.
+    max_requests_per_crawl=50,
+)
+
+
+# Define a request handler, which will be called for every request.
+@crawler.router.default_handler
+async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+    Actor.log.info(f'Scraping {context.request.url}...')
+
+    # Extract the desired data.
+    data = {
+        'url': context.request.url,
+        'title': context.soup.title.string if context.soup.title else None,
+        'h1s': [h1.text for h1 in context.soup.find_all('h1')],
+        'h2s': [h2.text for h2 in context.soup.find_all('h2')],
+        'h3s': [h3.text for h3 in context.soup.find_all('h3')],
+    }
+
+    # Store the extracted data to the default dataset.
+    await context.push_data(data)
+
+    # Enqueue additional links found on the current page.
+    await context.enqueue_links(strategy='same-domain')
+
 
 async def main() -> None:
     # Enter the context of the Actor.
@@ -10,44 +39,17 @@ async def main() -> None:
         actor_input = await Actor.get_input() or {}
         start_urls = [
             url.get('url')
-            for url in actor_input.get(
-                'start_urls',
-                [{'url': 'https://apify.com'}],
-            )
+            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
         ]
 
         # Exit if no start URLs are provided.
         if not start_urls:
             Actor.log.info('No start URLs specified in Actor input, exiting...')
             await Actor.exit()
 
-        # Create a crawler.
-        crawler = BeautifulSoupCrawler(
-            # Limit the crawl to max requests.
-            # Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': context.soup.title.string if context.soup.title else None,
-                'h1s': [h1.text for h1 in context.soup.find_all('h1')],
-                'h2s': [h2.text for h2 in context.soup.find_all('h2')],
-                'h3s': [h3.text for h3 in context.soup.find_all('h3')],
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
         # Run the crawler with the starting requests.
         await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
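Pieced together from the two hunks above, the restructured BeautifulSoup example reads roughly as follows. This is a sketch assembled from the diff; the `async with Actor:` line is not visible in the hunks and is assumed from the standard Actor layout.

import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor

# Create a crawler.
crawler = BeautifulSoupCrawler(
    # Limit the crawl to max requests. Remove or increase it for crawling all links.
    max_requests_per_crawl=50,
)


# Define a request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    Actor.log.info(f'Scraping {context.request.url}...')

    # Extract the desired data.
    data = {
        'url': context.request.url,
        'title': context.soup.title.string if context.soup.title else None,
        'h1s': [h1.text for h1 in context.soup.find_all('h1')],
        'h2s': [h2.text for h2 in context.soup.find_all('h2')],
        'h3s': [h3.text for h3 in context.soup.find_all('h3')],
    }

    # Store the extracted data to the default dataset.
    await context.push_data(data)

    # Enqueue additional links found on the current page.
    await context.enqueue_links(strategy='same-domain')


async def main() -> None:
    # Enter the context of the Actor.
    async with Actor:  # assumed; this line is not shown in the hunks
        actor_input = await Actor.get_input() or {}
        start_urls = [
            url.get('url')
            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        ]

        # Exit if no start URLs are provided.
        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Run the crawler with the starting requests.
        await crawler.run(start_urls)


if __name__ == '__main__':
    asyncio.run(main())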
Lines changed: 34 additions & 32 deletions
@@ -1,7 +1,36 @@
+import asyncio
+
 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
 
 from apify import Actor
 
+# Create a crawler.
+crawler = ParselCrawler(
+    # Limit the crawl to max requests. Remove or increase it for crawling all links.
+    max_requests_per_crawl=50,
+)
+
+
+# Define a request handler, which will be called for every request.
+@crawler.router.default_handler
+async def request_handler(context: ParselCrawlingContext) -> None:
+    Actor.log.info(f'Scraping {context.request.url}...')
+
+    # Extract the desired data.
+    data = {
+        'url': context.request.url,
+        'title': context.selector.xpath('//title/text()').get(),
+        'h1s': context.selector.xpath('//h1/text()').getall(),
+        'h2s': context.selector.xpath('//h2/text()').getall(),
+        'h3s': context.selector.xpath('//h3/text()').getall(),
+    }
+
+    # Store the extracted data to the default dataset.
+    await context.push_data(data)
+
+    # Enqueue additional links found on the current page.
+    await context.enqueue_links(strategy='same-domain')
+
 
 async def main() -> None:
     # Enter the context of the Actor.
@@ -10,44 +39,17 @@ async def main() -> None:
         actor_input = await Actor.get_input() or {}
         start_urls = [
             url.get('url')
-            for url in actor_input.get(
-                'start_urls',
-                [{'url': 'https://apify.com'}],
-            )
+            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        ]
 
         # Exit if no start URLs are provided.
         if not start_urls:
             Actor.log.info('No start URLs specified in Actor input, exiting...')
             await Actor.exit()
 
-        # Create a crawler.
-        crawler = ParselCrawler(
-            # Limit the crawl to max requests.
-            # Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: ParselCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': context.selector.xpath('//title/text()').get(),
-                'h1s': context.selector.xpath('//h1/text()').getall(),
-                'h2s': context.selector.xpath('//h2/text()').getall(),
-                'h3s': context.selector.xpath('//h3/text()').getall(),
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
         # Run the crawler with the starting requests.
         await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
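The Parsel example follows the same module-level layout as the BeautifulSoup sketch above; only the crawler class and the extraction change, using XPath selectors on `context.selector`. A condensed sketch of the module-level part, taken from the added lines (the `h2s` and `h3s` fields are omitted for brevity):

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

from apify import Actor

# Create a crawler, limited to 50 requests per crawl.
crawler = ParselCrawler(max_requests_per_crawl=50)


# Define a request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: ParselCrawlingContext) -> None:
    Actor.log.info(f'Scraping {context.request.url}...')

    # Extract the desired data with XPath selectors.
    data = {
        'url': context.request.url,
        'title': context.selector.xpath('//title/text()').get(),
        'h1s': context.selector.xpath('//h1/text()').getall(),
    }

    # Store the extracted data and enqueue same-domain links.
    await context.push_data(data)
    await context.enqueue_links(strategy='same-domain')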
Lines changed: 37 additions & 45 deletions
@@ -1,7 +1,39 @@
+import asyncio
+
 from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
 
 from apify import Actor
 
+# Create a crawler.
+crawler = PlaywrightCrawler(
+    # Limit the crawl to max requests. Remove or increase it for crawling all links.
+    max_requests_per_crawl=50,
+    # Run the browser in a headless mode.
+    headless=True,
+    browser_launch_options={'args': ['--disable-gpu']},
+)
+
+
+# Define a request handler, which will be called for every request.
+@crawler.router.default_handler
+async def request_handler(context: PlaywrightCrawlingContext) -> None:
+    Actor.log.info(f'Scraping {context.request.url}...')
+
+    # Extract the desired data.
+    data = {
+        'url': context.request.url,
+        'title': await context.page.title(),
+        'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
+        'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
+        'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+    }
+
+    # Store the extracted data to the default dataset.
+    await context.push_data(data)
+
+    # Enqueue additional links found on the current page.
+    await context.enqueue_links(strategy='same-domain')
+
 
 async def main() -> None:
     # Enter the context of the Actor.
@@ -10,57 +42,17 @@ async def main() -> None:
         actor_input = await Actor.get_input() or {}
         start_urls = [
             url.get('url')
-            for url in actor_input.get(
-                'start_urls',
-                [{'url': 'https://apify.com'}],
-            )
+            for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])
         ]
 
         # Exit if no start URLs are provided.
         if not start_urls:
             Actor.log.info('No start URLs specified in Actor input, exiting...')
             await Actor.exit()
 
-        # Create a crawler.
-        crawler = PlaywrightCrawler(
-            # Limit the crawl to max requests.
-            # Remove or increase it for crawling all links.
-            max_requests_per_crawl=50,
-            headless=True,
-            browser_launch_options={
-                'args': ['--disable-gpu'],
-            },
-        )
-
-        # Define a request handler, which will be called for every request.
-        @crawler.router.default_handler
-        async def request_handler(context: PlaywrightCrawlingContext) -> None:
-            url = context.request.url
-            Actor.log.info(f'Scraping {url}...')
-
-            # Extract the desired data.
-            data = {
-                'url': context.request.url,
-                'title': await context.page.title(),
-                'h1s': [
-                    await h1.text_content()
-                    for h1 in await context.page.locator('h1').all()
-                ],
-                'h2s': [
-                    await h2.text_content()
-                    for h2 in await context.page.locator('h2').all()
-                ],
-                'h3s': [
-                    await h3.text_content()
-                    for h3 in await context.page.locator('h3').all()
-                ],
-            }
-
-            # Store the extracted data to the default dataset.
-            await context.push_data(data)
-
-            # Enqueue additional links found on the current page.
-            await context.enqueue_links()
-
         # Run the crawler with the starting requests.
         await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
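The Playwright example again mirrors the same layout; the differences are the browser-related constructor options and the fact that data is read from the live page through Playwright locators. A condensed sketch of those parts, taken from the added lines (the `h2s` and `h3s` fields are omitted for brevity):

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

from apify import Actor

# Create a crawler that runs a headless browser without GPU acceleration.
crawler = PlaywrightCrawler(
    max_requests_per_crawl=50,
    headless=True,
    browser_launch_options={'args': ['--disable-gpu']},
)


# Define a request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
    Actor.log.info(f'Scraping {context.request.url}...')

    # The page is rendered by the browser, so data comes from Playwright locators.
    data = {
        'url': context.request.url,
        'title': await context.page.title(),
        'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
    }

    # Store the extracted data and enqueue same-domain links.
    await context.push_data(data)
    await context.enqueue_links(strategy='same-domain')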
