Commit 00717bc

docs: improve examples in readme (#269)
1 parent 9137f46 · commit 00717bc

File tree: 1 file changed (+94 −13 lines changed)

1 file changed

+94
-13
lines changed

README.md

Lines changed: 94 additions & 13 deletions

````diff
@@ -27,27 +27,108 @@ pip install apify[scrapy]
 
 For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).
 
-## Example
+## Examples
+
+Below are a few examples demonstrating how to use the Apify SDK with some web scraping-related libraries.
+
+### Apify SDK with HTTPX and BeautifulSoup
+
+This example illustrates how to integrate the Apify SDK with [HTTPX](https://www.python-httpx.org/) and [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) to scrape data from web pages.
 
 ```python
 from apify import Actor
 from bs4 import BeautifulSoup
 from httpx import AsyncClient
 
+
+async def main() -> None:
+    async with Actor:
+        # Retrieve the Actor input, and use default values if not provided.
+        actor_input = await Actor.get_input() or {}
+        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
+
+        # Open the default request queue for handling URLs to be processed.
+        request_queue = await Actor.open_request_queue()
+
+        # Enqueue the start URLs.
+        for start_url in start_urls:
+            url = start_url.get('url')
+            await request_queue.add_request(url)
+
+        # Process the URLs from the request queue.
+        while request := await request_queue.fetch_next_request():
+            Actor.log.info(f'Scraping {request.url} ...')
+
+            # Fetch the HTTP response from the specified URL using HTTPX.
+            async with AsyncClient() as client:
+                response = await client.get(request.url)
+
+            # Parse the HTML content using Beautiful Soup.
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Extract the desired data.
+            data = {
+                'url': request.url,
+                'title': soup.title.string,
+                'h1s': [h1.text for h1 in soup.find_all('h1')],
+                'h2s': [h2.text for h2 in soup.find_all('h2')],
+                'h3s': [h3.text for h3 in soup.find_all('h3')],
+            }
+
+            # Store the extracted data to the default dataset.
+            await Actor.push_data(data)
+```
+
+### Apify SDK with PlaywrightCrawler from Crawlee
+
+This example demonstrates how to use the Apify SDK alongside `PlaywrightCrawler` from [Crawlee](https://crawlee.dev/python) to perform web scraping.
+
+```python
+from apify import Actor, Request
+from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
+
 
 async def main() -> None:
     async with Actor:
-        # Read the input parameters from the Actor input
-        actor_input = await Actor.get_input()
-        # Fetch the HTTP response from the specified URL
-        async with AsyncClient() as client:
-            response = await client.get(actor_input['url'])
-        # Process the HTML content
-        soup = BeautifulSoup(response.content, 'html.parser')
-        # Push the extracted data
-        await Actor.push_data({
-            'url': actor_input['url'],
-            'title': soup.title.string,
-        })
+        # Retrieve the Actor input, and use default values if not provided.
+        actor_input = await Actor.get_input() or {}
+        start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
+
+        # Exit if no start URLs are provided.
+        if not start_urls:
+            Actor.log.info('No start URLs specified in Actor input, exiting...')
+            await Actor.exit()
+
+        # Create a crawler.
+        crawler = PlaywrightCrawler(
+            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            max_requests_per_crawl=50,
+            headless=True,
+        )
+
+        # Define a request handler, which will be called for every request.
+        @crawler.router.default_handler
+        async def request_handler(context: PlaywrightCrawlingContext) -> None:
+            url = context.request.url
+            Actor.log.info(f'Scraping {url}...')
+
+            # Extract the desired data.
+            data = {
+                'url': context.request.url,
+                'title': await context.page.title(),
+                'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
+                'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
+                'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+            }
+
+            # Store the extracted data to the default dataset.
+            await context.push_data(data)
+
+            # Enqueue additional links found on the current page.
+            await context.enqueue_links()
+
+        # Run the crawler with the starting URLs.
+        await crawler.run(start_urls)
 ```
 
 ## What are Actors?
````
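Both examples define an async `main()` coroutine but do not show how it is invoked. Below is a minimal sketch of a typical entry point, assuming the example code is saved as `main.py`; the file name and layout are assumptions, loosely following the Apify Python Actor templates, and are not part of this commit.

```python
# Hypothetical entry point (e.g. __main__.py); the `main` module name is an assumption.
# It simply runs the async main() coroutine defined in one of the examples above.
import asyncio

from main import main

if __name__ == '__main__':
    asyncio.run(main())
```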
