Commit 7623e5a

chore: Update of Python templates (#310)
A few minor updates to the Python templates that came to mind while working on [apify-sdk-python#378](apify/apify-sdk-python#378). I wanted to propagate these updates here as well, as the templates are closely aligned with the code used in the guides.
1 parent: 1d59913

File tree

5 files changed (+116 / -67 lines)


templates/python-beautifulsoup/src/main.py

Lines changed: 54 additions & 44 deletions
@@ -21,6 +21,7 @@ async def main() -> None:
     Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
     the field of web scraping significantly.
     """
+    # Enter the context of the Actor.
     async with Actor:
         # Retrieve the Actor input, and use default values if not provided.
         actor_input = await Actor.get_input() or {}
@@ -39,49 +40,58 @@ async def main() -> None:
         for start_url in start_urls:
             url = start_url.get('url')
             Actor.log.info(f'Enqueuing {url} ...')
-            request = Request.from_url(url, user_data={'depth': 0})
-            await request_queue.add_request(request)
-
-        # Process the URLs from the request queue.
-        while request := await request_queue.fetch_next_request():
-            url = request.url
-            depth = request.user_data['depth']
-            Actor.log.info(f'Scraping {url} ...')
-
-            try:
-                # Fetch the HTTP response from the specified URL using HTTPX.
-                async with AsyncClient() as client:
+            new_request = Request.from_url(url, user_data={'depth': 0})
+            await request_queue.add_request(new_request)
+
+        # Create an HTTPX client to fetch the HTML content of the URLs.
+        async with AsyncClient() as client:
+            # Process the URLs from the request queue.
+            while request := await request_queue.fetch_next_request():
+                url = request.url
+
+                if not isinstance(request.user_data['depth'], (str, int)):
+                    raise TypeError('Request.depth is an unexpected type.')
+
+                depth = int(request.user_data['depth'])
+                Actor.log.info(f'Scraping {url} (depth={depth}) ...')
+
+                try:
+                    # Fetch the HTTP response from the specified URL using HTTPX.
                     response = await client.get(url, follow_redirects=True)
 
-                    # Parse the HTML content using Beautiful Soup.
-                    soup = BeautifulSoup(response.content, 'html.parser')
-
-                    # If the current depth is less than max_depth, find nested links and enqueue them.
-                    if depth < max_depth:
-                        for link in soup.find_all('a'):
-                            link_href = link.get('href')
-                            link_url = urljoin(url, link_href)
-
-                            if link_url.startswith(('http://', 'https://')):
-                                Actor.log.info(f'Enqueuing {link_url} ...')
-                                request = Request.from_url(link_url, user_data={'depth': depth + 1})
-                                await request_queue.add_request(request)
-
-                    # Extract the desired data.
-                    data = {
-                        'url': url,
-                        'title': soup.title.string if soup.title else None,
-                        'h1s': [h1.text for h1 in soup.find_all('h1')],
-                        'h2s': [h2.text for h2 in soup.find_all('h2')],
-                        'h3s': [h3.text for h3 in soup.find_all('h3')],
-                    }
-
-                    # Store the extracted data to the default dataset.
-                    await Actor.push_data(data)
-
-            except Exception:
-                Actor.log.exception(f'Cannot extract data from {url}.')
-
-            finally:
-                # Mark the request as handled to ensure it is not processed again.
-                await request_queue.mark_request_as_handled(request)
+                    # Parse the HTML content using Beautiful Soup.
+                    soup = BeautifulSoup(response.content, 'html.parser')
+
+                    # If the current depth is less than max_depth, find nested links
+                    # and enqueue them.
+                    if depth < max_depth:
+                        for link in soup.find_all('a'):
+                            link_href = link.get('href')
+                            link_url = urljoin(url, link_href)
+
+                            if link_url.startswith(('http://', 'https://')):
+                                Actor.log.info(f'Enqueuing {link_url} ...')
+                                new_request = Request.from_url(
+                                    link_url,
+                                    user_data={'depth': depth + 1},
+                                )
+                                await request_queue.add_request(new_request)
+
+                    # Extract the desired data.
+                    data = {
+                        'url': url,
+                        'title': soup.title.string if soup.title else None,
+                        'h1s': [h1.text for h1 in soup.find_all('h1')],
+                        'h2s': [h2.text for h2 in soup.find_all('h2')],
+                        'h3s': [h3.text for h3 in soup.find_all('h3')],
+                    }
+
+                    # Store the extracted data to the default dataset.
+                    await Actor.push_data(data)
+
+                except Exception:
+                    Actor.log.exception(f'Cannot extract data from {url}.')
+
+                finally:
+                    # Mark the request as handled to ensure it is not processed again.
+                    await request_queue.mark_request_as_handled(new_request)

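The main structural change in this template is that the `AsyncClient` is now opened once, before the crawl loop, and reused for every fetched URL instead of being created inside each iteration. Taken out of the Actor context, the pattern looks roughly like the sketch below; the `fetch_pages` helper and the example URLs are illustrative only, not part of the template.

```python
import asyncio

from httpx import AsyncClient


async def fetch_pages(urls: list[str]) -> list[int]:
    """Fetch several pages while reusing one client (and its connection pool)."""
    statuses = []
    # A single client for the whole batch, instead of a new client per URL.
    async with AsyncClient() as client:
        for url in urls:
            response = await client.get(url, follow_redirects=True)
            statuses.append(response.status_code)
    return statuses


if __name__ == '__main__':
    print(asyncio.run(fetch_pages(['https://apify.com', 'https://example.com'])))
```

Reusing the client this way lets HTTPX keep connections alive across requests to the same host, which is the likely motivation for hoisting it out of the loop.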
templates/python-crawlee-beautifulsoup/src/main.py

Lines changed: 8 additions & 1 deletion
@@ -17,10 +17,17 @@ async def main() -> None:
     Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
     the field of web scraping significantly.
     """
+    # Enter the context of the Actor.
     async with Actor:
         # Retrieve the Actor input, and use default values if not provided.
         actor_input = await Actor.get_input() or {}
-        start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
+        start_urls = [
+            url.get('url')
+            for url in actor_input.get(
+                'start_urls',
+                [{'url': 'https://apify.com'}],
+            )
+        ]
 
         # Exit if no start URLs are provided.
         if not start_urls:

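One detail worth noting about the reformatted comprehension above: the `[{'url': 'https://apify.com'}]` fallback passed to `.get()` only applies when the `start_urls` key is absent from the Actor input, not when it is present but empty, which is why the `# Exit if no start URLs are provided` check that follows is still needed. A tiny illustration of that `dict.get` behaviour (the variable names here are made up for the example); the same one-line-to-multi-line reformat is applied to the python-crawlee-playwright template below.

```python
default = [{'url': 'https://apify.com'}]

# Key missing: the default is used.
actor_input: dict = {}
start_urls = [url.get('url') for url in actor_input.get('start_urls', default)]
assert start_urls == ['https://apify.com']

# Key present but empty: the default is NOT used, so the follow-up
# "Exit if no start URLs are provided" guard still has work to do.
actor_input = {'start_urls': []}
start_urls = [url.get('url') for url in actor_input.get('start_urls', default)]
assert start_urls == []
```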
templates/python-crawlee-playwright/src/main.py

Lines changed: 8 additions & 1 deletion
@@ -17,10 +17,17 @@ async def main() -> None:
     Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
     the field of web scraping significantly.
     """
+    # Enter the context of the Actor.
     async with Actor:
         # Retrieve the Actor input, and use default values if not provided.
         actor_input = await Actor.get_input() or {}
-        start_urls = [url.get('url') for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}])]
+        start_urls = [
+            url.get('url')
+            for url in actor_input.get(
+                'start_urls',
+                [{'url': 'https://apify.com'}],
+            )
+        ]
 
         # Exit if no start URLs are provided.
         if not start_urls:

templates/python-playwright/src/main.py

Lines changed: 22 additions & 9 deletions
@@ -14,7 +14,8 @@
 
 # Note: To run this Actor locally, ensure that Playwright browsers are installed.
 # Run `playwright install --with-deps` in the Actor's virtual environment to install them.
-# When running on the Apify platform, these dependencies are already included in the Actor's Docker image.
+# When running on the Apify platform, these dependencies are already included
+# in the Actor's Docker image.
 
 
 async def main() -> None:
@@ -24,6 +25,7 @@ async def main() -> None:
     Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
     the field of web scraping significantly.
     """
+    # Enter the context of the Actor.
     async with Actor:
         # Retrieve the Actor input, and use default values if not provided.
         actor_input = await Actor.get_input() or {}
@@ -42,38 +44,49 @@ async def main() -> None:
         for start_url in start_urls:
             url = start_url.get('url')
             Actor.log.info(f'Enqueuing {url} ...')
-            request = Request.from_url(url, user_data={'depth': 0})
-            await request_queue.add_request(request)
+            new_request = Request.from_url(url, user_data={'depth': 0})
+            await request_queue.add_request(new_request)
 
         Actor.log.info('Launching Playwright...')
 
         # Launch Playwright and open a new browser context.
         async with async_playwright() as playwright:
             # Configure the browser to launch in headless mode as per Actor configuration.
-            browser = await playwright.chromium.launch(headless=Actor.config.headless, args=['--disable-gpu'])
+            browser = await playwright.chromium.launch(
+                headless=Actor.config.headless,
+                args=['--disable-gpu'],
+            )
             context = await browser.new_context()
 
             # Process the URLs from the request queue.
             while request := await request_queue.fetch_next_request():
                 url = request.url
-                depth = request.user_data['depth']
-                Actor.log.info(f'Scraping {url} ...')
+
+                if not isinstance(request.user_data['depth'], (str, int)):
+                    raise TypeError('Request.depth is an unexpected type.')
+
+                depth = int(request.user_data['depth'])
+                Actor.log.info(f'Scraping {url} (depth={depth}) ...')
 
                 try:
                     # Open a new page in the browser context and navigate to the URL.
                     page = await context.new_page()
                     await page.goto(url)
 
-                    # If the current depth is less than max_depth, find nested links and enqueue them.
+                    # If the current depth is less than max_depth, find nested links
+                    # and enqueue them.
                     if depth < max_depth:
                         for link in await page.locator('a').all():
                             link_href = await link.get_attribute('href')
                             link_url = urljoin(url, link_href)
 
                             if link_url.startswith(('http://', 'https://')):
                                 Actor.log.info(f'Enqueuing {link_url} ...')
-                                request = Request.from_url(link_url, user_data={'depth': depth + 1})
-                                await request_queue.add_request(request)
+                                new_request = Request.from_url(
+                                    link_url,
+                                    user_data={'depth': depth + 1},
+                                )
+                                await request_queue.add_request(new_request)
 
                     # Extract the desired data.
                     data = {

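Both this template and the BeautifulSoup and Selenium ones replace the direct `depth = request.user_data['depth']` read with a type check followed by an `int()` conversion, presumably because `user_data` values are persisted alongside the request and are not guaranteed to come back as the exact Python type that was stored. A minimal standalone sketch of that guard; the `read_depth` helper and the sample inputs are hypothetical, for illustration only.

```python
def read_depth(user_data: dict) -> int:
    """Normalize the crawl depth stored in a request's user_data to an int.

    Mirrors the template's guard: accept int or str, reject anything else.
    """
    depth_value = user_data.get('depth', 0)
    if not isinstance(depth_value, (str, int)):
        raise TypeError('Request depth is an unexpected type.')
    return int(depth_value)


# Both stored forms normalize to the same depth:
assert read_depth({'depth': 2}) == 2
assert read_depth({'depth': '2'}) == 2
```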
templates/python-selenium/src/main.py

Lines changed: 24 additions & 12 deletions
@@ -16,9 +16,10 @@
 from apify import Actor, Request
 
 # To run this Actor locally, you need to have the Selenium Chromedriver installed.
-# Follow the installation guide at: https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
-# When running on the Apify platform, the Chromedriver is already included in the Actor's Docker image.
-
+# Follow the installation guide at:
+# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
+# When running on the Apify platform, the Chromedriver is already included
+# in the Actor's Docker image.
 
 async def main() -> None:
     """Main entry point for the Apify Actor.
@@ -27,6 +28,7 @@ async def main() -> None:
     Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
     the field of web scraping significantly.
     """
+    # Enter the context of the Actor.
     async with Actor:
         # Retrieve the Actor input, and use default values if not provided.
         actor_input = await Actor.get_input() or {}
@@ -45,8 +47,8 @@ async def main() -> None:
         for start_url in start_urls:
             url = start_url.get('url')
             Actor.log.info(f'Enqueuing {url} ...')
-            request = Request.from_url(url, user_data={'depth': 0})
-            await request_queue.add_request(request)
+            new_request = Request.from_url(url, user_data={'depth': 0})
+            await request_queue.add_request(new_request)
 
         # Launch a new Selenium Chrome WebDriver and configure it.
         Actor.log.info('Launching Chrome WebDriver...')
@@ -61,28 +63,38 @@ async def main() -> None:
 
         # Test WebDriver setup by navigating to an example page.
         driver.get('http://www.example.com')
-        assert driver.title == 'Example Domain'
+        if driver.title != 'Example Domain':
+            raise ValueError('Failed to open example page.')
 
         # Process the URLs from the request queue.
        while request := await request_queue.fetch_next_request():
             url = request.url
-            depth = request.user_data['depth']
-            Actor.log.info(f'Scraping {url} ...')
+
+            if not isinstance(request.user_data['depth'], (str, int)):
+                raise TypeError('Request.depth is an unexpected type.')
+
+            depth = int(request.user_data['depth'])
+            Actor.log.info(f'Scraping {url} (depth={depth}) ...')
 
             try:
-                # Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread for non-blocking execution.
+                # Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread
+                # for non-blocking execution.
                 await asyncio.to_thread(driver.get, url)
 
-                # If the current depth is less than max_depth, find nested links and enqueue them.
+                # If the current depth is less than max_depth, find nested links
+                # and enqueue them.
                 if depth < max_depth:
                     for link in driver.find_elements(By.TAG_NAME, 'a'):
                         link_href = link.get_attribute('href')
                         link_url = urljoin(url, link_href)
 
                         if link_url.startswith(('http://', 'https://')):
                             Actor.log.info(f'Enqueuing {link_url} ...')
-                            request = Request.from_url(link_url, user_data={'depth': depth + 1})
-                            await request_queue.add_request(request)
+                            new_request = Request.from_url(
+                                link_url,
+                                user_data={'depth': depth + 1},
+                            )
+                            await request_queue.add_request(new_request)
 
                 # Extract the desired data.
                 data = {

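The Selenium template also swaps the bare `assert driver.title == 'Example Domain'` for an explicit check that raises `ValueError`. This matters because `assert` statements are removed entirely when Python runs with the `-O` optimization flag, so the sanity check would silently vanish in optimized runs. A small illustration of the difference, using a made-up `check_title` helper:

```python
def check_title(title: str) -> None:
    # A bare assert would be stripped out under `python -O`:
    # assert title == 'Example Domain'

    # The explicit check always runs, regardless of interpreter flags.
    if title != 'Example Domain':
        raise ValueError('Failed to open example page.')


check_title('Example Domain')    # passes silently
# check_title('Something else')  # would raise ValueError
```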