@@ -21,6 +21,7 @@ async def main() -> None:
     Asynchronous execution is required for communication with Apify platform, and it also enhances performance in
     the field of web scraping significantly.
     """
+    # Enter the context of the Actor.
     async with Actor:
         # Retrieve the Actor input, and use default values if not provided.
         actor_input = await Actor.get_input() or {}
@@ -39,49 +40,58 @@ async def main() -> None:
         for start_url in start_urls:
             url = start_url.get('url')
             Actor.log.info(f'Enqueuing {url} ...')
-            request = Request.from_url(url, user_data={'depth': 0})
-            await request_queue.add_request(request)
-
-        # Process the URLs from the request queue.
-        while request := await request_queue.fetch_next_request():
-            url = request.url
-            depth = request.user_data['depth']
-            Actor.log.info(f'Scraping {url} ...')
-
-            try:
-                # Fetch the HTTP response from the specified URL using HTTPX.
-                async with AsyncClient() as client:
+            new_request = Request.from_url(url, user_data={'depth': 0})
+            await request_queue.add_request(new_request)
+
+        # Create an HTTPX client to fetch the HTML content of the URLs.
+        async with AsyncClient() as client:
+            # Process the URLs from the request queue.
+            while request := await request_queue.fetch_next_request():
+                url = request.url
+
+                if not isinstance(request.user_data['depth'], (str, int)):
+                    raise TypeError('Request.depth is an unexpected type.')
+
+                depth = int(request.user_data['depth'])
+                Actor.log.info(f'Scraping {url} (depth={depth}) ...')
+
+                try:
+                    # Fetch the HTTP response from the specified URL using HTTPX.
                     response = await client.get(url, follow_redirects=True)

-                # Parse the HTML content using Beautiful Soup.
-                soup = BeautifulSoup(response.content, 'html.parser')
-
-                # If the current depth is less than max_depth, find nested links and enqueue them.
-                if depth < max_depth:
-                    for link in soup.find_all('a'):
-                        link_href = link.get('href')
-                        link_url = urljoin(url, link_href)
-
-                        if link_url.startswith(('http://', 'https://')):
-                            Actor.log.info(f'Enqueuing {link_url} ...')
-                            request = Request.from_url(link_url, user_data={'depth': depth + 1})
-                            await request_queue.add_request(request)
-
-                # Extract the desired data.
-                data = {
-                    'url': url,
-                    'title': soup.title.string if soup.title else None,
-                    'h1s': [h1.text for h1 in soup.find_all('h1')],
-                    'h2s': [h2.text for h2 in soup.find_all('h2')],
-                    'h3s': [h3.text for h3 in soup.find_all('h3')],
-                }
-
-                # Store the extracted data to the default dataset.
-                await Actor.push_data(data)
-
-            except Exception:
-                Actor.log.exception(f'Cannot extract data from {url}.')
-
-            finally:
-                # Mark the request as handled to ensure it is not processed again.
-                await request_queue.mark_request_as_handled(request)
+                    # Parse the HTML content using Beautiful Soup.
+                    soup = BeautifulSoup(response.content, 'html.parser')
+
+                    # If the current depth is less than max_depth, find nested links
+                    # and enqueue them.
+                    if depth < max_depth:
+                        for link in soup.find_all('a'):
+                            link_href = link.get('href')
+                            link_url = urljoin(url, link_href)
+
+                            if link_url.startswith(('http://', 'https://')):
+                                Actor.log.info(f'Enqueuing {link_url} ...')
+                                new_request = Request.from_url(
+                                    link_url,
+                                    user_data={'depth': depth + 1},
+                                )
+                                await request_queue.add_request(new_request)
+
+                    # Extract the desired data.
+                    data = {
+                        'url': url,
+                        'title': soup.title.string if soup.title else None,
+                        'h1s': [h1.text for h1 in soup.find_all('h1')],
+                        'h2s': [h2.text for h2 in soup.find_all('h2')],
+                        'h3s': [h3.text for h3 in soup.find_all('h3')],
+                    }
+
+                    # Store the extracted data to the default dataset.
+                    await Actor.push_data(data)
+
+                except Exception:
+                    Actor.log.exception(f'Cannot extract data from {url}.')
+
+                finally:
+                    # Mark the request as handled to ensure it is not processed again.
+                    await request_queue.mark_request_as_handled(request)
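A note on how this coroutine runs: main() is asynchronous, which is what the docstring's remark about asynchronous execution refers to, so something has to drive it on an event loop. Below is a minimal sketch of the usual entrypoint, assuming the standard Apify Python Actor template layout in which main() lives in src/main.py next to a src/__main__.py module; the file name and import path are assumptions, not part of this diff.

# src/__main__.py -- sketch of an entrypoint, assuming the standard template layout.
import asyncio

# The async main() coroutine shown in the diff above (assumed module path).
from .main import main

# Drive the coroutine on an event loop; everything inside it (Actor.push_data(),
# request_queue.fetch_next_request(), client.get(), ...) is awaited there.
asyncio.run(main())

With this layout the Actor can typically be started locally with `apify run` (Apify CLI) or directly with `python -m src`.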