    python profile_crawler.py https://www.erome.com/marieanita

"""
16+ from __future__ import annotations
1617
1718import logging
1819import re
def fetch_profile_page(url: str) -> BeautifulSoup:
    """Download the profile page at *url* and return its parsed HTML.

    Exits the process with status 1 if the HTTP request fails for any
    reason (connection error, timeout, or non-2xx status).
    """
    try:
        response = requests.get(url, timeout=10)
        # Raise for 4xx/5xx responses so they are handled like network errors.
        response.raise_for_status()
    except requests.RequestException as req_err:
        logging.exception(f"Error fetching the page: {req_err}")
        sys.exit(1)

    # Reached only on success: parse the body with the stdlib HTML parser.
    return BeautifulSoup(response.text, "html.parser")
43+
44+
45+ def extract_page_number (page_link : dict [str , str ]) -> int | None :
46+ """Extract the page number from a URL."""
47+ try :
48+ # Extract page number using regex and convert to integer
49+ return int (re .search (r"page=(\d+)" , page_link ["href" ]).group (1 ))
50+
51+ except (AttributeError , ValueError , TypeError ) as err :
52+ message = f"Error extracting page index from { page_link ['href' ]} : { err } "
53+ logging .exception (message )
54+ return None
55+
4256
def get_profile_page_links(
    soup: BeautifulSoup,
    profile: str,
) -> list[str]:
    """Extract and profile page links from a BeautifulSoup object.

    Returns absolute URLs for the profile's pagination links, or an empty
    list if none are found or the soup cannot be processed.
    """
    try:
        # Regular expression to find all 'a' tags with href that match "?page="
        # followed by a number. re.escape keeps regex metacharacters in the
        # profile name (e.g. '.') from being interpreted as patterns.
        page_links = soup.find_all(
            "a",
            {"href": re.compile(f"/{re.escape(profile)}\\?page=\\d+")},
        )
    except (AttributeError, TypeError, KeyError) as err:
        message = f"An error occurred while processing the soup: {err}"
        logging.exception(message)
        return []

    # extract_page_number returns None for unparsable hrefs; drop those so
    # max() below cannot fail comparing None against int.
    page_numbers = [
        number
        for page_link in page_links
        if (number := extract_page_number(page_link)) is not None
    ]
    max_page_number = max(page_numbers) if page_numbers else None

    formatted_page_links = []
    if max_page_number is not None:
        # The last item of the page_links list isn't useful, so it is discarded
        formatted_page_links = [
            HOST_PAGE + page_link["href"] for page_link in page_links[:-1]
        ]

    return formatted_page_links
86+
8487
8588def extract_album_links_in_page (soup : BeautifulSoup ) -> list [str ]:
8689 """Extract album links from a BeautifulSoup object representing a webpage."""
@@ -125,19 +128,15 @@ def process_profile_url(url: str) -> None:
125128 generate_profile_dump (profile_album_links )
126129
127130 except ValueError as val_err :
128- message = f"Value error : { val_err } "
131+ message = f"Error occurred processing profile URL : { val_err } "
129132 logging .exception (message )
130133
131- finally :
132- console .print ("[green]✓[/green] Dump file successfully generated." )
134+ else :
135+ console .print ("[green]✓[/green] Dump file successfully generated.\n " )
133136
134137
def main() -> None:
    """Execute the profile album extraction process."""
    # Guard against a missing CLI argument; without this, sys.argv[1]
    # raises an uncaught IndexError when the script is run with no URL.
    if len(sys.argv) != 2:
        logging.error("Usage: python profile_crawler.py <profile_page_url>")
        sys.exit(1)

    url = sys.argv[1]
    process_profile_url(url)
143142
0 commit comments