1414from urllib .parse import urljoin
1515
1616import cloudscraper
17+ import requests
1718from bs4 import BeautifulSoup , ResultSet
1819from tqdm import tqdm
1920
# URL path fragment used to recognize links that point at individual
# restaurant detail pages (vs. other site links on the list view).
RESTAURANT_PATH_MARKER = "/restaurants/"
# Shared cloudscraper session: behaves like requests but solves
# Cloudflare's anti-bot challenge, impersonating desktop Firefox on Windows.
scraper = cloudscraper.create_scraper(
    browser={"browser": "firefox", "platform": "windows", "mobile": False},
)
3833
3934
4035def fetch_url (url : str , error_context : str ) -> bytes | None :
41- """
42- Fetch URL content with unified error handling.
43- """
44-
36+ """Fetch URL content with unified error handling."""
4537 try :
4638 response = scraper .get (url , timeout = 15 )
4739
@@ -51,7 +43,7 @@ def fetch_url(url: str, error_context: str) -> bytes | None:
5143
5244 return response .content
5345
54- except Exception as e :
46+ except requests . exceptions . RequestException as e :
5547 logger .error ("%s error for %s (%s): %s" , type (e ).__name__ , error_context , url , e )
5648 return None
5749
@@ -63,7 +55,6 @@ def find_restaurant_schema(schema_scripts: ResultSet) -> dict | None:
6355 Searches for @type: "Restaurant" in the schema data, handling both
6456 single objects and arrays of objects.
6557 """
66-
6758 for script in schema_scripts :
6859 try :
6960 data = json .loads (script .string )
@@ -86,7 +77,6 @@ def find_restaurant_schema(schema_scripts: ResultSet) -> dict | None:
8677
8778def extract_schema_data (url : str ) -> dict | None :
8879 """Extract Restaurant schema.org data from a restaurant page."""
89-
9080 data = fetch_url (url , "restaurant page" )
9181 if not data :
9282 return None
@@ -99,7 +89,6 @@ def extract_schema_data(url: str) -> dict | None:
9989
10090def get_restaurant_links (list_url : str ) -> list [str ]:
10191 """Get all unique restaurant links from the list view page."""
102-
10392 data = fetch_url (list_url , "list page" )
10493 if not data :
10594 return []
@@ -128,7 +117,6 @@ def flatten_schema_data(schema: dict) -> dict[str, str]:
128117
129118 Extracts common fields, address components, and geo coordinates.
130119 """
131-
132120 flat = {}
133121
134122 # Map schema.org field names to CSV column names
@@ -170,7 +158,6 @@ def flatten_schema_data(schema: dict) -> dict[str, str]:
170158
171159def save_to_csv (restaurants : list [dict ], output_path : Path ) -> None :
172160 """Save restaurant data to CSV file."""
173-
174161 # Collect all unique field names
175162 fieldnames = sorted (set ().union (* (r .keys () for r in restaurants )))
176163
@@ -185,7 +172,6 @@ def save_to_csv(restaurants: list[dict], output_path: Path) -> None:
185172
186173def main () -> None :
187174 """Hey, I just met you, and this is crazy, but I'm the main function, so call me maybe."""
188-
189175 # Fetch restaurant URLs
190176 logger .info ("Fetching restaurant links..." )
191177 restaurant_urls = get_restaurant_links (BASE_URL )
@@ -216,4 +202,4 @@ def main() -> None:
216202
217203
# Script entry point — run the scraper only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
0 commit comments