
Commit af921ae
fix: data menu data formatting
1 parent 6dc0121

File tree: 5 files changed (+6,581 / -60 lines)


backend/app/api/routes/scrapper.py

Lines changed: 129 additions & 58 deletions
@@ -15,6 +15,7 @@
     MenuSubCategoryCreate,
     MenuItemCreate
 )
+from app.api.routes.utils import transform_restaurant_data

 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
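
The newly imported transform_restaurant_data lives in app/api/routes/utils.py, one of the other four files touched by this commit but not shown in this excerpt. Its implementation is not visible here; all the call site in the new /menu/scrape endpoint below guarantees is that it takes the parsed PRELOADED_STATE dict and returns a mapping with at least a 'menu' key. A hypothetical stub of that contract, for orientation only:

    # Hypothetical stub -- the real implementation is in app/api/routes/utils.py,
    # which is part of this commit but not reproduced here.
    def transform_restaurant_data(json_data: dict) -> dict:
        # Must return something the /menu/scrape route can index as scraped_data['menu'].
        return {"menu": []}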
@@ -26,10 +27,9 @@ class ZomatoUrl(BaseModel):

 def fetch_zomato_data(url: str) -> str:
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Language': 'en-US,en;q=0.5',
-        'Connection': 'keep-alive',
     }

     try:
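
The request itself happens inside the try block, which this hunk does not touch. For readers without the full file, a minimal sketch of a fetch using these headers with httpx (the client already used elsewhere in this module); the synchronous call, redirect handling, and timeout are assumptions, not part of the diff:

    import httpx

    def fetch_zomato_data_sketch(url: str) -> str:
        # Only the headers mirror the diff; everything else here is assumed.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }
        response = httpx.get(url, headers=headers, follow_redirects=True, timeout=30.0)
        response.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
        return response.text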
@@ -43,33 +43,25 @@ def fetch_zomato_data(url: str) -> str:

 def parse_zomato_page(html_content: str) -> dict:
     try:
-        logger.debug("Starting page parsing...")
         soup = BeautifulSoup(html_content, 'html.parser')
-
         scripts = soup.find_all('script')
-        target_script = None

         for script in scripts:
             if script.string and 'window.__PRELOADED_STATE__' in script.string:
-                target_script = script
-                break
-
-        if not target_script:
-            raise ValueError("Could not find PRELOADED_STATE in page")
-
-        json_str = target_script.string.split('window.__PRELOADED_STATE__ = JSON.parse(')[1]
-        json_str = json_str.split(');')[0].strip()
-        json_str = json_str.strip('"').replace('\\"', '"').replace('\\\\', '\\').replace('\\n', '')
-
-        return json.loads(json_str)
-
+                json_str = script.string.split('window.__PRELOADED_STATE__ = JSON.parse(')[1]
+                json_str = json_str.split(');')[0].strip()
+                json_str = json_str.strip('"').replace('\\"', '"').replace('\\\\', '\\')
+                return json.loads(json_str)
+
+        raise ValueError("Could not find PRELOADED_STATE in page")
     except Exception as e:
         logger.error(f"Error parsing page: {str(e)}")
         raise

 def extract_menu_data(json_data: dict) -> dict:
     try:
-        logger.debug("Starting menu data extraction...")
+        with open('data.json', 'w') as f:
+            json.dump(json_data, f, indent=4)
         restaurant_data = json_data.get('pages', {}).get('current', {})
         restaurant_details = json_data.get('pages', {}).get('restaurant', {})

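The rewritten parser decodes the embedded state inline as soon as it finds the matching <script> tag, and raises only after the loop exhausts every script. The string surgery is easy to check in isolation; here is a self-contained demo on a toy page (the embedded JSON is made up, but the escaping mirrors Zomato's JSON.parse("...") wrapper):

    from bs4 import BeautifulSoup
    import json

    # Toy page standing in for a real Zomato response.
    html = '<script>window.__PRELOADED_STATE__ = JSON.parse("{\\"pages\\": {\\"current\\": {}}}");</script>'
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script'):
        if script.string and 'window.__PRELOADED_STATE__' in script.string:
            json_str = script.string.split('window.__PRELOADED_STATE__ = JSON.parse(')[1]
            json_str = json_str.split(');')[0].strip()
            json_str = json_str.strip('"').replace('\\"', '"').replace('\\\\', '\\')
            print(json.loads(json_str))  # -> {'pages': {'current': {}}}
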
@@ -79,56 +71,99 @@ def extract_menu_data(json_data: dict) -> dict:

         res_info = restaurant_details[restaurant_id].get('sections', {})
         basic_info = res_info.get('SECTION_BASIC_INFO', {})
-
+        menu_widget = res_info.get('SECTION_MENU_WIDGET', {})
+
+
+        # Validate required name field
+        if not basic_info.get('name'):
+            raise ValueError("Restaurant name is required")
+
         restaurant_info = {
-            'name': basic_info.get('name'),
-            'cuisines': basic_info.get('cuisine_string'),
-            'rating': {
-                'aggregate_rating': basic_info.get('rating', {}).get('aggregate_rating'),
-                'votes': basic_info.get('rating', {}).get('votes'),
-                'rating_text': basic_info.get('rating', {}).get('rating_text')
-            },
-            'location': {
-                'locality': restaurant_data.get('pageDescription', ''),
-                'url': basic_info.get('resUrl')
-            },
-            'timing': {
-                'description': basic_info.get('timing', {}).get('timing_desc'),
-                'hours': basic_info.get('timing', {}).get('customised_timings', {}).get('opening_hours', [])
+            'cuisine_type': basic_info.get('cuisine_string', ''),
+            'venue': {
+                'name': basic_info.get('name', ''),
+                'address': basic_info.get('address', ''),
+                'locality': basic_info.get('locality_verbose', ''),
+                'city': basic_info.get('city', ''),
+                'latitude': basic_info.get('latitude', '0'),
+                'longitude': basic_info.get('longitude', '0'),
+                'zipcode': basic_info.get('zipcode', ''),
+                'rating': basic_info.get('rating', {}).get('aggregate_rating', '0'),
+                'timing': basic_info.get('timing', {}).get('timing', ''),
+                'avg_cost_for_two': basic_info.get('average_cost_for_two', 0)
             }
         }
-
+
+
         menu_categories = []
-        menu_data = res_info.get('SECTION_MENU_WIDGET', {})
-
-        for category in menu_data.get('categories', []):
-            category_items = {
-                'category': category.get('name', ''),
-                'items': []
+        print("Catorgies", menu_widget.get('menu', {}).get('categories', []))
+        for category in menu_widget.get('menu', {}).get('categories', []):
+            print("category", category)
+            category_data = {
+                'name': category.get('name', ''),
+                'description': category.get('description', ''),
+                'subcategories': []
             }

+            # Group items by subcategory
+            subcategories = {}
             for item in category.get('items', []):
+                subcategory_name = item.get('category', 'Other')
+
+                if subcategory_name not in subcategories:
+                    subcategories[subcategory_name] = {
+                        'name': subcategory_name,
+                        'description': '',
+                        'items': []
+                    }
+
                 menu_item = {
-                    'name': item.get('name'),
-                    'description': item.get('description', ''),
-                    'price': float(item.get('price', 0)),
-                    'image_url': item.get('imageUrl', ''),
+                    'name': item.get('name', ''),
+                    'description': item.get('desc', ''),
                     'is_veg': item.get('isVeg', True),
-                    'spice_level': item.get('spiceLevel', 'None')
+                    'image_url': item.get('itemImage', ''),
+                    'variants': []
                 }
-                category_items['items'].append(menu_item)
+
+                # Handle variants
+                if item.get('variantsV2'):
+                    for variant in item['variantsV2']:
+                        menu_item['variants'].append({
+                            'name': variant.get('variantName', ''),
+                            'price': float(variant.get('price', 0)) / 100,
+                            'is_default': variant.get('isDefault', False)
+                        })
+                else:
+                    menu_item['variants'].append({
+                        'name': 'Regular',
+                        'price': float(item.get('defaultPrice', 0)) / 100,
+                        'is_default': True
+                    })
+
+                subcategories[subcategory_name]['items'].append(menu_item)

-            if category_items['items']:
-                menu_categories.append(category_items)
-
+            # Add non-empty subcategories to category
+            category_data['subcategories'] = [
+                subcat for subcat in subcategories.values()
+                if subcat['items']
+            ]
+
+            if category_data['subcategories']:
+                menu_categories.append(category_data)
+
         return {
+            'status': 'success',
             'restaurant_info': restaurant_info,
             'menu': menu_categories
         }
-
+
     except Exception as e:
         logger.error(f"Error extracting menu data: {str(e)}")
-        raise
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to extract menu data: {str(e)}"
+        )
+

 async def create_restaurant(client: httpx.AsyncClient, restaurant_data: RestaurantCreate):
     response = await client.post("/venue/restaurants/", json=restaurant_data.dict())
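
The new /100 on variant and default prices suggests the scraped payload carries amounts in minor currency units (paise rather than rupees). That interpretation, and the payload below, are assumptions, but the conversion itself is easy to sanity-check:

    # Hypothetical variant payload; 'price' assumed to be in paise.
    raw_variant = {'variantName': 'Half', 'price': 27500, 'isDefault': True}
    price = float(raw_variant.get('price', 0)) / 100
    assert price == 275.0  # 27,500 paise -> 275.00 rupees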
@@ -161,18 +196,21 @@ async def scrape_and_create_menu(
     current_user: UserBusiness = Depends(get_business_user)
 ):
     try:
-        # 1. Scrape data
+        # 1. Scrape and log data
         html_content = fetch_zomato_data(str(request.url))
         json_data = parse_zomato_page(html_content)
         scraped_data = extract_menu_data(json_data)
+
+        logger.info("Scraped Restaurant Info:")
+        logger.info(f"Name: {scraped_data['restaurant_info']['name']}")
+        logger.info(f"Cuisines: {scraped_data['restaurant_info']['cuisines']}")

         async with httpx.AsyncClient() as client:
             # 2. Create Restaurant
             venue_data = VenueCreate(
                 name=scraped_data['restaurant_info']['name'],
                 description="Restaurant imported from Zomato",
-                opening_time=scraped_data['restaurant_info']['timing']['opening_time'],
-                avg_expense_for_two=float(scraped_data['restaurant_info']['avg_cost_for_two']),
+                avg_expense_for_two=scraped_data['restaurant_info']['avg_cost_for_two'],
                 zomato_link=str(request.url)
             )

@@ -196,15 +234,13 @@ async def scrape_and_create_menu(

             # 4. Create Categories, Subcategories, and Items sequentially
             for category in scraped_data['menu']:
-                # Create Category
                 category_data = MenuCategoryCreate(
                     name=category['category'],
                     menu_id=menu_id
                 )
                 category_id = await create_category(client, category_data)
                 logger.info(f"Created category: {category['category']}")

-                # Create default subcategory
                 subcategory_data = MenuSubCategoryCreate(
                     name=f"{category['category']} Items",
                     category_id=category_id,
@@ -213,7 +249,6 @@
                 subcategory_id = await create_subcategory(client, subcategory_data)
                 logger.info(f"Created subcategory for {category['category']}")

-                # Create items one by one
                 for item in category['items']:
                     item_data = MenuItemCreate(
                         name=item['name'],
@@ -230,9 +265,45 @@
             "message": "Menu successfully created",
             "venue_id": str(venue_id),
             "menu_id": str(menu_id),
-            "restaurant_name": scraped_data['restaurant_info']['name']
+            "restaurant_name": scraped_data['restaurant_info']['name'],
+            "scraped_data": scraped_data
         }

     except Exception as e:
         logger.error(f"Menu creation failed: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Failed to create menu: {str(e)}")
+
+
+@router.get("/menu/scrape")
+async def get_scraped_menu(url: str):
+    try:
+        # Validate URL
+        zomato_url = ZomatoUrl(url=url)
+
+        # Scrape data using existing functions
+        html_content = fetch_zomato_data(str(zomato_url.url))
+        json_data = parse_zomato_page(html_content)
+        # print(json_data)
+        # scraped_data = extract_menu_data(json_data)
+        print("==== cleaning the data ====")
+        scraped_data = transform_restaurant_data(json_data)
+        print(scraped_data)
+
+        return {
+            "status": "success",
+            "restaurant_info": scraped_data,
+            "menu": scraped_data['menu']
+        }
+
+    except ValueError as ve:
+        logger.error(f"Invalid URL format: {str(ve)}")
+        raise HTTPException(
+            status_code=400,
+            detail=f"Invalid Zomato URL: {str(ve)}"
+        )
+    except Exception as e:
+        logger.error(f"Scraping failed: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to scrape menu data: {str(e)}"
+        )
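
A quick way to exercise the new read-only endpoint, assuming the router is mounted at the app root on a local dev server (host, port, and route prefix are assumptions):

    import httpx

    # Hypothetical invocation; adjust the base URL to however this router is mounted.
    resp = httpx.get(
        "http://localhost:8000/menu/scrape",
        params={"url": "https://www.zomato.com/city/some-restaurant"},
        timeout=60.0,
    )
    resp.raise_for_status()
    payload = resp.json()
    print(payload["status"], len(payload["menu"]))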
