Skip to content

Commit 3fbc56e

Browse files
committed
update scrape
1 parent f917cef commit 3fbc56e

File tree

4 files changed

+120
-28
lines changed

4 files changed

+120
-28
lines changed

pipelines/crawl/test.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
"""Ad-hoc check: download one princeton.edu event page and print its parsed HTML."""
import requests
from bs4 import BeautifulSoup

EVENT_URL = "https://www.princeton.edu/events/2025/millstone-exhibition-jasper-waldman-0"

response = requests.get(EVENT_URL, timeout=2)
# Explicit status check instead of `assert`: assertions are removed when
# Python runs with -O, which would silently let an error page be parsed.
if response.status_code != 200:
    raise RuntimeError(
        "unexpected HTTP status {} for {}".format(response.status_code, EVENT_URL)
    )
soup = BeautifulSoup(response.text, 'html.parser')
# Re-serialize the parsed tree so the output reflects what the parser saw.
text = str(soup)
print(text)

pipelines/extension/scrape.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ def get_weather():
2626
def get_menus():
2727
dining_halls = {
2828
"Whitman": 8,
29-
"Wucox": 2,
3029
"Roma": 1,
3130
"Forbes": 3,
3231
"Center for Jewish Life": 5,
@@ -39,22 +38,35 @@ def get_menus():
3938
text = requests.get(url).text
4039
soup = BeautifulSoup(text, features="lxml")
4140
menus = soup.findAll("div", {"class" : "card mealCard"})
42-
4341
for menu in menus:
44-
text = menu.text.replace("Nutrition", '').replace('\r', '')
45-
items = [item.strip() for item in text.split("\n") if item.strip()]
46-
subitems = {}
47-
category = None
48-
for i in range(1, len(items)):
49-
if items[i][0] == '-':
50-
key = items[i].replace('-- ', '').replace(' --', '')
51-
subitems[key] = []
52-
category = key
53-
else:
54-
subitems[category].append(items[i])
55-
dhall_result[items[0]] = subitems
42+
header_div = menu.select_one(".card-header")
43+
card_header = header_div.find(string=True, recursive=False).strip()
44+
45+
# 2) Stations -> foods mapping
46+
stations = {}
47+
accordion = menu.select_one(".accordion.accordion-flush")
48+
49+
# Each .mealStation is followed by one or more .accordion-item(s)
50+
# until the next .mealStation (or the end)
51+
for station_div in accordion.select("div.mealStation"):
52+
station_name = station_div.get_text(strip=True)
53+
foods = []
54+
55+
# Walk forward through siblings until the next mealStation
56+
for sib in station_div.find_next_siblings():
57+
classes = sib.get("class", [])
58+
if "mealStation" in classes:
59+
break # next station reached
60+
if "accordion-item" in classes:
61+
title_el = sib.select_one(".title")
62+
if title_el:
63+
foods.append(title_el.get_text(strip=True))
64+
65+
stations[station_name] = foods
66+
67+
dhall_result[card_header] = stations
68+
5669
result[dhall] = dhall_result
57-
result["_id"] = "dhall"
5870
return result
5971

6072
def get_prince():

pipelines/extension/scrape_new.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,34 @@ def get_menus():
8080
text = requests.get(url).text
8181
soup = BeautifulSoup(text, features="lxml")
8282
menus = soup.findAll("div", {"class" : "card mealCard"})
83-
8483
for menu in menus:
85-
text = menu.text.replace("Nutrition", '').replace('\r', '')
86-
items = [item.strip() for item in text.split("\n") if item.strip()]
87-
subitems = {}
88-
category = None
89-
for i in range(1, len(items)):
90-
if items[i][0] == '-':
91-
key = items[i].replace('-- ', '').replace(' --', '')
92-
subitems[key] = []
93-
category = key
94-
else:
95-
subitems[category].append(items[i])
96-
dhall_result[items[0]] = subitems
84+
header_div = menu.select_one(".card-header")
85+
card_header = header_div.find(string=True, recursive=False).strip()
86+
87+
# 2) Stations -> foods mapping
88+
stations = {}
89+
accordion = menu.select_one(".accordion.accordion-flush")
90+
91+
# Each .mealStation is followed by one or more .accordion-item(s)
92+
# until the next .mealStation (or the end)
93+
for station_div in accordion.select("div.mealStation"):
94+
station_name = station_div.get_text(strip=True)
95+
foods = []
96+
97+
# Walk forward through siblings until the next mealStation
98+
for sib in station_div.find_next_siblings():
99+
classes = sib.get("class", [])
100+
if "mealStation" in classes:
101+
break # next station reached
102+
if "accordion-item" in classes:
103+
title_el = sib.select_one(".title")
104+
if title_el:
105+
foods.append(title_el.get_text(strip=True))
106+
107+
stations[station_name] = foods
108+
109+
dhall_result[card_header] = stations
110+
97111
result[dhall] = dhall_result
98112
return result
99113

pipelines/extension/test.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
4+
# Upper bound (seconds) for each menu request, so a stalled server cannot
# hang the whole scrape.
_MENU_REQUEST_TIMEOUT = 10


def _parse_meal_card(menu):
    """Extract the meal name and its station -> foods mapping from one meal card.

    Args:
        menu: The BeautifulSoup tag of a single ``div.card.mealCard`` element.

    Returns:
        A ``(meal_name, stations)`` tuple, where ``stations`` maps each
        station name to the list of food titles listed under it, or ``None``
        when the card has no ``.card-header`` text to use as the meal name.
    """
    header_div = menu.select_one(".card-header")
    if header_div is None:
        return None
    # Only the header's own direct text node is the meal name; nested
    # elements inside the header are ignored.
    header_text = header_div.find(string=True, recursive=False)
    if header_text is None:
        return None
    card_header = header_text.strip()

    stations = {}
    accordion = menu.select_one(".accordion.accordion-flush")
    if accordion is None:
        # A card without an accordion simply has no stations to report.
        return card_header, stations

    # Each .mealStation is followed by one or more .accordion-item siblings
    # that belong to it, until the next .mealStation (or the end).
    for station_div in accordion.select("div.mealStation"):
        foods = []
        for sib in station_div.find_next_siblings():
            classes = sib.get("class", [])
            if "mealStation" in classes:
                break  # next station reached
            if "accordion-item" in classes:
                title_el = sib.select_one(".title")
                if title_el is not None:
                    foods.append(title_el.get_text(strip=True))
        stations[station_div.get_text(strip=True)] = foods

    return card_header, stations


def get_menus():
    """Scrape the current menus of the configured dining halls.

    Fetches the menu-details page of every hall in ``dining_halls`` and
    parses each ``div.card.mealCard`` element on it.

    Returns:
        A nested dict of the form
        ``{dining_hall: {meal_name: {station_name: [food, ...]}}}``.
        A hall whose page contains no meal cards maps to an empty dict.
        Cards without a usable header are skipped.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
    """
    dining_halls = {
        "Whitman": 8,
        "Roma": 1,
        "Forbes": 3,
        "Center for Jewish Life": 5,
        "Yeh/NCW": 6,
    }
    result = {}
    for dhall, index in dining_halls.items():
        dhall_result = {}
        url = "https://menus.princeton.edu/dining/_Foodpro/online-menu/menuDetails.asp?locationNum={:02d}".format(index)
        text = requests.get(url, timeout=_MENU_REQUEST_TIMEOUT).text
        soup = BeautifulSoup(text, features="lxml")
        # find_all is the current name; findAll is a deprecated alias.
        for menu in soup.find_all("div", {"class": "card mealCard"}):
            parsed = _parse_meal_card(menu)
            if parsed is None:
                continue
            card_header, stations = parsed
            dhall_result[card_header] = stations
        result[dhall] = dhall_result
    return result
49+
50+
# Manual check: run the scraper and print the nested dict it returns.
print(get_menus())
51+
52+
# items[0] is something like "Entree"
53+
# subitems is a list, something like ["French Toast", "Scrambled Eggs"]
54+
55+
# orderedData.push({
56+
# cat: priority[i],
57+
# items: dhallData[priority[i]].slice(0, 3).join(", "),
58+
# });

0 commit comments

Comments
 (0)