Skip to content

Commit bf706de

Browse files
committed
[main] Added full parsing for pre-req and all course fields
1 parent 5ff6a44 commit bf706de

File tree

1 file changed

+215
-42
lines changed

1 file changed

+215
-42
lines changed

webscraperudel/webscraper.py

Lines changed: 215 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
from bs4 import BeautifulSoup, ResultSet
77
from webdriver_manager.firefox import GeckoDriverManager
88
from selenium import webdriver
9+
import re
910

1011
ALL_COURSE_URL = 'https://udapps.nss.udel.edu/CoursesSearch/search-results?term=2228&search_type=A&course_sec=&session=All&course_title=&instr_name=&text_info=All&campus=&instrtn_mode=All&time_start_hh=&time_start_ampm=&credit=Any&keyword=&geneduc=&subj_area_code=&college='
12+
not_available = 'N/A'
13+
course_information = {}
1114

1215
campus_mapping = {
1316
'NEWRK': 'Newark',
@@ -22,7 +25,7 @@
2225
'T': 'Tuesday',
2326
'W': 'Wednesday',
2427
'TR': 'Thursday',
25-
'R': 'Friday'
28+
'F': 'Friday',
2629
}
2730

2831
location_mapping = {
@@ -77,16 +80,12 @@ def parse_course_days(daystr: str):
7780
course_days = []
7881
if 'M' in daystr:
7982
course_days.append(course_mapping['M'])
80-
elif 'W' in daystr:
83+
if 'W' in daystr:
8184
course_days.append(course_mapping['W'])
82-
elif 'F' in daystr:
85+
if 'F' in daystr:
8386
course_days.append(course_mapping['F'])
84-
elif 'TR' in daystr:
87+
if 'TR' in daystr:
8588
course_days.append(course_mapping['TR'])
86-
elif 'R' in daystr and 'TR' not in daystr:
87-
course_days.append(course_mapping['R'])
88-
elif 'R' in daystr and daystr.count('R') == 2:
89-
course_days.append(course_mapping['R'])
9089
return course_days
9190

9291

@@ -97,10 +96,16 @@ def parse_course_time(timestr: str):
9796
Returns:
9897
The parsed time of the course, [lower, upper] bounds
9998
"""
100-
split_timestr = timestr.replace('PM', '').split(' - ')
99+
split_timestr = timestr.split(' - ')
101100
left_bound = split_timestr[0]
102101
right_bound = split_timestr[1]
103-
return [left_bound, right_bound]
102+
if '\n' in right_bound:
103+
_ind = right_bound.index('\n')
104+
right_bound = right_bound[0:_ind].strip()
105+
right_bound = right_bound.replace(
106+
'AM' if 'AM' in right_bound else 'PM', '')
107+
is_am = 'AM' in right_bound
108+
return [left_bound, right_bound, is_am]
104109

105110

106111
def parse_course_location(locationstr: str):
@@ -121,44 +126,212 @@ def parse_course_location(locationstr: str):
121126

122127

123128
def main():
    """Scrape the udel.edu course-search results and print what was parsed.

    Fetches the search-results page, reads one course section per table
    row, follows the row's detail link to collect prerequisite and
    corequisite courses, stores one dict per section in the module-level
    ``course_information`` mapping (keyed by name + number + section) and
    pretty-prints that mapping when all rows are processed.

    Fields that cannot be extracted from a row are stored as
    ``not_available``; a detail page that cannot be fetched yields empty
    ``prereqs`` / ``coreqs`` lists.

    Raises:
        requests.HTTPError: if the search-results page answers with a
            4xx/5xx status.
    """
    # The spacing in this literal is undone by the ' // ' -> '//'
    # replacement applied when the detail URL is built below.
    base_url = 'https: // udapps.nss.udel.edu/CoursesSearch/'
    page = requests.get('{}?{}'.format(
        ALL_COURSE_URL, generate_search_endpoint('A')))
    # A Response is falsy for error statuses, so the former
    # `while (not page): pass` busy-looped forever on a failed request.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    rows: ResultSet = soup.tbody.find_all('tr')

    for eachrow in rows:
        try:
            name, number, section = parse_course_name(
                eachrow.find('td', class_='course').a.text)
        except Exception:
            # Best effort: rows without a parsable course cell are kept.
            name = number = section = not_available

        stored_result = {
            'name': name,
            'number': number,
            'section': section,
            # NOTE(review): splits on a single space and keeps the first
            # token only -- confirm this is the intended separator.
            'title': _field_or_na(
                lambda: eachrow.contents[3].text.strip().split(' ')[
                    0].strip()),
            'campus': _field_or_na(
                lambda: campus_mapping[eachrow.find(
                    'td', class_='campus').text.strip()]),
            'total_seats': _field_or_na(
                lambda: eachrow.find(
                    'td', class_='openseats').text.strip().replace(
                        'CURRENTLY FULL', '').split(' OF ')[1].strip()),
            'credits': _field_or_na(
                lambda: eachrow.find(
                    'td', string=re.compile('Hrs')).text.strip().split(
                        ' Hrs')[0]),
            'day': _field_or_na(
                lambda: parse_course_days(
                    eachrow.find('td', class_='day').text.strip())),
            'course_time': _field_or_na(
                lambda: parse_course_time(
                    eachrow.find('td', class_='time').text.strip())),
            'location': _field_or_na(
                lambda: parse_course_location(
                    eachrow.find('td', class_='location').a.text.strip())),
            'teacher': _field_or_na(
                lambda: eachrow.contents[len(
                    eachrow.contents) - 4].text.strip()),
        }

        # Fetch the detail page once per row.  Resetting to None on failure
        # guarantees a previous row's page is never parsed for this course.
        try:
            course_detail_link = eachrow.find_all(
                'a', class_='coursenum')[0]['href']
            url_ = f'{base_url}{course_detail_link}'.replace(
                ' // ', '//').replace('§ion', '&section')
            course_detail_page = requests.get(url_)
            souped_content = BeautifulSoup(
                course_detail_page.content, 'html.parser')
        except Exception:
            souped_content = None

        course_prereqs = []
        course_coreqs = []
        if souped_content is not None:
            course_prereqs = _collect_requisites(
                souped_content, name, ('PREREQ', 'Prerequisites'))
            course_coreqs = _collect_requisites(
                souped_content, name, ('COREQ', 'Corequisites'))

        stored_result['prereqs'] = course_prereqs
        stored_result['coreqs'] = course_coreqs
        course_information[f'{name}{number}{section}'] = stored_result
    pprint(course_information)


def _field_or_na(extract):
    """Return ``extract()``, or ``not_available`` if extraction raises."""
    try:
        return extract()
    except Exception:
        # Deliberate best effort: a missing cell must not abort the scrape.
        return not_available


def _collect_requisites(detail_soup, subject, markers):
    """Parse requisites from the <p> tags of a detail page matching *markers*."""
    paragraphs = detail_soup.find_all(
        'p', string=re.compile('|'.join(markers)))
    return _parse_requisites(
        [paragraph.text for paragraph in paragraphs], subject, markers)


def _parse_requisites(paragraph_texts, subject, markers):
    """Extract requisite course names from paragraph strings.

    For each text containing one of *markers*, the clause that starts at
    the first occurrence of *subject* after the marker and ends at the next
    '.' is split on ' or ' (preferred) or ' and '.

    Returns:
        The distinct, stripped course names in order of appearance.  When
        at least one course was found, a trailing bool is appended that is
        True if any clause was an ' or ' alternative list.
    """
    courses = []
    any_of = False
    for text in paragraph_texts:
        marker = next((m for m in markers if m in text), None)
        if marker is None:
            continue
        tail = text[text.index(marker) + len(marker):]
        start = tail.find(subject)
        if start == -1:
            continue
        tail = tail[start:]
        end = tail.find('.')
        if end == -1:
            continue
        clause = tail[:end]
        # Match whole-word separators only; a bare 'or' substring test
        # would also fire on words such as "instructor".
        if ' or ' in clause:
            parts = clause.split(' or ')
            any_of = True
        elif ' and ' in clause:
            parts = clause.split(' and ')
        else:
            parts = [clause]
        for part in parts:
            course = part.strip()
            if course and course not in courses:
                courses.append(course)
    if courses:
        courses.append(any_of)
    return courses
163336

164337

0 commit comments

Comments
 (0)