
Commit 99f6887

[main] Added base for udel webscraper to improve course data in CourseScheduler
1 parent ae57ab5 commit 99f6887

2 files changed: +161 -1 lines changed


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -3,4 +3,5 @@
 obj
 bin
 .cache
-*.cache
+*.cache
+*.log

webscraperudel/webscraper.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
import time
from pprint import pprint
from requests.auth import HTTPBasicAuth
from bs4 import BeautifulSoup, ResultSet
from webdriver_manager.firefox import GeckoDriverManager
from selenium import webdriver

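# Scrape the UD Courses Search results page with Selenium and parse each
# result row into structured course data with BeautifulSoup.
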
campus_mapping = {
    'NEWRK': 'Newark',
    'DOVER': 'Dover',
    'GTOWN': 'Georgetown',
    'LEWES': 'Lewes',
    'WILM': 'Wilmington',
}

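# Single-letter day codes from the listings, assumed to follow the usual
# M/T/W/R/F convention (R = Thursday).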
course_mapping = {
    'M': 'Monday',
    'T': 'Tuesday',
    'W': 'Wednesday',
    'R': 'Thursday',
    'F': 'Friday'
}

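# Building-code abbreviations that appear in the course location column.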
location_mapping = {
    'GOR': 'Gore',
    'PRN': 'Purnell',
    'ALS': 'Alison',
    'LEH': 'Alfred Lerner',
    'MDH': 'McDowell',
    'MEM': 'Memorial',
    'SPL': 'Spencer Lab',
    'WHL': 'Willard'
}


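# Query string for the Courses Search results page: the term (2228) and most
# filters are fixed, only the subject prefix varies.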
def generate_search_endpoint(prefix):
    return f'term=2228&search_type={prefix}&course_sec={prefix}&session=All&course_title=&instr_name=&text_info=All&campus=&instrtn_mode=All&time_start_hh=&time_start_ampm=&credit=Any&keyword=&geneduc=&subj_area_code=&college='


def parse_course_name(name: str):
    """
    Arguments:
        name: the raw course name, which comes in like ACCT200010 and is parsed
              into its respective parts [ACCT, 200, 010]: the prefix, course
              number, and section
    Returns:
        The parsed course name as [prefix, number, section]
    """
    name_ = ''
    number_ = ''
    section_ = ''
    for char in name:
        if char.isdigit():
            # the first three digits are the course number, the rest is the section
            if len(number_) == 3:
                section_ += char
            else:
                number_ += char
        else:
            name_ += char
    return [name_, number_, section_]


def parse_course_days(daystr: str):
    """
    Arguments:
        daystr - The string of the days, e.g. MWF, TR, R
    Returns:
        The parsed daystr as an array of day names, e.g.
        ['Monday', 'Wednesday', 'Friday']
    """
    course_days = []
    if 'M' in daystr:
        course_days.append(course_mapping['M'])
    if 'T' in daystr:
        course_days.append(course_mapping['T'])
    if 'W' in daystr:
        course_days.append(course_mapping['W'])
    if 'R' in daystr:
        course_days.append(course_mapping['R'])
    if 'F' in daystr:
        course_days.append(course_mapping['F'])
    return course_days


def parse_course_time(timestr: str):
    """
    Arguments:
        timestr - The string of the time of the course, e.g. 3:30PM - 5:15PM
    Returns:
        The parsed time of the course as [lower, upper] bounds
    """
    # strip the AM/PM suffixes and split on the separator
    split_timestr = timestr.replace('AM', '').replace('PM', '').split(' - ')
    left_bound = split_timestr[0]
    right_bound = split_timestr[1]
    return [left_bound, right_bound]


def parse_course_location(locationstr: str):
    """
    Arguments:
        locationstr - The string of the location of the course, e.g. GOR116
    Returns:
        The parsed location as [building, room]
    """
    room = ''
    loc = ''
    for eachletter in locationstr:
        if eachletter.isdigit():
            room += eachletter
        else:
            loc += eachletter
    return [location_mapping[loc] if loc in location_mapping else loc, room]


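# Drive Firefox via webdriver-manager, load the search results for a subject
# prefix, and parse each result row from the rendered page source.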
def main():
    base_url = 'https://udapps.nss.udel.edu/CoursesSearch/search-results'
    br = webdriver.Firefox(executable_path=GeckoDriverManager().install())
    br.get('{}?{}'.format(base_url, generate_search_endpoint('A')))
    time.sleep(2)
    soup = BeautifulSoup(br.page_source, "html.parser")
    course_information = {}
    odd_rows: ResultSet = soup.find_all('tr', class_='odd')
    even_rows = soup.find_all('tr', class_='even')
    for eachrow in odd_rows:
        [name, number, section] = parse_course_name(eachrow.find(
            'a', class_='coursenum').string)
        print(eachrow.contents[3].string)
        print([name, number, section])
        # course_title = eachrow.children[1].text()
        # course_campus = eachrow.find('td', class_='campus').text().strip()
        # if course_campus in campus_mapping:
        #     course_campus = campus_mapping[course_campus]
        # course_credits = eachrow.children[4].text().replace('Hrs', '').strip()
        # course_days = parse_course_days(
        #     eachrow.find('td', class_='day').text().strip())
        # [start, end] = parse_course_time(
        #     eachrow.find('td', class_='time').text().strip())
        # course_location = parse_course_location(eachrow.find(
        #     'td', class_='location').children[0].text().strip())
        # course_information[name]: dict = {
        #     course_number: number,
        #     course_section: section,
        #     course_title: course_title,
        #     course_campus: course_campus,
        #     course_credits: int(course_credits),
        #     course_days: course_days,
        #     course_start_time: start,
        #     course_end_time: end,
        #     course_location: course_location,
        # }
    pprint(course_information)


if __name__ == '__main__':
    main()
