Skip to content

Commit ba272d9

Browse files
authored
Coursebook parser building/room bug #48 (#58)
* upgraded chromedp version * fixed room parsing issue for locations * refactor some lines * change default meeting days from null to empty list * minor refactor of getMeetings * remove redundant lines from getMeetings
1 parent f185ec4 commit ba272d9

File tree

7 files changed

+107
-83
lines changed

7 files changed

+107
-83
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ require (
1010
github.com/joho/godotenv v1.5.1
1111
github.com/valyala/fastjson v1.6.4
1212
go.mongodb.org/mongo-driver v1.15.0
13+
golang.org/x/net v0.21.0
1314
)
1415

1516
require (
@@ -50,7 +51,6 @@ require (
5051
github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a // indirect
5152
golang.org/x/arch v0.7.0 // indirect
5253
golang.org/x/crypto v0.22.0 // indirect
53-
golang.org/x/net v0.24.0 // indirect
5454
golang.org/x/sync v0.7.0 // indirect
5555
golang.org/x/sys v0.29.0 // indirect
5656
golang.org/x/text v0.14.0 // indirect

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,8 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
128128
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
129129
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
130130
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
131-
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
132-
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
131+
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
132+
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
133133
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
134134
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
135135
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=

parser/courseParser.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"regexp"
66
"strconv"
77

8+
"github.com/PuerkitoBio/goquery"
9+
810
"github.com/UTDNebula/api-tools/utils"
911
"github.com/UTDNebula/nebula-api/api/schema"
1012
"go.mongodb.org/mongo-driver/bson/primitive"
@@ -31,7 +33,7 @@ func getCatalogYear(session schema.AcademicSession) string {
3133
}
3234
}
3335

34-
func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) *schema.Course {
36+
func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]*goquery.Selection, classInfo map[string]string) *schema.Course {
3537
// Courses are internally keyed by their internal course number and the catalog year they're part of
3638
catalogYear := getCatalogYear(session)
3739
courseKey := courseNum + catalogYear
@@ -51,9 +53,9 @@ func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[s
5153
course.Id = primitive.NewObjectID()
5254
course.Course_number = idMatches[2]
5355
course.Subject_prefix = idMatches[1]
54-
course.Title = rowInfo["Course Title:"]
55-
course.Description = rowInfo["Description:"]
56-
course.School = rowInfo["College:"]
56+
course.Title = utils.TrimWhitespace(rowInfo["Course Title:"].Text())
57+
course.Description = utils.TrimWhitespace(rowInfo["Description:"].Text())
58+
course.School = utils.TrimWhitespace(rowInfo["College:"].Text())
5759
course.Credit_hours = classInfo["Semester Credit Hours:"]
5860
course.Class_level = classInfo["Class Level:"]
5961
course.Activity_type = classInfo["Activity Type:"]

parser/parser.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,19 +117,20 @@ func parse(path string) {
117117
var syllabusURI string
118118

119119
// Dictionary to hold the row data, keyed by row header
120-
rowInfo := make(map[string]string, len(infoRows.Nodes))
120+
rowInfo := make(map[string]*goquery.Selection, len(infoRows.Nodes))
121121

122122
// Populate rowInfo
123123
infoRows.Each(func(_ int, row *goquery.Selection) {
124124
rowHeader := utils.TrimWhitespace(row.FindMatcher(goquery.Single("th")).Text())
125-
rowData := row.FindMatcher(goquery.Single("td"))
126-
rowInfo[rowHeader] = utils.TrimWhitespace(rowData.Text())
127-
// Get syllabusURI from syllabus row link
128-
if rowHeader == "Syllabus:" {
129-
syllabusURI, _ = rowData.FindMatcher(goquery.Single("a")).Attr("href")
130-
}
125+
rowInfo[rowHeader] = row.FindMatcher(goquery.Single("td"))
126+
131127
})
132128

129+
// Get syllabusURI from syllabus row link
130+
if syllabus, ok := rowInfo["syllabus"]; ok {
131+
syllabusURI, _ = syllabus.FindMatcher(goquery.Single("a")).Attr("href")
132+
}
133+
133134
// Get the rows of the class info subtable
134135
infoSubTable := infoTable.FindMatcher(goquery.Single("table.courseinfo__classsubtable > tbody"))
135136
infoRows = infoSubTable.ChildrenFiltered("tr")

parser/professorParser.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@ package parser
33
import (
44
"strings"
55

6+
"github.com/PuerkitoBio/goquery"
7+
68
"github.com/UTDNebula/api-tools/utils"
79
"github.com/UTDNebula/nebula-api/api/schema"
810
"go.mongodb.org/mongo-driver/bson/primitive"
911
)
1012

11-
func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]string, classInfo map[string]string) []primitive.ObjectID {
12-
professorText := rowInfo["Instructor(s):"]
13+
func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]*goquery.Selection, classInfo map[string]string) []primitive.ObjectID {
14+
professorText := utils.TrimWhitespace(rowInfo["Instructor(s):"].Text())
1315
professorMatches := personRegexp.FindAllStringSubmatch(professorText, -1)
1416
var profRefs []primitive.ObjectID = make([]primitive.ObjectID, 0, len(professorMatches))
1517
for _, match := range professorMatches {

parser/requisiteParser.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77
"strconv"
88
"strings"
99

10+
"github.com/PuerkitoBio/goquery"
11+
1012
"github.com/UTDNebula/api-tools/utils"
1113
"github.com/UTDNebula/nebula-api/api/schema"
1214
)
@@ -367,16 +369,16 @@ var coreqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(Corequisites?:(.*))`)
367369
var reqRegexes [3]*regexp.Regexp = [3]*regexp.Regexp{preOrCoreqRegexp, prereqRegexp, coreqRegexp}
368370

369371
// Returns a closure that parses the course's requisites
370-
func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs string) func() {
372+
func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs *goquery.Selection) func() {
371373
return func() {
372374
// Pointer array to course requisite properties must be in same order as reqRegexes above
373375
courseReqs := [3]**schema.CollectionRequirement{&course.Co_or_pre_requisites, &course.Prerequisites, &course.Corequisites}
374376
// The actual text to check for requisites
375377
var checkText string
376378
// Extract req text from the enrollment req info if it exists, otherwise try using the description
377379
if hasEnrollmentReqs {
378-
course.Enrollment_reqs = enrollmentReqs
379-
checkText = enrollmentReqs
380+
course.Enrollment_reqs = utils.TrimWhitespace(enrollmentReqs.Text())
381+
checkText = utils.TrimWhitespace(enrollmentReqs.Text())
380382
} else {
381383
checkText = course.Description
382384
}

parser/sectionParser.go

Lines changed: 81 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
11
package parser
22

33
import (
4-
"fmt"
54
"regexp"
65
"strings"
76
"time"
87

8+
"github.com/PuerkitoBio/goquery"
99
"github.com/UTDNebula/api-tools/utils"
1010
"github.com/UTDNebula/nebula-api/api/schema"
1111
"go.mongodb.org/mongo-driver/bson/primitive"
12+
"golang.org/x/net/html/atom"
1213
)
1314

1415
var sectionPrefixRegexp *regexp.Regexp = utils.Regexpf(`^(?i)%s\.(%s)`, utils.R_SUBJ_COURSE, utils.R_SECTION_CODE)
1516
var coreRegexp *regexp.Regexp = regexp.MustCompile(`[0-9]{3}`)
1617
var personRegexp *regexp.Regexp = regexp.MustCompile(`(.+)・(.+)・(.+)`)
1718

18-
func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) {
19+
func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, session schema.AcademicSession, rowInfo map[string]*goquery.Selection, classInfo map[string]string) {
1920
// Get subject prefix and course number by doing a regexp match on the section id
2021
sectionId := classInfo["Class Section:"]
2122
idMatches := sectionPrefixRegexp.FindStringSubmatch(sectionId)
@@ -34,7 +35,7 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,
3435
section.Professors = parseProfessors(section.Id, rowInfo, classInfo)
3536

3637
// Get all TA/RA info
37-
assistantText := rowInfo["TA/RA(s):"]
38+
assistantText := utils.TrimWhitespace(rowInfo["TA/RA(s):"].Text())
3839
assistantMatches := personRegexp.FindAllStringSubmatch(assistantText, -1)
3940
section.Teaching_assistants = make([]schema.Assistant, 0, len(assistantMatches))
4041
for _, match := range assistantMatches {
@@ -50,18 +51,17 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,
5051

5152
section.Internal_class_number = classNum
5253
section.Instruction_mode = classInfo["Instruction Mode:"]
53-
section.Meetings = getMeetings(rowInfo, classInfo)
54+
section.Meetings = getMeetings(rowInfo)
5455

5556
// Parse core flags (may or may not exist)
56-
coreText, hasCore := rowInfo["Core:"]
57-
if hasCore {
58-
section.Core_flags = coreRegexp.FindAllString(coreText, -1)
57+
58+
if coreText, hasCore := rowInfo["Core:"]; hasCore {
59+
section.Core_flags = coreRegexp.FindAllString(utils.TrimWhitespace(coreText.Text()), -1)
5960
}
6061

6162
section.Syllabus_uri = syllabusURI
6263

63-
semesterGrades, exists := GradeMap[session.Name]
64-
if exists {
64+
if semesterGrades, ok := GradeMap[session.Name]; ok {
6565
// We have to trim leading zeroes from the section number in order to match properly, since the grade data does not use leading zeroes
6666
trimmedSectionNumber := strings.TrimLeft(section.Section_number, "0")
6767
// Key into grademap should be uppercased like the grade data
@@ -79,76 +79,93 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,
7979
courseRef.Sections = append(courseRef.Sections, section.Id)
8080
}
8181

82-
var termRegexp *regexp.Regexp = utils.Regexpf(`(?i)Term: (%s)`, utils.R_TERM_CODE)
83-
var datesRegexp *regexp.Regexp = utils.Regexpf(`(?:Start|End)s: (%s)`, utils.R_DATE_MDY)
84-
85-
func getAcademicSession(rowInfo map[string]string) schema.AcademicSession {
82+
func getAcademicSession(rowInfo map[string]*goquery.Selection) schema.AcademicSession {
8683
session := schema.AcademicSession{}
87-
scheduleText := rowInfo["Schedule:"]
8884

89-
session.Name = termRegexp.FindStringSubmatch(scheduleText)[1]
90-
dateMatches := datesRegexp.FindAllStringSubmatch(scheduleText, -1)
91-
92-
datesFound := len(dateMatches)
93-
switch {
94-
case datesFound == 1:
95-
startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation)
96-
if err != nil {
97-
panic(err)
98-
}
99-
session.Start_date = startDate
100-
case datesFound == 2:
101-
startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation)
102-
if err != nil {
103-
panic(err)
85+
infoNodes := rowInfo["Schedule:"].FindMatcher(goquery.Single("p.courseinfo__sectionterm")).Contents().Nodes
86+
for _, node := range infoNodes {
87+
if node.DataAtom == atom.B {
88+
// Since the key is an element node (<b>) rather than a text node, its text is stored in its first child, a text node
89+
key := utils.TrimWhitespace(node.FirstChild.Data)
90+
value := utils.TrimWhitespace(node.NextSibling.Data)
91+
92+
switch key {
93+
case "Term:":
94+
session.Name = value
95+
case "Starts:":
96+
session.Start_date = parseTimeOrPanic(value)
97+
case "Ends:":
98+
session.End_date = parseTimeOrPanic(value)
99+
}
104100
}
105-
endDate, err := time.ParseInLocation("January 2, 2006", dateMatches[1][1], timeLocation)
106-
if err != nil {
107-
panic(err)
108-
}
109-
session.Start_date = startDate
110-
session.End_date = endDate
111101
}
112102
return session
113103
}
114104

115-
var meetingsRegexp *regexp.Regexp = utils.Regexpf(`(%s)-(%s)\W+((?:%s(?:, )?)+)\W+(%s)-(%s)(?:\W+(?:(\S+)\s+(\S+)))`, utils.R_DATE_MDY, utils.R_DATE_MDY, utils.R_WEEKDAY, utils.R_TIME_AM_PM, utils.R_TIME_AM_PM)
105+
var meetingDatesRegexp = regexp.MustCompile(utils.R_DATE_MDY)
106+
var meetingDaysRegexp = regexp.MustCompile(utils.R_WEEKDAY)
107+
var meetingTimesRegexp = regexp.MustCompile(utils.R_TIME_AM_PM)
116108

117-
func getMeetings(rowInfo map[string]string, classInfo map[string]string) []schema.Meeting {
118-
scheduleText := rowInfo["Schedule:"]
119-
meetingMatches := meetingsRegexp.FindAllStringSubmatch(scheduleText, -1)
120-
var meetings []schema.Meeting = make([]schema.Meeting, 0, len(meetingMatches))
121-
for _, match := range meetingMatches {
122-
meeting := schema.Meeting{}
109+
func getMeetings(rowInfo map[string]*goquery.Selection) []schema.Meeting {
110+
meetingItems := rowInfo["Schedule:"].Find("div.courseinfo__meeting-item--multiple")
111+
var meetings []schema.Meeting = make([]schema.Meeting, 0, meetingItems.Length())
123112

124-
startDate, err := time.ParseInLocation("January 2, 2006", match[1], timeLocation)
125-
if err != nil {
126-
panic(err)
113+
meetingItems.Each(func(i int, s *goquery.Selection) {
114+
meeting := schema.Meeting{}
115+
meetingInfo := s.FindMatcher(goquery.Single("p.courseinfo__meeting-time"))
116+
117+
dates := meetingDatesRegexp.FindAllString(meetingInfo.Text(), -1)
118+
if len(dates) == 2 {
119+
meeting.Start_date = parseTimeOrPanic(dates[0])
120+
meeting.End_date = parseTimeOrPanic(dates[1])
121+
} else if len(dates) == 1 {
122+
meeting.Start_date = parseTimeOrPanic(dates[0])
123+
meeting.End_date = meeting.Start_date
127124
}
128-
meeting.Start_date = startDate
129125

130-
endDate, err := time.ParseInLocation("January 2, 2006", match[2], timeLocation)
131-
if err != nil {
132-
panic(err)
126+
days := meetingDaysRegexp.FindAllString(meetingInfo.Text(), -1)
127+
if days != nil {
128+
meeting.Meeting_days = days
129+
} else {
130+
meeting.Meeting_days = []string{} //avoid null in the json
133131
}
134-
meeting.End_date = endDate
135-
136-
meeting.Meeting_days = strings.Split(match[3], ", ")
137132

138-
// Don't parse time into time object, adds unnecessary extra data
139-
meeting.Start_time = match[4]
140-
meeting.End_time = match[5]
141-
142-
// Only add location data if it's available
143-
if len(match) > 6 {
144-
location := schema.Location{}
145-
location.Building = match[6]
146-
location.Room = match[7]
147-
location.Map_uri = fmt.Sprintf("https://locator.utdallas.edu/%s_%s", location.Building, location.Room)
148-
meeting.Location = location
133+
times := meetingTimesRegexp.FindAllString(meetingInfo.Text(), -1)
134+
if len(times) == 2 {
135+
meeting.Start_time = times[0]
136+
meeting.End_time = times[1]
137+
} else if len(times) == 1 {
138+
meeting.Start_time = times[0]
139+
meeting.End_time = meeting.Start_time
149140
}
150141

142+
if locationInfo := meetingInfo.FindMatcher(goquery.Single("a")); locationInfo != nil {
143+
mapUri := locationInfo.AttrOr("href", "")
144+
145+
//only add locations for meetings that have actual data, all meetings have a link some are not visible or empty
146+
if mapUri != "" && mapUri != "https://locator.utdallas.edu/" && mapUri != "https://locator.utdallas.edu/ONLINE" {
147+
splitText := strings.Split(utils.TrimWhitespace(locationInfo.Text()), " ")
148+
149+
if len(splitText) == 2 {
150+
meeting.Location = schema.Location{
151+
Building: splitText[0],
152+
Room: splitText[1],
153+
Map_uri: mapUri,
154+
}
155+
}
156+
}
157+
}
151158
meetings = append(meetings, meeting)
152-
}
159+
})
153160
return meetings
154161
}
162+
163+
const timeLayout = "January 2, 2006"
164+
165+
func parseTimeOrPanic(value string) time.Time {
166+
date, err := time.ParseInLocation(timeLayout, value, timeLocation)
167+
if err != nil {
168+
panic(err)
169+
}
170+
return date
171+
}

0 commit comments

Comments
 (0)