Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/joho/godotenv v1.5.1
github.com/valyala/fastjson v1.6.4
go.mongodb.org/mongo-driver v1.15.0
golang.org/x/net v0.21.0
)

require (
Expand Down Expand Up @@ -50,7 +51,6 @@ require (
github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a // indirect
golang.org/x/arch v0.7.0 // indirect
golang.org/x/crypto v0.22.0 // indirect
golang.org/x/net v0.24.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.29.0 // indirect
golang.org/x/text v0.14.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
Expand Down
10 changes: 6 additions & 4 deletions parser/courseParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"regexp"
"strconv"

"github.com/PuerkitoBio/goquery"

"github.com/UTDNebula/api-tools/utils"
"github.com/UTDNebula/nebula-api/api/schema"
"go.mongodb.org/mongo-driver/bson/primitive"
Expand All @@ -31,7 +33,7 @@ func getCatalogYear(session schema.AcademicSession) string {
}
}

func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) *schema.Course {
func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]*goquery.Selection, classInfo map[string]string) *schema.Course {
// Courses are internally keyed by their internal course number and the catalog year they're part of
catalogYear := getCatalogYear(session)
courseKey := courseNum + catalogYear
Expand All @@ -51,9 +53,9 @@ func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[s
course.Id = primitive.NewObjectID()
course.Course_number = idMatches[2]
course.Subject_prefix = idMatches[1]
course.Title = rowInfo["Course Title:"]
course.Description = rowInfo["Description:"]
course.School = rowInfo["College:"]
course.Title = utils.TrimWhitespace(rowInfo["Course Title:"].Text())
course.Description = utils.TrimWhitespace(rowInfo["Description:"].Text())
course.School = utils.TrimWhitespace(rowInfo["College:"].Text())
course.Credit_hours = classInfo["Semester Credit Hours:"]
course.Class_level = classInfo["Class Level:"]
course.Activity_type = classInfo["Activity Type:"]
Expand Down
15 changes: 8 additions & 7 deletions parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,19 +117,20 @@ func parse(path string) {
var syllabusURI string

// Dictionary to hold the row data, keyed by row header
rowInfo := make(map[string]string, len(infoRows.Nodes))
rowInfo := make(map[string]*goquery.Selection, len(infoRows.Nodes))

// Populate rowInfo
infoRows.Each(func(_ int, row *goquery.Selection) {
rowHeader := utils.TrimWhitespace(row.FindMatcher(goquery.Single("th")).Text())
rowData := row.FindMatcher(goquery.Single("td"))
rowInfo[rowHeader] = utils.TrimWhitespace(rowData.Text())
// Get syllabusURI from syllabus row link
if rowHeader == "Syllabus:" {
syllabusURI, _ = rowData.FindMatcher(goquery.Single("a")).Attr("href")
}
rowInfo[rowHeader] = row.FindMatcher(goquery.Single("td"))

})

// Get syllabusURI from the syllabus row link. rowInfo is keyed by the exact
// (whitespace-trimmed) header text, which includes the capital letter and the
// trailing colon, so the key must be "Syllabus:" rather than "syllabus".
// A missing href leaves syllabusURI empty, which is the intended fallback.
if syllabus, ok := rowInfo["Syllabus:"]; ok {
	syllabusURI, _ = syllabus.FindMatcher(goquery.Single("a")).Attr("href")
}

// Get the rows of the class info subtable
infoSubTable := infoTable.FindMatcher(goquery.Single("table.courseinfo__classsubtable > tbody"))
infoRows = infoSubTable.ChildrenFiltered("tr")
Expand Down
6 changes: 4 additions & 2 deletions parser/professorParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ package parser
import (
"strings"

"github.com/PuerkitoBio/goquery"

"github.com/UTDNebula/api-tools/utils"
"github.com/UTDNebula/nebula-api/api/schema"
"go.mongodb.org/mongo-driver/bson/primitive"
)

func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]string, classInfo map[string]string) []primitive.ObjectID {
professorText := rowInfo["Instructor(s):"]
func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]*goquery.Selection, classInfo map[string]string) []primitive.ObjectID {
professorText := utils.TrimWhitespace(rowInfo["Instructor(s):"].Text())
professorMatches := personRegexp.FindAllStringSubmatch(professorText, -1)
var profRefs []primitive.ObjectID = make([]primitive.ObjectID, 0, len(professorMatches))
for _, match := range professorMatches {
Expand Down
8 changes: 5 additions & 3 deletions parser/requisiteParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
"strconv"
"strings"

"github.com/PuerkitoBio/goquery"

"github.com/UTDNebula/api-tools/utils"
"github.com/UTDNebula/nebula-api/api/schema"
)
Expand Down Expand Up @@ -367,16 +369,16 @@ var coreqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(Corequisites?:(.*))`)
var reqRegexes [3]*regexp.Regexp = [3]*regexp.Regexp{preOrCoreqRegexp, prereqRegexp, coreqRegexp}

// Returns a closure that parses the course's requisites
func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs string) func() {
func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs *goquery.Selection) func() {
return func() {
// Pointer array to course requisite properties must be in same order as reqRegexes above
courseReqs := [3]**schema.CollectionRequirement{&course.Co_or_pre_requisites, &course.Prerequisites, &course.Corequisites}
// The actual text to check for requisites
var checkText string
// Extract req text from the enrollment req info if it exists, otherwise try using the description
if hasEnrollmentReqs {
course.Enrollment_reqs = enrollmentReqs
checkText = enrollmentReqs
course.Enrollment_reqs = utils.TrimWhitespace(enrollmentReqs.Text())
checkText = utils.TrimWhitespace(enrollmentReqs.Text())
} else {
checkText = course.Description
}
Expand Down
145 changes: 81 additions & 64 deletions parser/sectionParser.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
package parser

import (
"fmt"
"regexp"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/UTDNebula/api-tools/utils"
"github.com/UTDNebula/nebula-api/api/schema"
"go.mongodb.org/mongo-driver/bson/primitive"
"golang.org/x/net/html/atom"
)

// sectionPrefixRegexp is matched against the "Class Section:" value in parseSection.
// It is anchored at the start, case-insensitive, and expects the utils.R_SUBJ_COURSE
// pattern, a literal dot, then a captured utils.R_SECTION_CODE.
// NOTE(review): the capture-group numbering depends on groups inside R_SUBJ_COURSE — confirm in utils.
var sectionPrefixRegexp *regexp.Regexp = utils.Regexpf(`^(?i)%s\.(%s)`, utils.R_SUBJ_COURSE, utils.R_SECTION_CODE)

// coreRegexp matches each run of three consecutive digits; parseSection uses it to
// pull the core flag codes out of the "Core:" row text.
var coreRegexp *regexp.Regexp = regexp.MustCompile(`[0-9]{3}`)

// personRegexp captures three non-empty fields separated by the "・" character.
// It is applied to the "Instructor(s):" and "TA/RA(s):" row text.
// NOTE(review): presumably the fields are name, role and email — confirm against the callers.
var personRegexp *regexp.Regexp = regexp.MustCompile(`(.+)・(.+)・(.+)`)

func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) {
func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, session schema.AcademicSession, rowInfo map[string]*goquery.Selection, classInfo map[string]string) {
// Get subject prefix and course number by doing a regexp match on the section id
sectionId := classInfo["Class Section:"]
idMatches := sectionPrefixRegexp.FindStringSubmatch(sectionId)
Expand All @@ -34,7 +35,7 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,
section.Professors = parseProfessors(section.Id, rowInfo, classInfo)

// Get all TA/RA info
assistantText := rowInfo["TA/RA(s):"]
assistantText := utils.TrimWhitespace(rowInfo["TA/RA(s):"].Text())
assistantMatches := personRegexp.FindAllStringSubmatch(assistantText, -1)
section.Teaching_assistants = make([]schema.Assistant, 0, len(assistantMatches))
for _, match := range assistantMatches {
Expand All @@ -50,18 +51,17 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,

section.Internal_class_number = classNum
section.Instruction_mode = classInfo["Instruction Mode:"]
section.Meetings = getMeetings(rowInfo, classInfo)
section.Meetings = getMeetings(rowInfo)

// Parse core flags (may or may not exist)
coreText, hasCore := rowInfo["Core:"]
if hasCore {
section.Core_flags = coreRegexp.FindAllString(coreText, -1)

if coreText, hasCore := rowInfo["Core:"]; hasCore {
section.Core_flags = coreRegexp.FindAllString(utils.TrimWhitespace(coreText.Text()), -1)
}

section.Syllabus_uri = syllabusURI

semesterGrades, exists := GradeMap[session.Name]
if exists {
if semesterGrades, ok := GradeMap[session.Name]; ok {
// We have to trim leading zeroes from the section number in order to match properly, since the grade data does not use leading zeroes
trimmedSectionNumber := strings.TrimLeft(section.Section_number, "0")
// Key into grademap should be uppercased like the grade data
Expand All @@ -79,76 +79,93 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,
courseRef.Sections = append(courseRef.Sections, section.Id)
}

var termRegexp *regexp.Regexp = utils.Regexpf(`(?i)Term: (%s)`, utils.R_TERM_CODE)
var datesRegexp *regexp.Regexp = utils.Regexpf(`(?:Start|End)s: (%s)`, utils.R_DATE_MDY)

func getAcademicSession(rowInfo map[string]string) schema.AcademicSession {
func getAcademicSession(rowInfo map[string]*goquery.Selection) schema.AcademicSession {
session := schema.AcademicSession{}
scheduleText := rowInfo["Schedule:"]

session.Name = termRegexp.FindStringSubmatch(scheduleText)[1]
dateMatches := datesRegexp.FindAllStringSubmatch(scheduleText, -1)

datesFound := len(dateMatches)
switch {
case datesFound == 1:
startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation)
if err != nil {
panic(err)
}
session.Start_date = startDate
case datesFound == 2:
startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation)
if err != nil {
panic(err)
infoNodes := rowInfo["Schedule:"].FindMatcher(goquery.Single("p.courseinfo__sectionterm")).Contents().Nodes
for _, node := range infoNodes {
if node.DataAtom == atom.B {
//since the key is not a TextElement, the Text is stored in it's first child, a TextElement
key := utils.TrimWhitespace(node.FirstChild.Data)
value := utils.TrimWhitespace(node.NextSibling.Data)

switch key {
case "Term:":
session.Name = value
case "Starts:":
session.Start_date = parseTimeOrPanic(value)
case "Ends:":
session.End_date = parseTimeOrPanic(value)
}
}
endDate, err := time.ParseInLocation("January 2, 2006", dateMatches[1][1], timeLocation)
if err != nil {
panic(err)
}
session.Start_date = startDate
session.End_date = endDate
}
return session
}

var meetingsRegexp *regexp.Regexp = utils.Regexpf(`(%s)-(%s)\W+((?:%s(?:, )?)+)\W+(%s)-(%s)(?:\W+(?:(\S+)\s+(\S+)))`, utils.R_DATE_MDY, utils.R_DATE_MDY, utils.R_WEEKDAY, utils.R_TIME_AM_PM, utils.R_TIME_AM_PM)
// The three patterns below are applied independently by getMeetings to the text of a
// single meeting paragraph; each one collects every occurrence of its pattern.

// meetingDatesRegexp finds every date matching utils.R_DATE_MDY.
var meetingDatesRegexp = regexp.MustCompile(utils.R_DATE_MDY)

// meetingDaysRegexp finds every weekday matching utils.R_WEEKDAY.
var meetingDaysRegexp = regexp.MustCompile(utils.R_WEEKDAY)

// meetingTimesRegexp finds every clock time matching utils.R_TIME_AM_PM.
var meetingTimesRegexp = regexp.MustCompile(utils.R_TIME_AM_PM)

func getMeetings(rowInfo map[string]string, classInfo map[string]string) []schema.Meeting {
scheduleText := rowInfo["Schedule:"]
meetingMatches := meetingsRegexp.FindAllStringSubmatch(scheduleText, -1)
var meetings []schema.Meeting = make([]schema.Meeting, 0, len(meetingMatches))
for _, match := range meetingMatches {
meeting := schema.Meeting{}
func getMeetings(rowInfo map[string]*goquery.Selection) []schema.Meeting {
meetingItems := rowInfo["Schedule:"].Find("div.courseinfo__meeting-item--multiple")
var meetings []schema.Meeting = make([]schema.Meeting, 0, meetingItems.Length())

startDate, err := time.ParseInLocation("January 2, 2006", match[1], timeLocation)
if err != nil {
panic(err)
meetingItems.Each(func(i int, s *goquery.Selection) {
meeting := schema.Meeting{}
meetingInfo := s.FindMatcher(goquery.Single("p.courseinfo__meeting-time"))

dates := meetingDatesRegexp.FindAllString(meetingInfo.Text(), -1)
if len(dates) == 2 {
meeting.Start_date = parseTimeOrPanic(dates[0])
meeting.End_date = parseTimeOrPanic(dates[1])
} else if len(dates) == 1 {
meeting.Start_date = parseTimeOrPanic(dates[0])
meeting.End_date = meeting.Start_date
}
meeting.Start_date = startDate

endDate, err := time.ParseInLocation("January 2, 2006", match[2], timeLocation)
if err != nil {
panic(err)
days := meetingDaysRegexp.FindAllString(meetingInfo.Text(), -1)
if days != nil {
meeting.Meeting_days = days
} else {
meeting.Meeting_days = []string{} //avoid null in the json
}
meeting.End_date = endDate

meeting.Meeting_days = strings.Split(match[3], ", ")

// Don't parse time into time object, adds unnecessary extra data
meeting.Start_time = match[4]
meeting.End_time = match[5]

// Only add location data if it's available
if len(match) > 6 {
location := schema.Location{}
location.Building = match[6]
location.Room = match[7]
location.Map_uri = fmt.Sprintf("https://locator.utdallas.edu/%s_%s", location.Building, location.Room)
meeting.Location = location
times := meetingTimesRegexp.FindAllString(meetingInfo.Text(), -1)
if len(times) == 2 {
meeting.Start_time = times[0]
meeting.End_time = times[1]
} else if len(times) == 1 {
meeting.Start_time = times[0]
meeting.End_time = meeting.Start_time
}

if locationInfo := meetingInfo.FindMatcher(goquery.Single("a")); locationInfo != nil {
mapUri := locationInfo.AttrOr("href", "")

//only add locations for meetings that have actual data, all meetings have a link some are not visible or empty
if mapUri != "" && mapUri != "https://locator.utdallas.edu/" && mapUri != "https://locator.utdallas.edu/ONLINE" {
splitText := strings.Split(utils.TrimWhitespace(locationInfo.Text()), " ")

if len(splitText) == 2 {
meeting.Location = schema.Location{
Building: splitText[0],
Room: splitText[1],
Map_uri: mapUri,
}
}
}
}
meetings = append(meetings, meeting)
}
})
return meetings
}

// timeLayout is the reference layout for dates written as a full month name, the day
// of the month and a four-digit year.
const timeLayout = "January 2, 2006"

// parseTimeOrPanic parses value using timeLayout in the package's timeLocation and
// returns the result. It panics with the parse error if value does not match the
// layout, so callers should only pass text they expect to be a well-formed date.
func parseTimeOrPanic(value string) time.Time {
	parsed, parseErr := time.ParseInLocation(timeLayout, value, timeLocation)
	if parseErr == nil {
		return parsed
	}
	panic(parseErr)
}