diff --git a/go.mod b/go.mod index e8952e7..11723d5 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/joho/godotenv v1.5.1 github.com/valyala/fastjson v1.6.4 go.mongodb.org/mongo-driver v1.15.0 + golang.org/x/net v0.21.0 ) require ( @@ -50,7 +51,6 @@ require ( github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a // indirect golang.org/x/arch v0.7.0 // indirect golang.org/x/crypto v0.22.0 // indirect - golang.org/x/net v0.24.0 // indirect golang.org/x/sync v0.7.0 // indirect golang.org/x/sys v0.29.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/go.sum b/go.sum index b7d9fdd..5a80b22 100644 --- a/go.sum +++ b/go.sum @@ -128,8 +128,8 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= -golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= +golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= diff --git a/parser/courseParser.go b/parser/courseParser.go index 484e232..bc7ea33 100644 --- a/parser/courseParser.go +++ b/parser/courseParser.go @@ -5,6 +5,8 @@ import ( "regexp" "strconv" + "github.com/PuerkitoBio/goquery" + "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" 
"go.mongodb.org/mongo-driver/bson/primitive" @@ -31,7 +33,7 @@ func getCatalogYear(session schema.AcademicSession) string { } } -func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) *schema.Course { +func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[string]*goquery.Selection, classInfo map[string]string) *schema.Course { // Courses are internally keyed by their internal course number and the catalog year they're part of catalogYear := getCatalogYear(session) courseKey := courseNum + catalogYear @@ -51,9 +53,9 @@ func parseCourse(courseNum string, session schema.AcademicSession, rowInfo map[s course.Id = primitive.NewObjectID() course.Course_number = idMatches[2] course.Subject_prefix = idMatches[1] - course.Title = rowInfo["Course Title:"] - course.Description = rowInfo["Description:"] - course.School = rowInfo["College:"] + course.Title = utils.TrimWhitespace(rowInfo["Course Title:"].Text()) + course.Description = utils.TrimWhitespace(rowInfo["Description:"].Text()) + course.School = utils.TrimWhitespace(rowInfo["College:"].Text()) course.Credit_hours = classInfo["Semester Credit Hours:"] course.Class_level = classInfo["Class Level:"] course.Activity_type = classInfo["Activity Type:"] diff --git a/parser/parser.go b/parser/parser.go index 606e5b2..421f5eb 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -117,19 +117,20 @@ func parse(path string) { var syllabusURI string // Dictionary to hold the row data, keyed by row header - rowInfo := make(map[string]string, len(infoRows.Nodes)) + rowInfo := make(map[string]*goquery.Selection, len(infoRows.Nodes)) // Populate rowInfo infoRows.Each(func(_ int, row *goquery.Selection) { rowHeader := utils.TrimWhitespace(row.FindMatcher(goquery.Single("th")).Text()) - rowData := row.FindMatcher(goquery.Single("td")) - rowInfo[rowHeader] = utils.TrimWhitespace(rowData.Text()) - // Get syllabusURI from syllabus row link - if 
rowHeader == "Syllabus:" { - syllabusURI, _ = rowData.FindMatcher(goquery.Single("a")).Attr("href") - } + rowInfo[rowHeader] = row.FindMatcher(goquery.Single("td")) + }) + // Get syllabusURI from the "Syllabus:" row link (rowInfo keys are the trimmed row headers, colon included) + if syllabus, ok := rowInfo["Syllabus:"]; ok { + syllabusURI, _ = syllabus.FindMatcher(goquery.Single("a")).Attr("href") + } + // Get the rows of the class info subtable infoSubTable := infoTable.FindMatcher(goquery.Single("table.courseinfo__classsubtable > tbody")) infoRows = infoSubTable.ChildrenFiltered("tr") diff --git a/parser/professorParser.go b/parser/professorParser.go index 2184074..667a45a 100644 --- a/parser/professorParser.go +++ b/parser/professorParser.go @@ -3,13 +3,15 @@ package parser import ( "strings" + "github.com/PuerkitoBio/goquery" + "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" "go.mongodb.org/mongo-driver/bson/primitive" ) -func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]string, classInfo map[string]string) []primitive.ObjectID { - professorText := rowInfo["Instructor(s):"] +func parseProfessors(sectionId primitive.ObjectID, rowInfo map[string]*goquery.Selection, classInfo map[string]string) []primitive.ObjectID { + professorText := utils.TrimWhitespace(rowInfo["Instructor(s):"].Text()) professorMatches := personRegexp.FindAllStringSubmatch(professorText, -1) var profRefs []primitive.ObjectID = make([]primitive.ObjectID, 0, len(professorMatches)) for _, match := range professorMatches { diff --git a/parser/requisiteParser.go b/parser/requisiteParser.go index 095f150..743521a 100644 --- a/parser/requisiteParser.go +++ b/parser/requisiteParser.go @@ -7,6 +7,8 @@ import ( "strconv" "strings" + "github.com/PuerkitoBio/goquery" + "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" ) @@ -367,7 +369,7 @@ var coreqRegexp *regexp.Regexp = regexp.MustCompile(`(?i)(Corequisites?:(.*))`) var reqRegexes [3]*regexp.Regexp =
[3]*regexp.Regexp{preOrCoreqRegexp, prereqRegexp, coreqRegexp} // Returns a closure that parses the course's requisites -func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs string) func() { +func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs *goquery.Selection) func() { return func() { // Pointer array to course requisite properties must be in same order as reqRegexes above courseReqs := [3]**schema.CollectionRequirement{&course.Co_or_pre_requisites, &course.Prerequisites, &course.Corequisites} @@ -375,8 +377,8 @@ func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs var checkText string // Extract req text from the enrollment req info if it exists, otherwise try using the description if hasEnrollmentReqs { - course.Enrollment_reqs = enrollmentReqs - checkText = enrollmentReqs + course.Enrollment_reqs = utils.TrimWhitespace(enrollmentReqs.Text()) + checkText = utils.TrimWhitespace(enrollmentReqs.Text()) } else { checkText = course.Description } diff --git a/parser/sectionParser.go b/parser/sectionParser.go index 0a4b768..b056164 100644 --- a/parser/sectionParser.go +++ b/parser/sectionParser.go @@ -1,21 +1,22 @@ package parser import ( - "fmt" "regexp" "strings" "time" + "github.com/PuerkitoBio/goquery" "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" "go.mongodb.org/mongo-driver/bson/primitive" + "golang.org/x/net/html/atom" ) var sectionPrefixRegexp *regexp.Regexp = utils.Regexpf(`^(?i)%s\.(%s)`, utils.R_SUBJ_COURSE, utils.R_SECTION_CODE) var coreRegexp *regexp.Regexp = regexp.MustCompile(`[0-9]{3}`) var personRegexp *regexp.Regexp = regexp.MustCompile(`(.+)・(.+)・(.+)`) -func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, session schema.AcademicSession, rowInfo map[string]string, classInfo map[string]string) { +func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, session 
schema.AcademicSession, rowInfo map[string]*goquery.Selection, classInfo map[string]string) { // Get subject prefix and course number by doing a regexp match on the section id sectionId := classInfo["Class Section:"] idMatches := sectionPrefixRegexp.FindStringSubmatch(sectionId) @@ -34,7 +35,7 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, section.Professors = parseProfessors(section.Id, rowInfo, classInfo) // Get all TA/RA info - assistantText := rowInfo["TA/RA(s):"] + assistantText := utils.TrimWhitespace(rowInfo["TA/RA(s):"].Text()) assistantMatches := personRegexp.FindAllStringSubmatch(assistantText, -1) section.Teaching_assistants = make([]schema.Assistant, 0, len(assistantMatches)) for _, match := range assistantMatches { @@ -50,18 +51,17 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, section.Internal_class_number = classNum section.Instruction_mode = classInfo["Instruction Mode:"] - section.Meetings = getMeetings(rowInfo, classInfo) + section.Meetings = getMeetings(rowInfo) // Parse core flags (may or may not exist) - coreText, hasCore := rowInfo["Core:"] - if hasCore { - section.Core_flags = coreRegexp.FindAllString(coreText, -1) + + if coreText, hasCore := rowInfo["Core:"]; hasCore { + section.Core_flags = coreRegexp.FindAllString(utils.TrimWhitespace(coreText.Text()), -1) } section.Syllabus_uri = syllabusURI - semesterGrades, exists := GradeMap[session.Name] - if exists { + if semesterGrades, ok := GradeMap[session.Name]; ok { // We have to trim leading zeroes from the section number in order to match properly, since the grade data does not use leading zeroes trimmedSectionNumber := strings.TrimLeft(section.Section_number, "0") // Key into grademap should be uppercased like the grade data @@ -79,76 +79,93 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string, courseRef.Sections = append(courseRef.Sections, section.Id) } -var termRegexp *regexp.Regexp = 
utils.Regexpf(`(?i)Term: (%s)`, utils.R_TERM_CODE) -var datesRegexp *regexp.Regexp = utils.Regexpf(`(?:Start|End)s: (%s)`, utils.R_DATE_MDY) - -func getAcademicSession(rowInfo map[string]string) schema.AcademicSession { +func getAcademicSession(rowInfo map[string]*goquery.Selection) schema.AcademicSession { session := schema.AcademicSession{} - scheduleText := rowInfo["Schedule:"] - session.Name = termRegexp.FindStringSubmatch(scheduleText)[1] - dateMatches := datesRegexp.FindAllStringSubmatch(scheduleText, -1) - - datesFound := len(dateMatches) - switch { - case datesFound == 1: - startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation) - if err != nil { - panic(err) - } - session.Start_date = startDate - case datesFound == 2: - startDate, err := time.ParseInLocation("January 2, 2006", dateMatches[0][1], timeLocation) - if err != nil { - panic(err) + infoNodes := rowInfo["Schedule:"].FindMatcher(goquery.Single("p.courseinfo__sectionterm")).Contents().Nodes + for _, node := range infoNodes { + if node.DataAtom == atom.B && node.FirstChild != nil && node.NextSibling != nil { + //since the key is not a TextElement, the Text is stored in its first child, a TextElement; <b> nodes lacking a child or a following sibling are skipped instead of dereferencing nil + key := utils.TrimWhitespace(node.FirstChild.Data) + value := utils.TrimWhitespace(node.NextSibling.Data) + + switch key { + case "Term:": + session.Name = value + case "Starts:": + session.Start_date = parseTimeOrPanic(value) + case "Ends:": + session.End_date = parseTimeOrPanic(value) + } } - endDate, err := time.ParseInLocation("January 2, 2006", dateMatches[1][1], timeLocation) - if err != nil { - panic(err) - } - session.Start_date = startDate - session.End_date = endDate } return session } -var meetingsRegexp *regexp.Regexp = utils.Regexpf(`(%s)-(%s)\W+((?:%s(?:, )?)+)\W+(%s)-(%s)(?:\W+(?:(\S+)\s+(\S+)))`, utils.R_DATE_MDY, utils.R_DATE_MDY, utils.R_WEEKDAY, utils.R_TIME_AM_PM, utils.R_TIME_AM_PM) +var meetingDatesRegexp = regexp.MustCompile(utils.R_DATE_MDY) +var meetingDaysRegexp =
regexp.MustCompile(utils.R_WEEKDAY) +var meetingTimesRegexp = regexp.MustCompile(utils.R_TIME_AM_PM) -func getMeetings(rowInfo map[string]string, classInfo map[string]string) []schema.Meeting { - scheduleText := rowInfo["Schedule:"] - meetingMatches := meetingsRegexp.FindAllStringSubmatch(scheduleText, -1) - var meetings []schema.Meeting = make([]schema.Meeting, 0, len(meetingMatches)) - for _, match := range meetingMatches { - meeting := schema.Meeting{} +func getMeetings(rowInfo map[string]*goquery.Selection) []schema.Meeting { + meetingItems := rowInfo["Schedule:"].Find("div.courseinfo__meeting-item--multiple") + var meetings []schema.Meeting = make([]schema.Meeting, 0, meetingItems.Length()) - startDate, err := time.ParseInLocation("January 2, 2006", match[1], timeLocation) - if err != nil { - panic(err) + meetingItems.Each(func(i int, s *goquery.Selection) { + meeting := schema.Meeting{} + meetingInfo := s.FindMatcher(goquery.Single("p.courseinfo__meeting-time")) + + dates := meetingDatesRegexp.FindAllString(meetingInfo.Text(), -1) + if len(dates) == 2 { + meeting.Start_date = parseTimeOrPanic(dates[0]) + meeting.End_date = parseTimeOrPanic(dates[1]) + } else if len(dates) == 1 { + meeting.Start_date = parseTimeOrPanic(dates[0]) + meeting.End_date = meeting.Start_date } - meeting.Start_date = startDate - endDate, err := time.ParseInLocation("January 2, 2006", match[2], timeLocation) - if err != nil { - panic(err) + days := meetingDaysRegexp.FindAllString(meetingInfo.Text(), -1) + if days != nil { + meeting.Meeting_days = days + } else { + meeting.Meeting_days = []string{} //avoid null in the json } - meeting.End_date = endDate - - meeting.Meeting_days = strings.Split(match[3], ", ") - // Don't parse time into time object, adds unnecessary extra data - meeting.Start_time = match[4] - meeting.End_time = match[5] - - // Only add location data if it's available - if len(match) > 6 { - location := schema.Location{} - location.Building = match[6] - location.Room = 
match[7] - location.Map_uri = fmt.Sprintf("https://locator.utdallas.edu/%s_%s", location.Building, location.Room) - meeting.Location = location + times := meetingTimesRegexp.FindAllString(meetingInfo.Text(), -1) + if len(times) == 2 { + meeting.Start_time = times[0] + meeting.End_time = times[1] + } else if len(times) == 1 { + meeting.Start_time = times[0] + meeting.End_time = meeting.Start_time } + if locationInfo := meetingInfo.FindMatcher(goquery.Single("a")); locationInfo != nil { + mapUri := locationInfo.AttrOr("href", "") + + //only add locations for meetings that have actual data, all meetings have a link some are not visible or empty + if mapUri != "" && mapUri != "https://locator.utdallas.edu/" && mapUri != "https://locator.utdallas.edu/ONLINE" { + splitText := strings.Split(utils.TrimWhitespace(locationInfo.Text()), " ") + + if len(splitText) == 2 { + meeting.Location = schema.Location{ + Building: splitText[0], + Room: splitText[1], + Map_uri: mapUri, + } + } + } + } meetings = append(meetings, meeting) - } + }) return meetings } + +const timeLayout = "January 2, 2006" + +func parseTimeOrPanic(value string) time.Time { + date, err := time.ParseInLocation(timeLayout, value, timeLocation) + if err != nil { + panic(err) + } + return date +}