diff --git a/main.go b/main.go
index 5df58b7..6680c8b 100644
--- a/main.go
+++ b/main.go
@@ -32,6 +32,7 @@ func main() {
 	scrapeCoursebook := flag.Bool("coursebook", false, "Alongside -scrape, signifies that coursebook should be scraped.")
 	term := flag.String("term", "", "Alongside -coursebook, specifies the term to scrape, i.e. 23S")
 	startPrefix := flag.String("startprefix", "", "Alongside -coursebook, specifies the course prefix to start scraping from, i.e. cp_span")
+	resume := flag.Bool("resume", false, "Alongside -coursebook, signifies that scraping should begin at the last complete prefix and should not re-scrape existing data")
 
 	// Flag for profile scraping
 	scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.")
@@ -102,7 +103,7 @@ func main() {
 		if *term == "" {
 			log.Panic("No term specified for coursebook scraping! Use -term to specify.")
 		}
-		scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir)
+		scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume)
 	case *scrapeOrganizations:
 		scrapers.ScrapeOrganizations(*outDir)
 	case *scrapeCalendar:
diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go
index e9c3c25..4f6119c 100644
--- a/scrapers/coursebook.go
+++ b/scrapers/coursebook.go
@@ -5,179 +5,364 @@ package scrapers
 
 import (
-	"bytes"
+	"context"
 	"errors"
 	"fmt"
+	"io"
 	"log"
+	"math"
+	"net"
 	"net/http"
 	"os"
+	"path/filepath"
+	"regexp"
+	"slices"
+	"sort"
 	"strings"
 	"time"
 
 	"github.com/UTDNebula/api-tools/utils"
 )
 
-func ScrapeCoursebook(term string, startPrefix string, outDir string) {
-	// Start chromedp
-	chromedpCtx, cancel := utils.InitChromeDp()
-	defer cancel()
+var (
+	prefixRegex = regexp.MustCompile("cp_[a-z]{0,5}")
+	termRegex   = regexp.MustCompile("[0-9]{1,2}[sfu]")
+)
 
-	coursePrefixes := utils.GetCoursePrefixes(chromedpCtx)
+const (
+	reqThrottle    = 400 * time.Millisecond
+	prefixThrottle = 5 * time.Second
+	httpTimeout    = 10 * time.Second
+)
 
-	// Find index of starting prefix, if one has been given
-	startPrefixIndex := 0
-	if startPrefix != "" && startPrefix != coursePrefixes[0] {
-		for i, prefix := range coursePrefixes {
-			if prefix == startPrefix {
-				startPrefixIndex = i
-				break
-			}
-		}
-		if startPrefixIndex == 0 {
-			log.Panic("Failed to find provided course prefix! Remember, the format is cp_!")
-		}
+// ScrapeCoursebook scrapes UTD Coursebook for the provided term (semester)
+func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) {
+	if startPrefix != "" && !prefixRegex.MatchString(startPrefix) {
+		log.Fatalf("Invalid starting prefix %s, must match format cp_{abcde}", startPrefix)
+	}
+	if !termRegex.MatchString(term) {
+		log.Fatalf("Invalid term %s, must match format {00-99}{s/f/u}", term)
 	}
 
-	// Init http client
-	cli := &http.Client{}
+	scraper := newCoursebookScraper(term, outDir)
+	defer scraper.chromedpCancel()
 
-	// Make the output directory for this term
-	termDir := fmt.Sprintf("%s/%s", outDir, term)
-	if err := os.MkdirAll(termDir, 0777); err != nil {
-		panic(err)
+	if resume && startPrefix == "" {
+		// providing a starting prefix overrides the resume flag
+		startPrefix = scraper.lastCompletePrefix()
 	}
 
-	// Keep track of how many total sections we've scraped
-	totalSections := 0
+	log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", term, len(scraper.prefixes))
 
-	// Scrape all sections for each course prefix
-	for prefixIndex, coursePrefix := range coursePrefixes {
+	totalTime := time.Now()
+	for i, prefix := range scraper.prefixes {
+		if startPrefix != "" && strings.Compare(prefix, startPrefix) < 0 {
+			continue
+		}
+
+		start := time.Now()
+		if err := scraper.ensurePrefixFolder(prefix); err != nil {
+			log.Fatal(err)
+		}
+
+		var sectionIds []string
+		var err error
+
+		// if resume, skip existing entries; otherwise overwrite them
+		if resume {
+			sectionIds, err = scraper.getMissingIdsForPrefix(prefix)
+		} else {
+			sectionIds, err = scraper.getSectionIdsForPrefix(prefix)
+		}
 
-		// Skip to startPrefixIndex
-		if prefixIndex < startPrefixIndex {
+		if err != nil {
+			log.Fatalf("Error getting section ids for %s: %v", prefix, err)
+		}
+
+		if len(sectionIds) == 0 {
+			log.Printf("No sections found for %s", prefix)
 			continue
 		}
 
-		// Make a directory in the output for this course prefix
-		courseDir := fmt.Sprintf("%s/%s", termDir, coursePrefix)
-		if err := os.MkdirAll(courseDir, 0777); err != nil {
-			panic(err)
-		}
-		// Get a fresh token at the start of each new prefix because we can lol
-		coursebookHeaders := utils.RefreshToken(chromedpCtx)
-		// Give coursebook some time to recognize the new token
-		time.Sleep(500 * time.Millisecond)
-		// String builder to store accumulated course HTML data for both class levels
-		courseBuilder := strings.Builder{}
-
-		log.Printf("Finding sections for %s...", coursePrefix)
-
-		// Get courses for term and prefix, split by grad and undergrad to avoid 300 section cap
-		for _, clevel := range []string{"clevel_u", "clevel_g"} {
-			queryStr := fmt.Sprintf("action=search&s%%5B%%5D=term_%s&s%%5B%%5D=%s&s%%5B%%5D=%s", term, coursePrefix, clevel)
-
-			// Try HTTP request, retrying if necessary
-			var res *http.Response
-			err := utils.Retry(func() error {
-				req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
-				if err != nil {
-					panic(err)
-				}
-				req.Header = coursebookHeaders
-				res, err = cli.Do(req)
-				if res != nil && res.StatusCode != 200 {
-					return errors.New("Non-200 response status code")
-				}
-				return err
-			}, 10, func(numRetries int) {
-				log.Printf("WARN: Section find for %s failed! Performing retry #%d...", coursePrefix, numRetries)
-				coursebookHeaders = utils.RefreshToken(chromedpCtx)
-				// Wait proportionally long to how many times we've retried; generally works pretty well
-				time.Sleep(500 * time.Millisecond * time.Duration(numRetries))
-			})
+		log.Printf("[Scrape Prefix] %s (%d/%d): Found %d sections to scrape.", prefix, i+1, len(scraper.prefixes), len(sectionIds))
+		for _, sectionId := range sectionIds {
+			content, err := scraper.getSectionContent(sectionId)
 			if err != nil {
-				panic(err)
+				log.Fatalf("Error getting section content for section %s: %v", sectionId, err)
+			}
+			if err := scraper.writeSection(prefix, sectionId, content); err != nil {
+				log.Fatalf("Error writing section %s: %v", sectionId, err)
 			}
+			time.Sleep(reqThrottle)
+		}
 
-			buf := bytes.Buffer{}
-			buf.ReadFrom(res.Body)
-			courseBuilder.Write(buf.Bytes())
-		}
-		// Find all section IDs in returned data
-		sectionRegexp := utils.Regexpf(`View details for section (%s%s\.\w+\.%s)`, coursePrefix[3:], utils.R_COURSE_CODE, utils.R_TERM_CODE)
-		smatches := sectionRegexp.FindAllStringSubmatch(courseBuilder.String(), -1)
-		sectionIDs := make([]string, 0, len(smatches))
-		for _, matchSet := range smatches {
-			sectionIDs = append(sectionIDs, matchSet[1])
-		}
-		log.Printf("Found %d sections for %s", len(sectionIDs), coursePrefix)
-
-		// Get a new token before starting the section lookup
-		coursebookHeaders = utils.RefreshToken(chromedpCtx)
-		// Give coursebook some time to recognize the new token
-		time.Sleep(500 * time.Millisecond)
-
-		// Get HTML data for all section IDs
-		sectionsInCoursePrefix := 0
-		for sectionIndex, id := range sectionIDs {
-
-			// Get section info
-			// Worth noting that the "req" param in the request below doesn't actually seem to matter... consider it filler to make sure the request goes through
-			queryStr := fmt.Sprintf("id=%s&req=b30da8ab21637dbef35fd7682f48e1c1W0ypMhaj%%2FdsnYn3Wa03BrxSNgCeyvLfvucSTobcSXRf38SWaUaNfMjJQn%%2BdcabF%%2F7ZuG%%2BdKqHAqmrxEKyg8AdB0FqVGcz4rkff3%%2B3SIUIt8%%3D&action=info", id)
-
-			// Try HTTP request, retrying if necessary
-			var res *http.Response
-			err := utils.Retry(func() error {
-				req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
-				if err != nil {
-					panic(err)
-				}
-				req.Header = coursebookHeaders
-				res, err = cli.Do(req)
-				if res != nil && res.StatusCode != 200 {
-					return errors.New("Non-200 response status code")
-				}
-				return err
-			}, 10, func(numRetries int) {
-				log.Printf("WARN: Section id lookup for %s failed! Performing retry #%d...", id, numRetries)
-				coursebookHeaders = utils.RefreshToken(chromedpCtx)
-				// Wait proportionally long to how many times we've retried; generally works pretty well
-				time.Sleep(500 * time.Millisecond * time.Duration(numRetries))
-			})
+		// At the end of the prefix loop
+		log.Printf("[End Prefix] %s: Scraped %d sections in %v.", prefix, len(sectionIds), time.Since(start))
+		time.Sleep(prefixThrottle)
+	}
+	log.Printf("[Scrape Complete] Finished scraping term %s in %v. Total sections: %d, total retries: %d", term, time.Since(totalTime), scraper.totalScrapedSections, scraper.reqRetries)
+	if err := scraper.validate(); err != nil {
+		log.Fatal("Validating failed: ", err)
+	}
+}
+
+type coursebookScraper struct {
+	chromedpCtx       context.Context
+	chromedpCancel    context.CancelFunc
+	httpClient        *http.Client
+	coursebookHeaders map[string][]string
+	prefixes          []string
+	term              string
+	outDir            string
+
+	prefixIdsCache map[string][]string
+
+	//metrics
+	reqRetries           int
+	totalScrapedSections int
+}
+
+func newCoursebookScraper(term string, outDir string) *coursebookScraper {
+	ctx, cancel := utils.InitChromeDp()
+	httpClient := &http.Client{
+		Timeout: httpTimeout,
+	}
+
+	//prefixes in alphabetical order for skip prefix flag
+	prefixes := utils.GetCoursePrefixes(ctx)
+	sort.Strings(prefixes)
+	return &coursebookScraper{
+		chromedpCtx:       ctx,
+		chromedpCancel:    cancel,
+		httpClient:        httpClient,
+		prefixes:          prefixes,
+		coursebookHeaders: utils.RefreshToken(ctx),
+		term:              term,
+		outDir:            outDir,
+		prefixIdsCache:    make(map[string][]string),
+	}
+}
+
+// lastCompletePrefix returns the last prefix (alphabetical order) that contains
+// html files for all of its section ids. returns an empty string if there are no
+// complete prefixes
+func (s *coursebookScraper) lastCompletePrefix() string {
+	if err := s.ensureOutputFolder(); err != nil {
+		log.Fatal(err)
+	}
+
+	dir, err := os.ReadDir(filepath.Join(s.outDir, s.term))
+	if err != nil {
+		log.Fatalf("failed to read output directory: %v", err)
+	}
+
+	foundPrefixes := make([]string, 0, len(s.prefixes))
+	for _, file := range dir {
+		foundPrefixes = append(foundPrefixes, file.Name())
+	}
+
+	sort.Strings(foundPrefixes)
+	slices.Reverse(foundPrefixes)
+
+	for _, prefix := range foundPrefixes {
+		missing, err := s.getMissingIdsForPrefix(prefix)
+		if err != nil {
+			log.Fatalf("Failed to get ids: %v", err)
+		}
+		if len(missing) == 0 {
+			return prefix
+		}
+		time.Sleep(reqThrottle)
+	}
+	return ""
+}
+
+// ensureOutputFolder creates {outDir}/term if it does not exist
+func (s *coursebookScraper) ensureOutputFolder() error {
+	if err := os.MkdirAll(filepath.Join(s.outDir, s.term), 0755); err != nil {
+		return fmt.Errorf("failed to create term folder: %w", err)
+	}
+	return nil
+}
+
+// ensurePrefixFolder creates {outDir}/term/prefix if it does not exist
+func (s *coursebookScraper) ensurePrefixFolder(prefix string) error {
+	if err := os.MkdirAll(filepath.Join(s.outDir, s.term, prefix), 0755); err != nil {
+		return fmt.Errorf("failed to create folder for %s: %w", prefix, err)
+	}
+	return nil
+}
+
+// writeSection writes content to file {outDir}/term/prefix/{id}.html
+func (s *coursebookScraper) writeSection(prefix string, id string, content string) error {
+	if err := os.WriteFile(filepath.Join(s.outDir, s.term, prefix, id+".html"), []byte(content), 0644); err != nil {
+		return fmt.Errorf("failed to write section %s: %w", id, err)
+	}
+	return nil
+}
+
+// getSectionContent calls internal coursebook api to get the html for the provided id
+// retries up to 3 times, each time refreshing the token and waiting longer
+func (s *coursebookScraper) getSectionContent(id string) (string, error) {
+	queryStr := fmt.Sprintf("id=%s&req=b30da8ab21637dbef35fd7682f48e1c1W0ypMhaj%%2FdsnYn3Wa03BrxSNgCeyvLfvucSTobcSXRf38SWaUaNfMjJQn%%2BdcabF%%2F7ZuG%%2BdKqHAqmrxEKyg8AdB0FqVGcz4rkff3%%2B3SIUIt8%%3D&action=info", id)
+	response, err := s.req(queryStr, 3, id)
+	if err != nil {
+		return "", fmt.Errorf("get section content for id %s failed: %w", id, err)
+	}
+	s.totalScrapedSections++
+	return response, nil
+}
+
+// getMissingIdsForPrefix calls getSectionIdsForPrefix and filters out the ids that already
+// exist in the prefix directory
+func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, error) {
+	path := filepath.Join(s.outDir, s.term, prefix)
+
+	sectionIds, err := s.getSectionIdsForPrefix(prefix)
+	if err != nil {
+		return sectionIds, err
+	}
+
+	if _, err := os.Stat(path); err != nil {
+		if os.IsNotExist(err) {
+			return sectionIds, nil
+		}
+		return sectionIds, fmt.Errorf("failed to access folder %s: %w", path, err)
+	}
+
+	dir, err := os.ReadDir(path)
+	if err != nil {
+		log.Panicf("Failed to access folder %s: %v", path, err)
+	}
+
+	foundIds := make(map[string]bool)
+	for _, file := range dir {
+		id := strings.TrimSuffix(file.Name(), ".html")
+		foundIds[id] = true
+	}
+
+	var filteredIds []string
+	for _, id := range sectionIds {
+		if !foundIds[id] {
+			filteredIds = append(filteredIds, id)
+		}
+	}
+
+	return filteredIds, nil
+}
+
+// getSectionIdsForPrefix calls internal coursebook api to get all section ids for the provided prefix
+// retries up to 10 times, each time refreshing the token and waiting longer.
+func (s *coursebookScraper) getSectionIdsForPrefix(prefix string) ([]string, error) {
+	if ids, ok := s.prefixIdsCache[prefix]; ok {
+		return ids, nil
+	}
+
+	sections := make([]string, 0, 100)
+	for _, clevel := range []string{"clevel_u", "clevel_g"} {
+		queryStr := fmt.Sprintf("action=search&s%%5B%%5D=term_%s&s%%5B%%5D=%s&s%%5B%%5D=%s", s.term, prefix, clevel)
+		content, err := s.req(queryStr, 10, fmt.Sprintf("%s:%s", prefix, clevel))
+		if err != nil {
+			return nil, fmt.Errorf("failed to fetch sections: %s", err)
+		}
+		sectionRegexp := utils.Regexpf(`View details for section (%s%s\.\w+\.%s)`, prefix[3:], utils.R_COURSE_CODE, utils.R_TERM_CODE)
+		matches := sectionRegexp.FindAllStringSubmatch(content, -1)
+		for _, match := range matches {
+			sections = append(sections, match[1])
+		}
+	}
+
+	s.prefixIdsCache[prefix] = sections
+	return sections, nil
+}
+
+// req is a utility function for calling the coursebook api
+func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (string, error) {
+	var res *http.Response
+	err := utils.Retry(func() error {
+		req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
+		if err != nil {
+			log.Fatalf("Http request failed: %v", err)
+		}
+		req.Header = s.coursebookHeaders
+
+		start := time.Now()
+		res, err = s.httpClient.Do(req)
+		dur := time.Since(start)
+
+		if res != nil {
+			if res.StatusCode != 200 {
+				return errors.New("non-200 response status code")
 			}
+			utils.VPrintf("[Request Success] Request for [%s] took %v", reqName, dur)
+		} else if err != nil {
+			var netErr net.Error
+			if errors.As(err, &netErr) && netErr.Timeout() {
+				utils.VPrintf("[Timeout] Request for [%s] timed out", reqName)
+			} else {
+				utils.VPrintf("[Request Error] Request for %s failed: %v", reqName, err)
+			}
+		}
+
+		return err
+	}, retries, func(numRetries int) {
+		utils.VPrintf("[Request Retry] Attempt %d of %d for request %s", numRetries, retries, reqName)
+		s.coursebookHeaders = utils.RefreshToken(s.chromedpCtx)
+		s.reqRetries++
 
-			fptr, err := os.Create(fmt.Sprintf("%s/%s.html", courseDir, id))
+		//back off exponentially
+		time.Sleep(time.Duration(math.Pow(2, float64(numRetries))) * time.Second)
+	})
+	if err != nil {
+		return "", err
+	}
+	defer res.Body.Close()
+	content, err := io.ReadAll(res.Body)
+	if err != nil {
+		return "", fmt.Errorf("failed to read response body: %s", err)
+	}
+	return string(content), nil
+}
+
+// validate checks that each prefix contains all of its required section ids;
+// any sections that are missing are re-scraped and written before returning
+func (s *coursebookScraper) validate() error {
+	log.Printf("[Begin Validation] Starting Validation for term %s", s.term)
+
+	for _, prefix := range s.prefixes {
+		ids, err := s.getMissingIdsForPrefix(prefix)
+		if err != nil {
+			return err
+		}
+		if len(ids) == 0 {
+			log.Printf("[Validation] %s is correct", prefix)
+			continue
+		}
+
+		log.Printf("[Validation] Missing %d sections for %s", len(ids), prefix)
+
+		if err := s.ensurePrefixFolder(prefix); err != nil {
+			log.Fatal(err)
+		}
+
+		for _, id := range ids {
+			content, err := s.getSectionContent(id)
 			if err != nil {
-				panic(err)
+				return fmt.Errorf("error getting section content for section %s: %v", id, err)
 			}
-			buf := bytes.Buffer{}
-			buf.ReadFrom(res.Body)
-			if _, err := fptr.Write(buf.Bytes()); err != nil {
-				panic(err)
+			if err := s.writeSection(prefix, id, content); err != nil {
+				return fmt.Errorf("error writing section %s: %v", id, err)
 			}
-			fptr.Close()
-
-			// Report success, refresh token periodically
-			utils.VPrintf("Got section: %s", id)
-			if sectionIndex%30 == 0 && sectionIndex != 0 {
-				// Ratelimit? What ratelimit?
-				coursebookHeaders = utils.RefreshToken(chromedpCtx)
-				// Give coursebook some time to recognize the new token
-				time.Sleep(500 * time.Millisecond)
-			}
-			sectionsInCoursePrefix++
-		}
-		log.Printf("Finished scraping %s. Got %d sections.\n----------------------------------------------------", coursePrefix, sectionsInCoursePrefix)
-		// Panic if we got fewer sections than we should've
-		if sectionsInCoursePrefix != len(sectionIDs) {
-			log.Panicf("Section count mismatch! Expected %d sections for %s, got %d", sectionsInCoursePrefix, coursePrefix, sectionsInCoursePrefix)
+			time.Sleep(reqThrottle)
 		}
-		totalSections += sectionsInCoursePrefix
+
+		log.Printf("[Validation] %s is correct", prefix)
+		time.Sleep(prefixThrottle)
 	}
-	log.Printf("Done scraping term! Scraped %d sections.", totalSections)
+	log.Print("[End Validation] Validation Successful")
+	return nil
 }