Skip to content

Commit 9adb01a

Browse files
committed
Fixed validation, cleaned up logging, updated rate limiting to be more stable
1 parent 3026aa7 commit 9adb01a

File tree

2 files changed

+118
-60
lines changed

2 files changed

+118
-60
lines changed

main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ func main() {
3232
scrapeCoursebook := flag.Bool("coursebook", false, "Alongside -scrape, signifies that coursebook should be scraped.")
3333
term := flag.String("term", "", "Alongside -coursebook, specifies the term to scrape, i.e. 23S")
3434
startPrefix := flag.String("startprefix", "", "Alongside -coursebook, specifies the course prefix to start scraping from, i.e. cp_span")
35-
resume := flag.Bool("resume", false, "Alongside -scrape, signifies that coursebook should be scraped.")
35+
resume := flag.Bool("resume", false, "Alongside -coursebook, signifies that scraping should begin at the last complete prefix and should not re-scrape existing data")
3636

3737
// Flag for profile scraping
3838
scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.")

scrapers/coursebook.go

Lines changed: 117 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,12 @@ import (
1010
"fmt"
1111
"io"
1212
"log"
13+
"math"
14+
"net"
1315
"net/http"
1416
"os"
1517
"path/filepath"
18+
"regexp"
1619
"slices"
1720
"sort"
1821
"strings"
@@ -21,29 +24,41 @@ import (
2124
"github.com/UTDNebula/api-tools/utils"
2225
)
2326

27+
var (
28+
prefixRegex = regexp.MustCompile("cp_[a-z]{0,5}")
29+
termRegex = regexp.MustCompile("[0-9]{1,2}[sfu]")
30+
)
31+
32+
const reqThrottle = 400 * time.Millisecond
33+
const prefixThrottle = 5 * time.Second
34+
2435
// ScrapeCoursebook scrapes utd coursebook for the provided term (semester)
2536
// if resume is true and no startPrefix is given, scraping resumes from the last complete prefix
2637
func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) {
38+
if startPrefix != "" && !prefixRegex.MatchString(startPrefix) {
39+
log.Fatalf("Invalid starting prefix %s, must match format cp_{abcde}", startPrefix)
40+
}
41+
if !termRegex.MatchString(term) {
42+
log.Fatalf("Invalid term %s, must match format {00-99}{s/f/u}", term)
43+
}
44+
2745
scraper := newCoursebookScraper(term, outDir)
2846
defer scraper.cancel()
2947

30-
if startPrefix == "" {
48+
if resume && startPrefix == "" {
3149
// providing a starting prefix overrides the resume flag
3250
startPrefix = scraper.lastCompletePrefix()
3351
}
3452

35-
skipped := 0
36-
for _, prefix := range scraper.prefixes {
37-
if startPrefix == "" || strings.Compare(prefix, startPrefix) == -1 {
38-
utils.VPrintf("Skipping prefix %s", prefix)
39-
continue
40-
}
53+
log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", term, len(scraper.prefixes))
4154

42-
if skipped != -1 {
43-
log.Printf("Skipped %d prefixes", skipped)
44-
skipped = -1
55+
totalTime := time.Now()
56+
for i, prefix := range scraper.prefixes {
57+
if startPrefix != "" && strings.Compare(prefix, startPrefix) < 0 {
58+
continue
4559
}
4660

61+
start := time.Now()
4762
if err := scraper.ensurePrefixFolder(prefix); err != nil {
4863
log.Fatal(err)
4964
}
@@ -57,10 +72,18 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
5772
} else {
5873
sectionIds, err = scraper.getSectionIdsForPrefix(prefix)
5974
}
75+
6076
if err != nil {
61-
log.Fatal("Error getting section ids for prefix ", prefix)
77+
log.Fatalf("Error getting section ids for %s", prefix)
6278
}
6379

80+
if len(sectionIds) == 0 {
81+
log.Printf("No sections found for %s ", prefix)
82+
continue
83+
}
84+
85+
log.Printf("[Scrape Prefix] %s (%d/%d): Found %d sections to scrape.", prefix, i+1, len(scraper.prefixes), len(sectionIds))
86+
6487
for _, sectionId := range sectionIds {
6588
content, err := scraper.getSectionContent(sectionId)
6689
if err != nil {
@@ -69,21 +92,17 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
6992
if err := scraper.writeSection(prefix, sectionId, content); err != nil {
7093
log.Fatalf("Error writing section %s: %v", sectionId, err)
7194
}
72-
utils.VPrintf("Got section: %s", sectionId)
73-
74-
// wait 3 seconds between requests to avoid rate limiting
75-
time.Sleep(3 * time.Second)
95+
time.Sleep(reqThrottle)
7696
}
7797

98+
// At the end of the prefix loop
99+
log.Printf("[End Prefix] %s: Scraped %d sections in %v.", prefix, len(sectionIds), time.Since(start))
100+
time.Sleep(prefixThrottle)
78101
}
79-
log.Print("Done scraping term!")
80-
log.Print("Validating... ")
102+
log.Printf("[Scrape Complete] Finished scraping term %s in %v. Total sections %d: Total retries %d", term, time.Since(totalTime), scraper.totalScrapedSections, scraper.reqRetries)
81103

82-
success, err := scraper.validate()
83-
if err != nil {
104+
if err := scraper.validate(); err != nil {
84105
log.Fatal("Validating failed: ", err)
85-
} else if success {
86-
log.Print("Validating successful!")
87106
}
88107
}
89108

@@ -95,11 +114,20 @@ type coursebookScraper struct {
95114
prefixes []string
96115
term string
97116
outDir string
117+
118+
prefixIdsCache map[string][]string
119+
120+
//metrics
121+
reqRetries int
122+
totalScrapedSections int
98123
}
99124

100125
func newCoursebookScraper(term string, outDir string) *coursebookScraper {
101126
ctx, cancel := utils.InitChromeDp()
102-
httpClient := &http.Client{}
127+
httpClient := &http.Client{
128+
// longer than 10 seconds is probably a rate limit
129+
Timeout: 10 * time.Second,
130+
}
103131

104132
//prefixes in alphabetical order for skip prefix flag
105133
prefixes := utils.GetCoursePrefixes(ctx)
@@ -112,13 +140,18 @@ func newCoursebookScraper(term string, outDir string) *coursebookScraper {
112140
coursebookHeaders: utils.RefreshToken(ctx),
113141
term: term,
114142
outDir: outDir,
143+
prefixIdsCache: make(map[string][]string),
115144
}
116145
}
117146

118147
// lastCompletePrefix returns the last prefix (alphabetical order) that contains
119148
// html files for all of its section ids. returns an empty string if there are no
120149
// complete prefixes
121150
func (s *coursebookScraper) lastCompletePrefix() string {
151+
if err := s.ensureOutputFolder(); err != nil {
152+
log.Fatal(err)
153+
}
154+
122155
dir, err := os.ReadDir(filepath.Join(s.outDir, s.term))
123156
if err != nil {
124157
log.Fatalf("failed to read output directory: %v", err)
@@ -140,10 +173,20 @@ func (s *coursebookScraper) lastCompletePrefix() string {
140173
if len(missing) == 0 {
141174
return prefix
142175
}
176+
time.Sleep(reqThrottle)
143177
}
144178
return ""
145179
}
146180

181+
// ensureOutputFolder creates {outDir}/term if it does not exist
182+
183+
func (s *coursebookScraper) ensureOutputFolder() error {
184+
if err := os.MkdirAll(filepath.Join(s.outDir, s.term), 0755); err != nil {
185+
return fmt.Errorf("failed to create term folder: %w", err)
186+
}
187+
return nil
188+
}
189+
147190
// ensurePrefixFolder creates {outDir}/term/prefix if it does not exist
148191
func (s *coursebookScraper) ensurePrefixFolder(prefix string) error {
149192
if err := os.MkdirAll(filepath.Join(s.outDir, s.term, prefix), 0755); err != nil {
@@ -164,10 +207,11 @@ func (s *coursebookScraper) writeSection(prefix string, id string, content strin
164207
// retries up to 3 times, each time refreshing the token and waiting longer
165208
func (s *coursebookScraper) getSectionContent(id string) (string, error) {
166209
queryStr := fmt.Sprintf("id=%s&req=b30da8ab21637dbef35fd7682f48e1c1W0ypMhaj%%2FdsnYn3Wa03BrxSNgCeyvLfvucSTobcSXRf38SWaUaNfMjJQn%%2BdcabF%%2F7ZuG%%2BdKqHAqmrxEKyg8AdB0FqVGcz4rkff3%%2B3SIUIt8%%3D&action=info", id)
167-
response, err := s.req(queryStr, 3, fmt.Sprintf("section %s content", id))
210+
response, err := s.req(queryStr, 3, id)
168211
if err != nil {
169212
return "", fmt.Errorf("get section content for id %s failed: %w", id, err)
170213
}
214+
s.totalScrapedSections++
171215
return response, nil
172216
}
173217

@@ -203,33 +247,34 @@ func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, err
203247
for _, id := range sectionIds {
204248
if !foundIds[id] {
205249
filteredIds = append(filteredIds, id)
206-
} else {
207-
utils.VPrintf("Found section: %s", id)
208-
209250
}
210251
}
211252

212253
return filteredIds, nil
213254
}
214255

215256
// getSectionIdsForPrefix calls internal coursebook api to get all section ids for the provide prefix
216-
// retries up to 10 times, each time refreshing the token and waiting longer
257+
// retries up to 10 times, each time refreshing the token and waiting longer.
217258
func (s *coursebookScraper) getSectionIdsForPrefix(prefix string) ([]string, error) {
259+
if ids, ok := s.prefixIdsCache[prefix]; ok {
260+
return ids, nil
261+
}
262+
218263
sections := make([]string, 0, 100)
219264
for _, clevel := range []string{"clevel_u", "clevel_g"} {
220265
queryStr := fmt.Sprintf("action=search&s%%5B%%5D=term_%s&s%%5B%%5D=%s&s%%5B%%5D=%s", s.term, prefix, clevel)
221-
content, err := s.req(queryStr, 10, fmt.Sprintf("sections for %s", prefix))
266+
content, err := s.req(queryStr, 10, prefix)
222267
if err != nil {
223-
return []string{}, fmt.Errorf("failed to fetch sections: %s", err)
268+
return nil, fmt.Errorf("failed to fetch sections: %s", err)
224269
}
225-
226270
sectionRegexp := utils.Regexpf(`View details for section (%s%s\.\w+\.%s)`, prefix[3:], utils.R_COURSE_CODE, utils.R_TERM_CODE)
227271
smatches := sectionRegexp.FindAllStringSubmatch(content, -1)
228272
for _, match := range smatches {
229273
sections = append(sections, match[1])
230274
}
231275
}
232-
log.Printf("Found %d sections for %s", len(sections), prefix)
276+
277+
s.prefixIdsCache[prefix] = sections
233278
return sections, nil
234279
}
235280

@@ -242,17 +287,27 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
242287
log.Fatalf("Http request failed: %v", err)
243288
}
244289
req.Header = s.coursebookHeaders
290+
291+
start := time.Now()
245292
res, err = s.httpClient.Do(req)
293+
dur := time.Since(start)
294+
246295
if res != nil && res.StatusCode != 200 {
296+
if netErr, ok := err.(net.Error); ok && netErr.Timeout() {
297+
utils.VPrintf("[Timeout] Request for [%s] timed out", reqName)
298+
return netErr // Return the error to trigger a retry
299+
}
300+
247301
return errors.New("non-200 response status code")
248302
}
303+
304+
utils.VPrintf("[Success] Request for [%s] took %v", reqName, dur)
249305
return err
250306
}, retries, func(numRetries int) {
251-
log.Printf("Failed to get %s, Retry %d of %d", reqName, numRetries, retries)
307+
utils.VPrintf("[Retry] Attempt %d of %d for request [%s]", numRetries, retries, reqName)
252308
s.coursebookHeaders = utils.RefreshToken(s.chromedpCtx)
253-
254-
// front load delay since if the first one fails it is likely the next few will as well
255-
time.Sleep((8 * time.Second) + (500 * time.Millisecond * time.Duration(numRetries)))
309+
s.reqRetries++
310+
time.Sleep(time.Duration(math.Pow(2, float64(numRetries))) * time.Second)
256311
})
257312
if err != nil {
258313
return "", err
@@ -276,38 +331,41 @@ func (s *coursebookScraper) cancel() {
276331
}
277332

278333
// validate returns nil if each prefix contains all required ids
279-
func (s *coursebookScraper) validate() (bool, error) {
280-
missing := make(map[string][]string)
334+
// if it does not it will re-scrape all missing sections
335+
func (s *coursebookScraper) validate() error {
336+
log.Printf("[Begin Validation] Starting Validation for term %s", s.term)
281337

282-
count := 0
283338
for _, prefix := range s.prefixes {
284339
ids, err := s.getMissingIdsForPrefix(prefix)
285340
if err != nil {
286-
return false, err
341+
return err
342+
}
343+
if len(ids) == 0 {
344+
log.Printf("[Validation] %s is correct", prefix)
345+
continue
287346
}
288-
if len(ids) > 0 {
289-
count++
347+
348+
log.Printf("[Validation] Missing %d sections for %s", len(ids), prefix)
349+
350+
if err := s.ensurePrefixFolder(prefix); err != nil {
351+
log.Fatal(err)
290352
}
291-
missing[prefix] = ids
292-
time.Sleep(5 * time.Second)
293-
}
294353

295-
for prefix, ids := range missing {
296-
if len(ids) > 0 {
297-
log.Printf("Missing %d sections for prefix: %s", len(ids), prefix)
298-
299-
for _, id := range ids {
300-
content, err := s.getSectionContent(id)
301-
if err != nil {
302-
return false, fmt.Errorf("error getting section content for section %s: %v", id, err)
303-
}
304-
if err := s.writeSection(prefix, id, content); err != nil {
305-
return false, fmt.Errorf("error writing section %s: %v", id, err)
306-
}
307-
utils.VPrintf("Got section: %s", id)
308-
time.Sleep(3 * time.Second)
354+
for _, id := range ids {
355+
content, err := s.getSectionContent(id)
356+
if err != nil {
357+
return fmt.Errorf("error getting section content for section %s: %v", id, err)
309358
}
359+
if err := s.writeSection(prefix, id, content); err != nil {
360+
return fmt.Errorf("error writing section %s: %v", id, err)
361+
}
362+
time.Sleep(3 * time.Second)
310363
}
364+
365+
log.Printf("[Validation] %s is correct", prefix)
366+
time.Sleep(5 * time.Second)
311367
}
312-
return true, nil
368+
369+
log.Print("[End Validation] Validation Successful")
370+
return nil
313371
}

0 commit comments

Comments
 (0)