Skip to content

Commit 199c1ba

Browse files
committed
Improve coursebook scraping hang prevention
1 parent 423a7df commit 199c1ba

File tree

2 files changed

+26
-28
lines changed

2 files changed

+26
-28
lines changed

scrapers/coursebook.go

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package scrapers
66

77
import (
88
"bytes"
9+
"errors"
910
"fmt"
1011
"log"
1112
"net/http"
@@ -84,20 +85,15 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
8485
}
8586
req.Header = coursebookHeaders
8687
res, err = cli.Do(req)
88+
if res.StatusCode != 200 {
89+
return errors.New("Non-200 response status code")
90+
}
8791
return err
8892
}, 10, func(numRetries int) {
89-
log.Printf("ERROR: Section find for course prefix %s failed! Response code was: %s", coursePrefix, res.Status)
90-
// Wait longer if 3 retries fail; we've probably been IP ratelimited...
91-
if numRetries >= 3 {
92-
log.Printf("WARNING: More than 3 retries have failed. Waiting for 5 minutes before attempting further retries.")
93-
time.Sleep(5 * time.Minute)
94-
} else {
95-
log.Printf("Getting new token and retrying in 3 seconds...")
96-
time.Sleep(3 * time.Second)
97-
}
93+
log.Printf("WARN: Section find for course prefix %s failed! Response code was: %s", coursePrefix, res.Status)
9894
coursebookHeaders = utils.RefreshToken(chromedpCtx)
99-
// Give coursebook some time to recognize the new token
100-
time.Sleep(500 * time.Millisecond)
95+
// Wait proportionally long to how many times we've retried; generally works pretty well
96+
time.Sleep(500 * time.Millisecond * time.Duration(numRetries))
10197
})
10298

10399
if err != nil {
@@ -128,7 +124,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
128124

129125
// Get section info
130126
// Worth noting that the "req" and "div" params in the request below don't actually seem to matter... consider them filler to make sure the request goes through
131-
queryStr := fmt.Sprintf("id=%s&req=0bd73666091d3d1da057c5eeb6ef20a7df3CTp0iTMYFuu9paDeUptMzLYUiW4BIk9i8LIFcBahX2E2b18WWXkUUJ1Y7Xq6j3WZAKPbREfGX7lZY96lI7btfpVS95YAprdJHX9dc5wM=&action=section&div=r-62childcontent", id)
127+
queryStr := fmt.Sprintf("id=%s&req=b30da8ab21637dbef35fd7682f48e1c1W0ypMhaj%%2FdsnYn3Wa03BrxSNgCeyvLfvucSTobcSXRf38SWaUaNfMjJQn%%2BdcabF%%2F7ZuG%%2BdKqHAqmrxEKyg8AdB0FqVGcz4rkff3%%2B3SIUIt8%%3D&action=info", id)
132128

133129
// Try HTTP request, retrying if necessary
134130
var res *http.Response
@@ -139,20 +135,15 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
139135
}
140136
req.Header = coursebookHeaders
141137
res, err = cli.Do(req)
138+
if res.StatusCode != 200 {
139+
return errors.New("Non-200 response status code")
140+
}
142141
return err
143142
}, 10, func(numRetries int) {
144-
log.Printf("ERROR: Section id lookup for id %s failed! Response code was: %s", id, res.Status)
145-
// Wait longer if 3 retries fail; we've probably been IP ratelimited...
146-
if numRetries >= 3 {
147-
log.Printf("WARNING: More than 3 retries have failed. Waiting for 5 minutes before attempting further retries.")
148-
time.Sleep(5 * time.Minute)
149-
} else {
150-
log.Printf("Getting new token and retrying in 3 seconds...")
151-
time.Sleep(3 * time.Second)
152-
}
143+
log.Printf("WARN: Section id lookup for id %s failed! Response code was: %s", id, res.Status)
153144
coursebookHeaders = utils.RefreshToken(chromedpCtx)
154-
// Give coursebook some time to recognize the new token
155-
time.Sleep(500 * time.Millisecond)
145+
// Wait proportionally long to how many times we've retried; generally works pretty well
146+
time.Sleep(500 * time.Millisecond * time.Duration(numRetries))
156147
})
157148

158149
if err != nil {
@@ -181,6 +172,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
181172
sectionsInCoursePrefix++
182173
}
183174
log.Printf("\nFinished scraping course prefix %s. Got %d sections.", coursePrefix, sectionsInCoursePrefix)
175+
// Panic if the number of sections we scraped differs from len(sectionIDs)
176+
if sectionsInCoursePrefix != len(sectionIDs) {
177+
log.Panicf("Section count mismatch! Coursebook has %d sections for course prefix %s but we only got %d", sectionsInCoursePrefix, coursePrefix, sectionsInCoursePrefix)
178+
}
184179
totalSections += sectionsInCoursePrefix
185180
}
186181
log.Printf("\nDone scraping term! Scraped a total of %d sections.", totalSections)

utils/methods.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,19 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
6464

6565
VPrintf("Getting new token...")
6666
err = Retry(func() error {
67-
_, err = chromedp.RunResponse(chromedpCtx,
67+
r, err := chromedp.RunResponse(chromedpCtx,
6868
chromedp.ActionFunc(func(ctx context.Context) error {
6969
err := network.ClearBrowserCookies().Do(ctx)
7070
return err
7171
}),
7272
chromedp.Navigate(`https://wat.utdallas.edu/login`),
73-
chromedp.WaitVisible(`form#login-form`),
7473
chromedp.SendKeys(`input#netid`, netID),
7574
chromedp.SendKeys(`input#password`, password),
76-
chromedp.WaitVisible(`button#login-button`),
7775
chromedp.Click(`button#login-button`),
78-
chromedp.WaitVisible(`body`),
7976
)
77+
if r != nil && r.Status != 200 {
78+
return errors.New("Non-200 response status code")
79+
}
8080
return err
8181
}, 3, delayedRetryCallback)
8282

@@ -89,7 +89,7 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
8989
var cookieStrs []string
9090

9191
err = Retry(func() error {
92-
_, err = chromedp.RunResponse(chromedpCtx,
92+
r, err := chromedp.RunResponse(chromedpCtx,
9393
chromedp.Navigate(`https://coursebook.utdallas.edu/`),
9494
chromedp.ActionFunc(func(ctx context.Context) error {
9595
cookies, err := network.GetCookies().Do(ctx)
@@ -108,6 +108,9 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
108108
return err
109109
}),
110110
)
111+
if r != nil && r.Status != 200 {
112+
return errors.New("Non-200 response status code")
113+
}
111114
return err
112115
}, 3, delayedRetryCallback)
113116

0 commit comments

Comments
 (0)