@@ -10,9 +10,12 @@ import (
1010 "fmt"
1111 "io"
1212 "log"
13+ "math"
14+ "net"
1315 "net/http"
1416 "os"
1517 "path/filepath"
18+ "regexp"
1619 "slices"
1720 "sort"
1821 "strings"
@@ -21,29 +24,41 @@ import (
2124 "github.com/UTDNebula/api-tools/utils"
2225)
2326
// Validation patterns for the arguments of ScrapeCoursebook. Both patterns are
// anchored with ^ and $ so the entire argument has to match; an unanchored
// pattern would accept any string that merely contains a valid value.
var (
	// prefixRegex matches "cp_" followed by at most five lowercase letters.
	prefixRegex = regexp.MustCompile(`^cp_[a-z]{0,5}$`)
	// termRegex matches one or two digits followed by one of s, f or u.
	termRegex = regexp.MustCompile(`^[0-9]{1,2}[sfu]$`)
)

const (
	// reqThrottle is the delay applied between individual requests.
	reqThrottle = 400 * time.Millisecond
	// prefixThrottle is the delay applied after a prefix has been scraped.
	prefixThrottle = 5 * time.Second
)
34+
2435// ScrapeCoursebook scrapes utd coursebook for the provided term (semester)
2536// if the resume flag is true and no startPrefix is given, scraping resumes from the last complete prefix
2637func ScrapeCoursebook (term string , startPrefix string , outDir string , resume bool ) {
38+ if startPrefix != "" && ! prefixRegex .MatchString (startPrefix ) {
39+ log .Fatalf ("Invalid starting prefix %s, must match format cp_{abcde}" , startPrefix )
40+ }
41+ if ! termRegex .MatchString (term ) {
42+ log .Fatalf ("Invalid term %s, must match format {00-99}{s/f/u}" , term )
43+ }
44+
2745 scraper := newCoursebookScraper (term , outDir )
2846 defer scraper .cancel ()
2947
30- if startPrefix == "" {
48+ if resume && startPrefix == "" {
3149 // providing a starting prefix overrides the resume flag
3250 startPrefix = scraper .lastCompletePrefix ()
3351 }
3452
35- skipped := 0
36- for _ , prefix := range scraper .prefixes {
37- if startPrefix == "" || strings .Compare (prefix , startPrefix ) == - 1 {
38- utils .VPrintf ("Skipping prefix %s" , prefix )
39- continue
40- }
53+ log .Printf ("[Begin Scrape] Starting scrape for term %s with %d prefixes" , term , len (scraper .prefixes ))
4154
42- if skipped != - 1 {
43- log .Printf ("Skipped %d prefixes" , skipped )
44- skipped = - 1
55+ totalTime := time .Now ()
56+ for i , prefix := range scraper .prefixes {
57+ if startPrefix != "" && strings .Compare (prefix , startPrefix ) < 0 {
58+ continue
4559 }
4660
61+ start := time .Now ()
4762 if err := scraper .ensurePrefixFolder (prefix ); err != nil {
4863 log .Fatal (err )
4964 }
@@ -57,10 +72,18 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
5772 } else {
5873 sectionIds , err = scraper .getSectionIdsForPrefix (prefix )
5974 }
75+
6076 if err != nil {
61- log .Fatal ("Error getting section ids for prefix " , prefix )
77+ log .Fatalf ("Error getting section ids for %s " , prefix )
6278 }
6379
80+ if len (sectionIds ) == 0 {
81+ log .Printf ("No sections found for %s " , prefix )
82+ continue
83+ }
84+
85+ log .Printf ("[Scrape Prefix] %s (%d/%d): Found %d sections to scrape." , prefix , i + 1 , len (scraper .prefixes ), len (sectionIds ))
86+
6487 for _ , sectionId := range sectionIds {
6588 content , err := scraper .getSectionContent (sectionId )
6689 if err != nil {
@@ -69,21 +92,17 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
6992 if err := scraper .writeSection (prefix , sectionId , content ); err != nil {
7093 log .Fatalf ("Error writing section %s: %v" , sectionId , err )
7194 }
72- utils .VPrintf ("Got section: %s" , sectionId )
73-
74- // wait 3 seconds between requests to avoid rate limiting
75- time .Sleep (3 * time .Second )
95+ time .Sleep (reqThrottle )
7696 }
7797
98+ // At the end of the prefix loop
99+ log .Printf ("[End Prefix] %s: Scraped %d sections in %v." , prefix , len (sectionIds ), time .Since (start ))
100+ time .Sleep (prefixThrottle )
78101 }
79- log .Print ("Done scraping term!" )
80- log .Print ("Validating... " )
102+ log .Printf ("[Scrape Complete] Finished scraping term %s in %v. Total sections %d: Total retries %d" , term , time .Since (totalTime ), scraper .totalScrapedSections , scraper .reqRetries )
81103
82- success , err := scraper .validate ()
83- if err != nil {
104+ if err := scraper .validate (); err != nil {
84105 log .Fatal ("Validating failed: " , err )
85- } else if success {
86- log .Print ("Validating successful!" )
87106 }
88107}
89108
@@ -95,11 +114,20 @@ type coursebookScraper struct {
95114 prefixes []string
96115 term string
97116 outDir string
117+
118+ prefixIdsCache map [string ][]string
119+
120+ //metrics
121+ reqRetries int
122+ totalScrapedSections int
98123}
99124
100125func newCoursebookScraper (term string , outDir string ) * coursebookScraper {
101126 ctx , cancel := utils .InitChromeDp ()
102- httpClient := & http.Client {}
127+ httpClient := & http.Client {
128+ // longer than 10 seconds is probably a rate limit
129+ Timeout : 10 * time .Second ,
130+ }
103131
104132 //prefixes in alphabetical order for skip prefix flag
105133 prefixes := utils .GetCoursePrefixes (ctx )
@@ -112,13 +140,18 @@ func newCoursebookScraper(term string, outDir string) *coursebookScraper {
112140 coursebookHeaders : utils .RefreshToken (ctx ),
113141 term : term ,
114142 outDir : outDir ,
143+ prefixIdsCache : make (map [string ][]string ),
115144 }
116145}
117146
118147// lastCompletePrefix returns the last prefix (alphabetical order) that contains
119148// html files for all of its section ids. returns an empty string if there are no
120149// complete prefixes
121150func (s * coursebookScraper ) lastCompletePrefix () string {
151+ if err := s .ensureOutputFolder (); err != nil {
152+ log .Fatal (err )
153+ }
154+
122155 dir , err := os .ReadDir (filepath .Join (s .outDir , s .term ))
123156 if err != nil {
124157 log .Fatalf ("failed to read output directory: %v" , err )
@@ -140,10 +173,20 @@ func (s *coursebookScraper) lastCompletePrefix() string {
140173 if len (missing ) == 0 {
141174 return prefix
142175 }
176+ time .Sleep (reqThrottle )
143177 }
144178 return ""
145179}
146180
181+ // ensurePrefixFolder creates {outDir}/term if it does not exist
182+
183+ func (s * coursebookScraper ) ensureOutputFolder () error {
184+ if err := os .MkdirAll (filepath .Join (s .outDir , s .term ), 0755 ); err != nil {
185+ return fmt .Errorf ("failed to create term forlder: %w" , err )
186+ }
187+ return nil
188+ }
189+
147190// ensurePrefixFolder creates {outDir}/term/prefix if it does not exist
148191func (s * coursebookScraper ) ensurePrefixFolder (prefix string ) error {
149192 if err := os .MkdirAll (filepath .Join (s .outDir , s .term , prefix ), 0755 ); err != nil {
@@ -164,10 +207,11 @@ func (s *coursebookScraper) writeSection(prefix string, id string, content strin
164207// retries up to 3 times, each time refreshing the token and waiting longer
165208func (s * coursebookScraper ) getSectionContent (id string ) (string , error ) {
166209 queryStr := fmt .Sprintf ("id=%s&req=b30da8ab21637dbef35fd7682f48e1c1W0ypMhaj%%2FdsnYn3Wa03BrxSNgCeyvLfvucSTobcSXRf38SWaUaNfMjJQn%%2BdcabF%%2F7ZuG%%2BdKqHAqmrxEKyg8AdB0FqVGcz4rkff3%%2B3SIUIt8%%3D&action=info" , id )
167- response , err := s .req (queryStr , 3 , fmt . Sprintf ( "section %s content" , id ) )
210+ response , err := s .req (queryStr , 3 , id )
168211 if err != nil {
169212 return "" , fmt .Errorf ("get section content for id %s failed: %w" , id , err )
170213 }
214+ s .totalScrapedSections ++
171215 return response , nil
172216}
173217
@@ -203,33 +247,34 @@ func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, err
203247 for _ , id := range sectionIds {
204248 if ! foundIds [id ] {
205249 filteredIds = append (filteredIds , id )
206- } else {
207- utils .VPrintf ("Found section: %s" , id )
208-
209250 }
210251 }
211252
212253 return filteredIds , nil
213254}
214255
215256// getSectionIdsForPrefix calls internal coursebook api to get all section ids for the provide prefix
216- // retries up to 10 times, each time refreshing the token and waiting longer
257+ // retries up to 10 times, each time refreshing the token and waiting longer.
217258func (s * coursebookScraper ) getSectionIdsForPrefix (prefix string ) ([]string , error ) {
259+ if ids , ok := s .prefixIdsCache [prefix ]; ok {
260+ return ids , nil
261+ }
262+
218263 sections := make ([]string , 0 , 100 )
219264 for _ , clevel := range []string {"clevel_u" , "clevel_g" } {
220265 queryStr := fmt .Sprintf ("action=search&s%%5B%%5D=term_%s&s%%5B%%5D=%s&s%%5B%%5D=%s" , s .term , prefix , clevel )
221- content , err := s .req (queryStr , 10 , fmt . Sprintf ( "sections for %s" , prefix ) )
266+ content , err := s .req (queryStr , 10 , prefix )
222267 if err != nil {
223- return [] string {} , fmt .Errorf ("failed to fetch sections: %s" , err )
268+ return nil , fmt .Errorf ("failed to fetch sections: %s" , err )
224269 }
225-
226270 sectionRegexp := utils .Regexpf (`View details for section (%s%s\.\w+\.%s)` , prefix [3 :], utils .R_COURSE_CODE , utils .R_TERM_CODE )
227271 smatches := sectionRegexp .FindAllStringSubmatch (content , - 1 )
228272 for _ , match := range smatches {
229273 sections = append (sections , match [1 ])
230274 }
231275 }
232- log .Printf ("Found %d sections for %s" , len (sections ), prefix )
276+
277+ s .prefixIdsCache [prefix ] = sections
233278 return sections , nil
234279}
235280
@@ -242,17 +287,27 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
242287 log .Fatalf ("Http request failed: %v" , err )
243288 }
244289 req .Header = s .coursebookHeaders
290+
291+ start := time .Now ()
245292 res , err = s .httpClient .Do (req )
293+ dur := time .Since (start )
294+
246295 if res != nil && res .StatusCode != 200 {
296+ if netErr , ok := err .(net.Error ); ok && netErr .Timeout () {
297+ utils .VPrintf ("[Timeout] Request for [%s] timed out" , reqName )
298+ return netErr // Return the error to trigger a retry
299+ }
300+
247301 return errors .New ("non-200 response status code" )
248302 }
303+
304+ utils .VPrintf ("[Success] Request for [%s] took %v" , reqName , dur )
249305 return err
250306 }, retries , func (numRetries int ) {
251- log . Printf ( "Failed to get %s, Retry %d of %d " , reqName , numRetries , retries )
307+ utils . VPrintf ( "[Retry] Attempt %d of %d for request [%s] " , numRetries , retries , reqName )
252308 s .coursebookHeaders = utils .RefreshToken (s .chromedpCtx )
253-
254- // front load delay since if the first one fails it is likely the next few will as well
255- time .Sleep ((8 * time .Second ) + (500 * time .Millisecond * time .Duration (numRetries )))
309+ s .reqRetries ++
310+ time .Sleep (time .Duration (math .Pow (2 , float64 (numRetries ))) * time .Second )
256311 })
257312 if err != nil {
258313 return "" , err
@@ -276,38 +331,41 @@ func (s *coursebookScraper) cancel() {
276331}
277332
278333// validate returns true if each prefix contains all required ids
279- func (s * coursebookScraper ) validate () (bool , error ) {
280- missing := make (map [string ][]string )
334+ // if it does not it will re-scrape all missing sections
335+ func (s * coursebookScraper ) validate () error {
336+ log .Printf ("[Begin Validation] Starting Validation for term %s" , s .term )
281337
282- count := 0
283338 for _ , prefix := range s .prefixes {
284339 ids , err := s .getMissingIdsForPrefix (prefix )
285340 if err != nil {
286- return false , err
341+ return err
342+ }
343+ if len (ids ) == 0 {
344+ log .Printf ("[Validation] %s is correct" , prefix )
345+ continue
287346 }
288- if len (ids ) > 0 {
289- count ++
347+
348+ log .Printf ("[Validation] Missing %d sections for %s" , len (ids ), prefix )
349+
350+ if err := s .ensurePrefixFolder (prefix ); err != nil {
351+ log .Fatal (err )
290352 }
291- missing [prefix ] = ids
292- time .Sleep (5 * time .Second )
293- }
294353
295- for prefix , ids := range missing {
296- if len (ids ) > 0 {
297- log .Printf ("Missing %d sections for prefix: %s" , len (ids ), prefix )
298-
299- for _ , id := range ids {
300- content , err := s .getSectionContent (id )
301- if err != nil {
302- return false , fmt .Errorf ("error getting section content for section %s: %v" , id , err )
303- }
304- if err := s .writeSection (prefix , id , content ); err != nil {
305- return false , fmt .Errorf ("error writing section %s: %v" , id , err )
306- }
307- utils .VPrintf ("Got section: %s" , id )
308- time .Sleep (3 * time .Second )
354+ for _ , id := range ids {
355+ content , err := s .getSectionContent (id )
356+ if err != nil {
357+ return fmt .Errorf ("error getting section content for section %s: %v" , id , err )
309358 }
359+ if err := s .writeSection (prefix , id , content ); err != nil {
360+ return fmt .Errorf ("error writing section %s: %v" , id , err )
361+ }
362+ time .Sleep (3 * time .Second )
310363 }
364+
365+ log .Printf ("[Validation] %s is correct" , prefix )
366+ time .Sleep (5 * time .Second )
311367 }
312- return true , nil
368+
369+ log .Print ("[End Validation] Validation Successful" )
370+ return nil
313371}
0 commit comments