@@ -2,7 +2,7 @@ mod schedule;
22
33use std:: { path:: Path , time:: Duration } ;
44
5- use reqwest:: Client ;
5+ use reqwest:: { Client , StatusCode } ;
66pub use schedule:: { BoxScoreUrl , extract_boxscore_urls, extract_boxscore_urls_from_html, schedule_url_for_year} ;
77use thiserror:: Error ;
88use tokio:: time:: sleep;
@@ -16,14 +16,23 @@ use crate::{
1616const BASE_URL : & str = "https://www.baseball-reference.com" ;
1717const USER_AGENT : & str = "Mozilla/5.0 (compatible; BaseballScraper/1.0; educational project)" ;
1818
19- /// Delay between requests to respect rate limiting (10 requests/minute = 6 seconds between requests)
20- const REQUEST_DELAY : Duration = Duration :: from_secs ( 6 ) ;
19+ /// Base delay between requests (one request every 3 seconds)
20+ const BASE_DELAY : Duration = Duration :: from_secs ( 3 ) ;
21+ /// Maximum delay after repeated backoffs
22+ const MAX_DELAY : Duration = Duration :: from_secs ( 300 ) ;
23+ /// Multiplier applied to delay on rate-limit or server errors
24+ const BACKOFF_MULTIPLIER : f64 = 4.0 ;
25+ /// Maximum retries per request before giving up
26+ const MAX_RETRIES : u32 = 10 ;
2127
2228#[ derive( Error , Debug ) ]
2329pub enum ScrapeError {
2430 #[ error( "HTTP error: {0}" ) ]
2531 Http ( #[ from] reqwest:: Error ) ,
2632
33+ #[ error( "Rate limited (HTTP {0})" ) ]
34+ RateLimited ( u16 ) ,
35+
2736 #[ error( "Parse error: {0}" ) ]
2837 Parse ( #[ from] crate :: parser:: ParseError ) ,
2938
@@ -53,16 +62,20 @@ pub struct Scraper {
5362
5463impl Scraper {
5564 /// Create a new scraper
56- pub fn new ( ) -> Result < Self , ScrapeError > {
65+ ///
66+ /// # Panics
67+ /// Panics if the HTTP client cannot be built.
68+ pub fn new ( ) -> Self {
5769 let client = Client :: builder ( )
5870 . user_agent ( USER_AGENT )
5971 . timeout ( Duration :: from_secs ( 30 ) )
60- . build ( ) ?;
72+ . build ( )
73+ . expect ( "http client should be valid" ) ;
6174
62- Ok ( Self {
75+ Self {
6376 client,
6477 output_dir : None ,
65- } )
78+ }
6679 }
6780
6881 /// Set directory to save downloaded HTML files
@@ -91,12 +104,20 @@ impl Scraper {
91104 Ok ( html)
92105 }
93106
94- /// Fetch a box score page
107+ /// Fetch a box score page, returning the HTML on success or an error with
108+ /// rate-limit awareness. Returns `ScrapeError::RateLimited` for 429 and 5xx
109+ /// responses so callers can back off.
95110 pub async fn fetch_boxscore ( & self , url : & BoxScoreUrl ) -> Result < String , ScrapeError > {
96111 let full_url = format ! ( "{}{}" , BASE_URL , url. path) ;
97112 info ! ( "Fetching: {}" , full_url) ;
98113
99114 let response = self . client . get ( & full_url) . send ( ) . await ?;
115+ let status = response. status ( ) ;
116+
117+ if status == StatusCode :: TOO_MANY_REQUESTS || status. is_server_error ( ) {
118+ return Err ( ScrapeError :: RateLimited ( status. as_u16 ( ) ) ) ;
119+ }
120+
100121 let html = response. text ( ) . await ?;
101122
102123 // Save to file if output directory is set
@@ -110,17 +131,55 @@ impl Scraper {
110131 Ok ( html)
111132 }
112133
113- /// Scrape and import a single box score
114- pub async fn scrape_and_import ( & self , url : & BoxScoreUrl , inserter : & BoxScoreInserter < ' _ > ) -> ScrapeResult {
115- // Fetch the HTML
116- let html = match self . fetch_boxscore ( url) . await {
117- Ok ( h) => h,
118- Err ( e) => {
119- return ScrapeResult :: Failed {
134+ /// Scrape and import a single box score, retrying with backoff on rate-limit errors.
135+ /// Returns the result and whether a rate-limit was hit (so the caller can adjust pacing).
136+ async fn scrape_and_import_with_backoff (
137+ & self ,
138+ url : & BoxScoreUrl ,
139+ inserter : & BoxScoreInserter < ' _ > ,
140+ current_delay : & mut Duration ,
141+ ) -> ScrapeResult {
142+ // Check if game already exists before fetching
143+ match inserter. game_exists ( & url. game_id ) . await {
144+ Ok ( true ) => {
145+ return ScrapeResult :: AlreadyExists {
120146 game_id : url. game_id . clone ( ) ,
121- error : e. to_string ( ) ,
122147 } ;
123148 }
149+ Ok ( false ) => { }
150+ Err ( e) => {
151+ warn ! ( "Failed to check if game exists: {e}, proceeding with fetch" ) ;
152+ }
153+ }
154+
155+ let mut attempt = 0 ;
156+
157+ let html = loop {
158+ match self . fetch_boxscore ( url) . await {
159+ Ok ( h) => {
160+ // Success — ease back toward base delay
161+ * current_delay = ( * current_delay / 2 ) . max ( BASE_DELAY ) ;
162+ break h;
163+ }
164+ Err ( ScrapeError :: RateLimited ( status) ) => {
165+ attempt += 1 ;
166+ * current_delay = current_delay. mul_f64 ( BACKOFF_MULTIPLIER ) . min ( MAX_DELAY ) ;
167+ if attempt > MAX_RETRIES {
168+ return ScrapeResult :: Failed {
169+ game_id : url. game_id . clone ( ) ,
170+ error : format ! ( "Rate limited (HTTP {status}) after {MAX_RETRIES} retries" ) ,
171+ } ;
172+ }
173+ warn ! ( "Rate limited (HTTP {status}), retry {attempt}/{MAX_RETRIES} after {current_delay:?}" ) ;
174+ sleep ( * current_delay) . await ;
175+ }
176+ Err ( e) => {
177+ return ScrapeResult :: Failed {
178+ game_id : url. game_id . clone ( ) ,
179+ error : e. to_string ( ) ,
180+ } ;
181+ }
182+ }
124183 } ;
125184
126185 // Parse the box score
@@ -153,7 +212,10 @@ impl Scraper {
153212 self . scrape_all_with_tracking ( urls, inserter, None ) . await
154213 }
155214
156- /// Scrape multiple box scores with rate limiting and optional failure tracking
215+ /// Scrape multiple box scores with adaptive rate limiting and optional failure tracking.
216+ ///
217+ /// Starts at one request every 3 seconds and backs off exponentially on 429 / 5xx
218+ /// errors. After successful requests the delay eases back toward the base rate.
157219 pub async fn scrape_all_with_tracking (
158220 & self ,
159221 urls : & [ BoxScoreUrl ] ,
@@ -162,11 +224,12 @@ impl Scraper {
162224 ) -> Vec < ScrapeResult > {
163225 let mut results = Vec :: with_capacity ( urls. len ( ) ) ;
164226 let total = urls. len ( ) ;
227+ let mut delay = BASE_DELAY ;
165228
166229 for ( i, url) in urls. iter ( ) . enumerate ( ) {
167- info ! ( "[{}/{}] Processing: {}" , i + 1 , total, url. game_id) ;
230+ info ! ( "[{}/{}] Processing: {} (delay: {delay:?}) " , i + 1 , total, url. game_id) ;
168231
169- let result = self . scrape_and_import ( url, inserter) . await ;
232+ let result = self . scrape_and_import_with_backoff ( url, inserter, & mut delay ) . await ;
170233
171234 match & result {
172235 ScrapeResult :: Imported { game_id, db_id } => {
@@ -198,12 +261,13 @@ impl Scraper {
198261 }
199262 }
200263
264+ // Only delay after actual HTTP requests (not skipped duplicates)
265+ let needs_delay = !matches ! ( & result, ScrapeResult :: AlreadyExists { .. } ) ;
266+
201267 results. push ( result) ;
202268
203- // Rate limiting - wait between requests (except for last one)
204- if i < total - 1 {
205- info ! ( "Waiting {:?} before next request..." , REQUEST_DELAY ) ;
206- sleep ( REQUEST_DELAY ) . await ;
269+ if needs_delay && i < total - 1 {
270+ sleep ( delay) . await ;
207271 }
208272 }
209273
@@ -213,6 +277,6 @@ impl Scraper {
213277
214278impl Default for Scraper {
215279 fn default ( ) -> Self {
216- Self :: new ( ) . expect ( "Failed to create default scraper" )
280+ Self :: new ( )
217281 }
218282}
0 commit comments