Skip to content

Commit 50a16f6

Browse files
committed
[scraper] optimize scraper, cleanup deps
1 parent 9aa0845 commit 50a16f6

File tree

6 files changed

+102
-43
lines changed

6 files changed

+102
-43
lines changed

sports/Cargo.toml

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -13,17 +13,8 @@ license.workspace = true
1313
[dependencies]
1414
baseballref = {path = './baseballref'}
1515
anyhow.workspace = true
16-
chrono.workspace = true
1716
clap = { workspace = true, features = ["env"] }
18-
reqwest.workspace = true
19-
rust_decimal.workspace = true
20-
scraper.workspace = true
21-
serde.workspace = true
22-
serde_json.workspace = true
23-
sqlx.workspace = true
24-
thiserror.workspace = true
2517
tokio.workspace = true
26-
tracing.workspace = true
2718
tracing-subscriber.workspace = true
2819

2920
[dev-dependencies]

sports/baseballref/Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,12 +19,10 @@ reqwest.workspace = true
1919
rust_decimal.workspace = true
2020
scraper.workspace = true
2121
serde.workspace = true
22-
serde_json.workspace = true
2322
sqlx.workspace = true
2423
thiserror.workspace = true
2524
tokio.workspace = true
2625
tracing.workspace = true
27-
tracing-subscriber.workspace = true
2826

2927
[dev-dependencies]
3028
tokio = { workspace = true, features = ["test-util"] }

sports/baseballref/src/cli.rs

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -483,9 +483,9 @@ pub async fn handle_command(command: BaseballCommands) -> anyhow::Result<()> {
483483
// Create scraper
484484
let scraper = if let Some(ref dir) = output_dir {
485485
std::fs::create_dir_all(dir)?;
486-
Scraper::new()?.with_output_dir(dir)
486+
Scraper::new().with_output_dir(dir)
487487
} else {
488-
Scraper::new()?
488+
Scraper::new()
489489
};
490490

491491
// Create inserter
@@ -593,9 +593,9 @@ pub async fn handle_command(command: BaseballCommands) -> anyhow::Result<()> {
593593
let scraper = if let Some(ref dir) = output_dir {
594594
// Create output directory if it doesn't exist
595595
std::fs::create_dir_all(dir)?;
596-
Scraper::new()?.with_output_dir(dir)
596+
Scraper::new().with_output_dir(dir)
597597
} else {
598-
Scraper::new()?
598+
Scraper::new()
599599
};
600600

601601
// Create inserter
@@ -667,9 +667,9 @@ pub async fn handle_command(command: BaseballCommands) -> anyhow::Result<()> {
667667
// Create scraper
668668
let scraper = if let Some(ref dir) = output_dir {
669669
std::fs::create_dir_all(dir)?;
670-
Scraper::new()?.with_output_dir(dir)
670+
Scraper::new().with_output_dir(dir)
671671
} else {
672-
Scraper::new()?
672+
Scraper::new()
673673
};
674674

675675
// Create inserter
@@ -871,9 +871,9 @@ pub async fn handle_command(command: BaseballCommands) -> anyhow::Result<()> {
871871
// Create scraper
872872
let scraper = if let Some(ref dir) = output_dir {
873873
std::fs::create_dir_all(dir)?;
874-
Scraper::new()?.with_output_dir(dir)
874+
Scraper::new().with_output_dir(dir)
875875
} else {
876-
Scraper::new()?
876+
Scraper::new()
877877
};
878878

879879
// Create inserter

sports/baseballref/src/db/box_score.rs

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,11 @@ impl<'a> BoxScoreInserter<'a> {
4040
Self { pool }
4141
}
4242

43+
/// Check if a game already exists by `bbref_game_id`
44+
pub async fn game_exists(&self, bbref_game_id: &str) -> Result<bool, InsertError> {
45+
Ok(game_exists(self.pool, bbref_game_id).await?)
46+
}
47+
4348
/// Insert a complete box score into the database
4449
/// Uses a transaction to ensure atomicity
4550
pub async fn insert(&self, box_score: &BoxScore) -> Result<i32, InsertError> {

sports/baseballref/src/db/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ use std::time::Duration;
1111

1212
pub use box_score::{BoxScoreInserter, InsertError};
1313
pub use failed_scrapes::{FailedScrape, FailedScrapesDb};
14+
pub use games::game_exists;
1415
use sqlx::postgres::{PgPool, PgPoolOptions};
1516

1617
/// Create a database connection pool

sports/baseballref/src/scraper/mod.rs

Lines changed: 88 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ mod schedule;
22

33
use std::{path::Path, time::Duration};
44

5-
use reqwest::Client;
5+
use reqwest::{Client, StatusCode};
66
pub use schedule::{BoxScoreUrl, extract_boxscore_urls, extract_boxscore_urls_from_html, schedule_url_for_year};
77
use thiserror::Error;
88
use tokio::time::sleep;
@@ -16,14 +16,23 @@ use crate::{
1616
const BASE_URL: &str = "https://www.baseball-reference.com";
1717
const USER_AGENT: &str = "Mozilla/5.0 (compatible; BaseballScraper/1.0; educational project)";
1818

19-
/// Delay between requests to respect rate limiting (10 requests/minute = 6 seconds between requests)
20-
const REQUEST_DELAY: Duration = Duration::from_secs(6);
19+
/// Base delay between requests (~2 requests/second)
20+
const BASE_DELAY: Duration = Duration::from_secs(3);
21+
/// Maximum delay after repeated backoffs
22+
const MAX_DELAY: Duration = Duration::from_secs(300);
23+
/// Multiplier applied to delay on rate-limit or server errors
24+
const BACKOFF_MULTIPLIER: f64 = 4.0;
25+
/// Maximum retries per request before giving up
26+
const MAX_RETRIES: u32 = 10;
2127

2228
#[derive(Error, Debug)]
2329
pub enum ScrapeError {
2430
#[error("HTTP error: {0}")]
2531
Http(#[from] reqwest::Error),
2632

33+
#[error("Rate limited (HTTP {0})")]
34+
RateLimited(u16),
35+
2736
#[error("Parse error: {0}")]
2837
Parse(#[from] crate::parser::ParseError),
2938

@@ -53,16 +62,20 @@ pub struct Scraper {
5362

5463
impl Scraper {
5564
/// Create a new scraper
56-
pub fn new() -> Result<Self, ScrapeError> {
65+
///
66+
/// # Panics
67+
/// Panics if the HTTP client cannot be built.
68+
pub fn new() -> Self {
5769
let client = Client::builder()
5870
.user_agent(USER_AGENT)
5971
.timeout(Duration::from_secs(30))
60-
.build()?;
72+
.build()
73+
.expect("http client should be valid");
6174

62-
Ok(Self {
75+
Self {
6376
client,
6477
output_dir: None,
65-
})
78+
}
6679
}
6780

6881
/// Set directory to save downloaded HTML files
@@ -91,12 +104,20 @@ impl Scraper {
91104
Ok(html)
92105
}
93106

94-
/// Fetch a box score page
107+
/// Fetch a box score page, returning the HTML on success or an error with
108+
/// rate-limit awareness. Returns `ScrapeError::RateLimited` for 429 and 5xx
109+
/// responses so callers can back off.
95110
pub async fn fetch_boxscore(&self, url: &BoxScoreUrl) -> Result<String, ScrapeError> {
96111
let full_url = format!("{}{}", BASE_URL, url.path);
97112
info!("Fetching: {}", full_url);
98113

99114
let response = self.client.get(&full_url).send().await?;
115+
let status = response.status();
116+
117+
if status == StatusCode::TOO_MANY_REQUESTS || status.is_server_error() {
118+
return Err(ScrapeError::RateLimited(status.as_u16()));
119+
}
120+
100121
let html = response.text().await?;
101122

102123
// Save to file if output directory is set
@@ -110,17 +131,55 @@ impl Scraper {
110131
Ok(html)
111132
}
112133

113-
/// Scrape and import a single box score
114-
pub async fn scrape_and_import(&self, url: &BoxScoreUrl, inserter: &BoxScoreInserter<'_>) -> ScrapeResult {
115-
// Fetch the HTML
116-
let html = match self.fetch_boxscore(url).await {
117-
Ok(h) => h,
118-
Err(e) => {
119-
return ScrapeResult::Failed {
134+
/// Scrape and import a single box score, retrying with backoff on rate-limit errors.
135+
/// Returns the result and whether a rate-limit was hit (so the caller can adjust pacing).
136+
async fn scrape_and_import_with_backoff(
137+
&self,
138+
url: &BoxScoreUrl,
139+
inserter: &BoxScoreInserter<'_>,
140+
current_delay: &mut Duration,
141+
) -> ScrapeResult {
142+
// Check if game already exists before fetching
143+
match inserter.game_exists(&url.game_id).await {
144+
Ok(true) => {
145+
return ScrapeResult::AlreadyExists {
120146
game_id: url.game_id.clone(),
121-
error: e.to_string(),
122147
};
123148
}
149+
Ok(false) => {}
150+
Err(e) => {
151+
warn!("Failed to check if game exists: {e}, proceeding with fetch");
152+
}
153+
}
154+
155+
let mut attempt = 0;
156+
157+
let html = loop {
158+
match self.fetch_boxscore(url).await {
159+
Ok(h) => {
160+
// Success — ease back toward base delay
161+
*current_delay = (*current_delay / 2).max(BASE_DELAY);
162+
break h;
163+
}
164+
Err(ScrapeError::RateLimited(status)) => {
165+
attempt += 1;
166+
*current_delay = current_delay.mul_f64(BACKOFF_MULTIPLIER).min(MAX_DELAY);
167+
if attempt > MAX_RETRIES {
168+
return ScrapeResult::Failed {
169+
game_id: url.game_id.clone(),
170+
error: format!("Rate limited (HTTP {status}) after {MAX_RETRIES} retries"),
171+
};
172+
}
173+
warn!("Rate limited (HTTP {status}), retry {attempt}/{MAX_RETRIES} after {current_delay:?}");
174+
sleep(*current_delay).await;
175+
}
176+
Err(e) => {
177+
return ScrapeResult::Failed {
178+
game_id: url.game_id.clone(),
179+
error: e.to_string(),
180+
};
181+
}
182+
}
124183
};
125184

126185
// Parse the box score
@@ -153,7 +212,10 @@ impl Scraper {
153212
self.scrape_all_with_tracking(urls, inserter, None).await
154213
}
155214

156-
/// Scrape multiple box scores with rate limiting and optional failure tracking
215+
/// Scrape multiple box scores with adaptive rate limiting and optional failure tracking.
216+
///
217+
/// Starts at ~2 requests/second and backs off exponentially on 429 / 5xx
218+
/// errors. After successful requests the delay eases back toward the base rate.
157219
pub async fn scrape_all_with_tracking(
158220
&self,
159221
urls: &[BoxScoreUrl],
@@ -162,11 +224,12 @@ impl Scraper {
162224
) -> Vec<ScrapeResult> {
163225
let mut results = Vec::with_capacity(urls.len());
164226
let total = urls.len();
227+
let mut delay = BASE_DELAY;
165228

166229
for (i, url) in urls.iter().enumerate() {
167-
info!("[{}/{}] Processing: {}", i + 1, total, url.game_id);
230+
info!("[{}/{}] Processing: {} (delay: {delay:?})", i + 1, total, url.game_id);
168231

169-
let result = self.scrape_and_import(url, inserter).await;
232+
let result = self.scrape_and_import_with_backoff(url, inserter, &mut delay).await;
170233

171234
match &result {
172235
ScrapeResult::Imported { game_id, db_id } => {
@@ -198,12 +261,13 @@ impl Scraper {
198261
}
199262
}
200263

264+
// Only delay after actual HTTP requests (not skipped duplicates)
265+
let needs_delay = !matches!(&result, ScrapeResult::AlreadyExists { .. });
266+
201267
results.push(result);
202268

203-
// Rate limiting - wait between requests (except for last one)
204-
if i < total - 1 {
205-
info!("Waiting {:?} before next request...", REQUEST_DELAY);
206-
sleep(REQUEST_DELAY).await;
269+
if needs_delay && i < total - 1 {
270+
sleep(delay).await;
207271
}
208272
}
209273

@@ -213,6 +277,6 @@ impl Scraper {
213277

214278
impl Default for Scraper {
215279
fn default() -> Self {
216-
Self::new().expect("Failed to create default scraper")
280+
Self::new()
217281
}
218282
}

0 commit comments

Comments (0)