Skip to content

Commit f7db620

Browse files
Authored: Merge pull request #18 from jqnatividad/security-review
fix: address HIGH and MEDIUM security audit findings
2 parents 3fd4106 + 686f60e commit f7db620

File tree

3 files changed

+70
-12
lines changed

3 files changed

+70
-12
lines changed

src/http.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,13 +137,22 @@ fn fetch_full(url: &str, max_bytes: Option<usize>) -> Result<FetchResult, HttpEr
137137
let body = response.into_body();
138138
let mut reader = body.into_reader();
139139

140+
const MAX_BYTES: u64 = 1024 * 1024 * 1024; // 1 GB hard cap
140141
let data = if let Some(bytes) = max_bytes {
141142
let mut buf = Vec::with_capacity(bytes);
142143
reader.take(bytes as u64).read_to_end(&mut buf)?;
143144
buf
144145
} else {
145146
let mut buf = Vec::new();
146-
reader.read_to_end(&mut buf)?;
147+
(&mut reader).take(MAX_BYTES).read_to_end(&mut buf)?;
148+
if buf.len() as u64 == MAX_BYTES {
149+
let mut probe = [0u8; 1];
150+
if reader.read(&mut probe)? > 0 {
151+
eprintln!(
152+
"warning: HTTP response exceeds 1 GB; sniffing on truncated sample — results may be inaccurate"
153+
);
154+
}
155+
}
147156
buf
148157
};
149158

src/main.rs

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -379,21 +379,19 @@ fn print_json_output(path: &str, metadata: &csv_nose::Metadata, verbose: bool) {
379379
}
380380

381381
fn print_csv_output(path: &str, metadata: &csv_nose::Metadata) {
382-
static mut HEADER_PRINTED: bool = false;
382+
use std::sync::atomic::{AtomicBool, Ordering};
383+
static HEADER_PRINTED: AtomicBool = AtomicBool::new(false);
383384

384385
let quote_str = match metadata.dialect.quote {
385386
Quote::None => "none".to_string(),
386387
Quote::Some(q) => format!("{}", q as char),
387388
};
388389

389390
// CSV header (print only for first file or could be configured)
390-
unsafe {
391-
if !HEADER_PRINTED {
392-
println!(
393-
"file,delimiter,quote,has_header,preamble_rows,flexible,is_utf8,num_fields,avg_record_len"
394-
);
395-
HEADER_PRINTED = true;
396-
}
391+
if !HEADER_PRINTED.swap(true, Ordering::Relaxed) {
392+
println!(
393+
"file,delimiter,quote,has_header,preamble_rows,flexible,is_utf8,num_fields,avg_record_len"
394+
);
397395
}
398396

399397
println!(

src/sniffer.rs

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with
1919
use crate::tum::table::{Table, parse_table};
2020
use crate::tum::type_detection::infer_column_types;
2121

22+
/// Maximum buffer size for `SampleSize::Records` mode (100 MB).
23+
const MAX_RECORDS_BYTES: usize = 100 * 1024 * 1024;
24+
2225
/// CSV dialect sniffer using the Table Uniformity Method.
2326
///
2427
/// # Example
@@ -188,14 +191,23 @@ impl Sniffer {
188191
Ok(buffer)
189192
}
190193
SampleSize::All => {
194+
const MAX_BYTES: u64 = 1024 * 1024 * 1024; // 1 GB hard cap
191195
let mut buffer = Vec::new();
192-
reader.read_to_end(&mut buffer)?;
196+
(&mut reader).take(MAX_BYTES).read_to_end(&mut buffer)?;
197+
if buffer.len() as u64 == MAX_BYTES {
198+
let mut probe = [0u8; 1];
199+
if reader.read(&mut probe)? > 0 {
200+
eprintln!(
201+
"warning: input exceeds 1 GB; sniffing on truncated sample — results may be inaccurate"
202+
);
203+
}
204+
}
193205
Ok(buffer)
194206
}
195207
SampleSize::Records(n) => {
196208
// For records, we read enough to capture n records
197209
// Estimate ~1KB per record as a starting point, with a minimum
198-
let estimated_size = (n * 1024).max(8192);
210+
let estimated_size = n.saturating_mul(1024).clamp(8192, MAX_RECORDS_BYTES);
199211
let mut buffer = vec![0u8; estimated_size];
200212
let bytes_read = reader.read(&mut buffer)?;
201213
buffer.truncate(bytes_read);
@@ -206,14 +218,24 @@ impl Sniffer {
206218
let newlines = bytecount::count(&buffer, b'\n');
207219
if newlines < n {
208220
// Read more data
209-
let additional = (n - newlines) * 2048;
221+
let additional = (n - newlines).saturating_mul(2048).min(MAX_RECORDS_BYTES);
210222
let mut more = vec![0u8; additional];
211223
let more_read = reader.read(&mut more)?;
212224
more.truncate(more_read);
213225
buffer.extend(more);
214226
}
215227
}
216228

229+
if buffer.len() >= MAX_RECORDS_BYTES {
230+
let mut probe = [0u8; 1];
231+
if reader.read(&mut probe)? > 0 {
232+
eprintln!(
233+
"warning: Records sample capped at 100 MB; \
234+
sniff result may be approximate for very large inputs"
235+
);
236+
}
237+
}
238+
217239
Ok(buffer)
218240
}
219241
}
@@ -717,4 +739,33 @@ mod tests {
717739
// Raw: 16 + 12 = 28 bytes for 2 rows = 14 bytes avg
718740
assert_eq!(metadata.avg_record_len, 14);
719741
}
742+
743+
#[test]
744+
fn test_records_mode_cap_boundary_ok() {
745+
// Verify that a reader with more than MAX_RECORDS_BYTES of valid CSV is handled
746+
// gracefully and returns Ok. Uses a cycling pattern to avoid a large literal in
747+
// test source; the runtime still materializes ~100 MB via collect().
748+
// We supply MAX_RECORDS_BYTES + one extra row so the probe-read finds data and
749+
// the truncation warning fires, but the sniff still succeeds.
750+
let row = b"col1,col2,col3\n1,2,3\n"; // 21 bytes, valid CSV pair
751+
let total = MAX_RECORDS_BYTES + row.len();
752+
let data: Vec<u8> = row.iter().copied().cycle().take(total).collect();
753+
// Confirm the test data actually exceeds the cap so the probe path is exercised.
754+
assert!(
755+
data.len() > MAX_RECORDS_BYTES,
756+
"test data must exceed MAX_RECORDS_BYTES to exercise probe-read path"
757+
);
758+
let cursor = std::io::Cursor::new(data);
759+
let mut sniffer = Sniffer::new();
760+
// Records(200_000): estimated_size = 200_000 * 1024 > MAX_RECORDS_BYTES (100 MB), clamped to MAX_RECORDS_BYTES.
761+
sniffer.sample_size(SampleSize::Records(200_000));
762+
let result = sniffer.sniff_reader(cursor);
763+
assert!(
764+
result.is_ok(),
765+
"sniff should succeed at cap boundary: {result:?}"
766+
);
767+
// Note: the eprintln! truncation warning cannot be captured in a standard Rust
768+
// unit test without process-level stderr redirection. The probe-read path is
769+
// exercised by virtue of data.len() > MAX_RECORDS_BYTES (asserted above).
770+
}
720771
}

0 commit comments

Comments (0)