@@ -19,6 +19,9 @@ use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with
1919use crate :: tum:: table:: { Table , parse_table} ;
2020use crate :: tum:: type_detection:: infer_column_types;
2121
22+ /// Maximum buffer size for `SampleSize::Records` mode (100 MB).
23+ const MAX_RECORDS_BYTES : usize = 100 * 1024 * 1024 ;
24+
2225/// CSV dialect sniffer using the Table Uniformity Method.
2326///
2427/// # Example
@@ -188,14 +191,23 @@ impl Sniffer {
188191 Ok ( buffer)
189192 }
190193 SampleSize :: All => {
194+ const MAX_BYTES : u64 = 1024 * 1024 * 1024 ; // 1 GB hard cap
191195 let mut buffer = Vec :: new ( ) ;
192- reader. read_to_end ( & mut buffer) ?;
196+ ( & mut reader) . take ( MAX_BYTES ) . read_to_end ( & mut buffer) ?;
197+ if buffer. len ( ) as u64 == MAX_BYTES {
198+ let mut probe = [ 0u8 ; 1 ] ;
199+ if reader. read ( & mut probe) ? > 0 {
200+ eprintln ! (
201+ "warning: input exceeds 1 GB; sniffing on truncated sample — results may be inaccurate"
202+ ) ;
203+ }
204+ }
193205 Ok ( buffer)
194206 }
195207 SampleSize :: Records ( n) => {
196208 // For records, we read enough to capture n records
197209 // Estimate ~1KB per record as a starting point, with a minimum
198- let estimated_size = ( n * 1024 ) . max ( 8192 ) ;
210+ let estimated_size = n . saturating_mul ( 1024 ) . clamp ( 8192 , MAX_RECORDS_BYTES ) ;
199211 let mut buffer = vec ! [ 0u8 ; estimated_size] ;
200212 let bytes_read = reader. read ( & mut buffer) ?;
201213 buffer. truncate ( bytes_read) ;
@@ -206,14 +218,24 @@ impl Sniffer {
206218 let newlines = bytecount:: count ( & buffer, b'\n' ) ;
207219 if newlines < n {
208220 // Read more data
209- let additional = ( n - newlines) * 2048 ;
221+ let additional = ( n - newlines) . saturating_mul ( 2048 ) . min ( MAX_RECORDS_BYTES ) ;
210222 let mut more = vec ! [ 0u8 ; additional] ;
211223 let more_read = reader. read ( & mut more) ?;
212224 more. truncate ( more_read) ;
213225 buffer. extend ( more) ;
214226 }
215227 }
216228
229+ if buffer. len ( ) >= MAX_RECORDS_BYTES {
230+ let mut probe = [ 0u8 ; 1 ] ;
231+ if reader. read ( & mut probe) ? > 0 {
232+ eprintln ! (
233+ "warning: Records sample capped at 100 MB; \
234+ sniff result may be approximate for very large inputs"
235+ ) ;
236+ }
237+ }
238+
217239 Ok ( buffer)
218240 }
219241 }
@@ -717,4 +739,33 @@ mod tests {
717739 // Raw: 16 + 12 = 28 bytes for 2 rows = 14 bytes avg
718740 assert_eq ! ( metadata. avg_record_len, 14 ) ;
719741 }
742+
743+ #[ test]
744+ fn test_records_mode_cap_boundary_ok ( ) {
745+ // Verify that a reader with more than MAX_RECORDS_BYTES of valid CSV is handled
746+ // gracefully and returns Ok. Uses a cycling pattern to avoid a large literal in
747+ // test source; the runtime still materializes ~100 MB via collect().
748+ // We supply MAX_RECORDS_BYTES + one extra row so the probe-read finds data and
749+ // the truncation warning fires, but the sniff still succeeds.
750+ let row = b"col1,col2,col3\n 1,2,3\n " ; // 21 bytes, valid CSV pair
751+ let total = MAX_RECORDS_BYTES + row. len ( ) ;
752+ let data: Vec < u8 > = row. iter ( ) . copied ( ) . cycle ( ) . take ( total) . collect ( ) ;
753+ // Confirm the test data actually exceeds the cap so the probe path is exercised.
754+ assert ! (
755+ data. len( ) > MAX_RECORDS_BYTES ,
756+ "test data must exceed MAX_RECORDS_BYTES to exercise probe-read path"
757+ ) ;
758+ let cursor = std:: io:: Cursor :: new ( data) ;
759+ let mut sniffer = Sniffer :: new ( ) ;
760+ // Records(200_000): estimated_size = 200_000 * 1024 > MAX_RECORDS_BYTES (100 MB), clamped to MAX_RECORDS_BYTES.
761+ sniffer. sample_size ( SampleSize :: Records ( 200_000 ) ) ;
762+ let result = sniffer. sniff_reader ( cursor) ;
763+ assert ! (
764+ result. is_ok( ) ,
765+ "sniff should succeed at cap boundary: {result:?}"
766+ ) ;
767+ // Note: the eprintln! truncation warning cannot be captured in a standard Rust
768+ // unit test without process-level stderr redirection. The probe-read path is
769+ // exercised by virtue of data.len() > MAX_RECORDS_BYTES (asserted above).
770+ }
720771}
0 commit comments