@@ -31,6 +31,8 @@ pub enum Utf8ValidationError {
3131 } ,
3232 /// Incomplete UTF-8 sequence at end of stream
3333 IncompleteSequence ,
34+ /// Non-UTF-8 encoding detected at start of stream
35+ NonUtf8EncodingDetected ( DetectedEncoding ) ,
3436}
3537
3638impl From < Utf8Error > for Utf8ValidationError {
@@ -50,6 +52,13 @@ impl std::fmt::Display for Utf8ValidationError {
5052 Self :: IncompleteSequence => {
5153 write ! ( f, "incomplete UTF-8 sequence at end of stream" )
5254 }
55+ Self :: NonUtf8EncodingDetected ( detected) => {
56+ write ! (
57+ f,
58+ "non-UTF-8 encoding detected at start of stream: {:?}" ,
59+ detected
60+ )
61+ }
5362 }
5463 }
5564}
@@ -323,6 +332,7 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
323332/// Possible scenarios for start-of-xml detection of encoding
324333///
325334/// See the documentation of [`detect_encoding`]
335+ #[ derive( Clone , Debug , PartialEq , Eq ) ]
326336pub enum DetectedEncoding {
327337 /// Matches UTF-8 or some other ascii-compatible encoding
328338 AsciiCompatible ,
@@ -417,6 +427,10 @@ impl<R: io::Read> io::BufRead for Utf8BytesReader<R> {
417427/// that only valid UTF-8 bytes are written to the output buffer. Incomplete UTF-8
418428/// sequences at read boundaries are buffered and combined with subsequent reads.
419429///
430+ /// Additionally, this reader checks the very beginning of the stream for encoding
431+ /// signatures (BOMs or XML declaration patterns) and rejects streams that appear to
432+ /// be encoded in UTF-16 or other non-UTF-8 encodings.
433+ ///
420434/// # Examples
421435///
422436/// ```
@@ -434,6 +448,8 @@ pub struct Utf8ValidatingReader<R> {
434448 inner : R ,
435449 /// Buffer to hold incomplete UTF-8 sequences from previous reads (max 3 bytes)
436450 buffer : Vec < u8 > ,
451+ /// Whether we've checked for encoding at the start of the stream
452+ encoding_checked : bool ,
437453}
438454
439455impl < R > Utf8ValidatingReader < R > {
@@ -442,6 +458,7 @@ impl<R> Utf8ValidatingReader<R> {
442458 Self {
443459 inner,
444460 buffer : Vec :: with_capacity ( 4 ) ,
461+ encoding_checked : false ,
445462 }
446463 }
447464
@@ -467,6 +484,49 @@ impl<R: Read> Read for Utf8ValidatingReader<R> {
467484 return Ok ( 0 ) ;
468485 }
469486
487+ // Check for encoding at the start of the stream
488+ if !self . encoding_checked {
489+ self . encoding_checked = true ;
490+
491+ // Read initial data to detect encoding
492+ // Read enough for encoding detection (4 bytes) plus fill up to caller's buffer size
493+ let read_size = buf. len ( ) . max ( 64 ) ; // Read at least 64 bytes for efficiency
494+ let mut temp = vec ! [ 0u8 ; read_size] ;
495+ let n = self . inner . read ( & mut temp) ?;
496+
497+ if n > 0 {
498+ self . buffer . extend_from_slice ( & temp[ ..n] ) ;
499+
500+ // Try to detect encoding if we have at least 4 bytes
501+ if self . buffer . len ( ) >= 4 {
502+ if let Some ( detected) = detect_encoding ( & self . buffer ) {
503+ match detected {
504+ DetectedEncoding :: Utf8Bom | DetectedEncoding :: AsciiCompatible => {
505+ // Strip BOM if present
506+ let bom_len = detected. bom_len ( ) ;
507+ if bom_len > 0 {
508+ self . buffer . drain ( ..bom_len) ;
509+ }
510+ }
511+ DetectedEncoding :: Utf16Le
512+ | DetectedEncoding :: Utf16LeBom
513+ | DetectedEncoding :: Utf16Be
514+ | DetectedEncoding :: Utf16BeBom => {
515+ // Reject UTF-16 encodings
516+ return Err ( io:: Error :: new (
517+ io:: ErrorKind :: InvalidData ,
518+ EncodingError :: Utf8 (
519+ Utf8ValidationError :: NonUtf8EncodingDetected ( detected) ,
520+ ) ,
521+ ) ) ;
522+ }
523+ }
524+ }
525+ }
526+ }
527+ // If we read 0 bytes or less than 4 bytes, assume UTF-8 and continue
528+ }
529+
470530 loop {
471531 // If we have buffered data, check if it's complete UTF-8
472532 if !self . buffer . is_empty ( ) {
@@ -1214,4 +1274,88 @@ mod utf8_validating_reader_tests {
12141274 }
12151275 }
12161276 }
1277+
1278+ mod encoding_detection {
1279+ use super :: * ;
1280+
1281+ #[ test]
1282+ fn utf8_bom_stripped ( ) {
1283+ // UTF-8 BOM (0xEF 0xBB 0xBF) followed by "Hello"
1284+ let data = b"\xEF \xBB \xBF Hello" ;
1285+ let mut reader = Utf8ValidatingReader :: new ( & data[ ..] ) ;
1286+ let mut buf = [ 0u8 ; 20 ] ;
1287+ let n = reader. read ( & mut buf) . unwrap ( ) ;
1288+
1289+ // BOM should be stripped, only "Hello" should be returned
1290+ assert_eq ! ( & buf[ ..n] , b"Hello" ) ;
1291+ assert_eq ! ( std:: str :: from_utf8( & buf[ ..n] ) . unwrap( ) , "Hello" ) ;
1292+ }
1293+
1294+ #[ test]
1295+ fn utf16le_bom_rejected ( ) {
1296+ // UTF-16 LE BOM (0xFF 0xFE)
1297+ let data = b"\xFF \xFE <?xml" ;
1298+ let mut reader = Utf8ValidatingReader :: new ( & data[ ..] ) ;
1299+ let mut buf = [ 0u8 ; 20 ] ;
1300+
1301+ let result = reader. read ( & mut buf) ;
1302+ assert ! ( result. is_err( ) ) ;
1303+ let err = result. unwrap_err ( ) ;
1304+ assert_eq ! ( err. kind( ) , io:: ErrorKind :: InvalidData ) ;
1305+
1306+ // Verify the error can be downcast to EncodingError
1307+ let encoding_err = err
1308+ . get_ref ( )
1309+ . unwrap ( )
1310+ . downcast_ref :: < EncodingError > ( )
1311+ . expect ( "Error should downcast to EncodingError" ) ;
1312+
1313+ // Verify it's the NonUtf8EncodingDetected error with the correct encoding
1314+ match encoding_err {
1315+ EncodingError :: Utf8 ( Utf8ValidationError :: NonUtf8EncodingDetected ( detected) ) => {
1316+ assert_eq ! ( * detected, DetectedEncoding :: Utf16LeBom ) ;
1317+ }
1318+ other => panic ! ( "Expected NonUtf8EncodingDetected error, got: {:?}" , other) ,
1319+ }
1320+ }
1321+
1322+ #[ test]
1323+ fn utf16be_bom_rejected ( ) {
1324+ // UTF-16 BE BOM (0xFE 0xFF)
1325+ let data = b"\xFE \xFF \x00 <\x00 ?" ;
1326+ let mut reader = Utf8ValidatingReader :: new ( & data[ ..] ) ;
1327+ let mut buf = [ 0u8 ; 20 ] ;
1328+
1329+ let result = reader. read ( & mut buf) ;
1330+ assert ! ( result. is_err( ) ) ;
1331+ let err = result. unwrap_err ( ) ;
1332+ assert_eq ! ( err. kind( ) , io:: ErrorKind :: InvalidData ) ;
1333+
1334+ // Verify the error can be downcast to EncodingError
1335+ let encoding_err = err
1336+ . get_ref ( )
1337+ . unwrap ( )
1338+ . downcast_ref :: < EncodingError > ( )
1339+ . expect ( "Error should downcast to EncodingError" ) ;
1340+
1341+ // Verify it's the NonUtf8EncodingDetected error with the correct encoding
1342+ match encoding_err {
1343+ EncodingError :: Utf8 ( Utf8ValidationError :: NonUtf8EncodingDetected ( detected) ) => {
1344+ assert_eq ! ( * detected, DetectedEncoding :: Utf16BeBom ) ;
1345+ }
1346+ other => panic ! ( "Expected NonUtf8EncodingDetected error, got: {:?}" , other) ,
1347+ }
1348+ }
1349+
1350+ #[ test]
1351+ fn ascii_compatible_encoding_allowed ( ) {
1352+ // ASCII-compatible XML declaration (no BOM)
1353+ let data = b"<?xml version=\" 1.0\" ?><root/>" ;
1354+ let mut reader = Utf8ValidatingReader :: new ( & data[ ..] ) ;
1355+ let mut buf = [ 0u8 ; 50 ] ;
1356+
1357+ let n = reader. read ( & mut buf) . unwrap ( ) ;
1358+ assert_eq ! ( & buf[ ..n] , data) ;
1359+ }
1360+ }
12171361}
0 commit comments