Utf8ValidatingReader detects encodings and strips BOMs automatically

dralley · dralley · commit 65aae52dea11 · 2026-02-26T01:26:54.000-05:00
In cases where the input is sufficiently short and doesn't contain
invalid sequences, Utf8ValidatingReader was unable to detect that the
input was not UTF-8.

We now call detect_encoding() during the first read() so that it can
more effectively raise the appropriate errors. Doing this (and BOM
stripping) upstream of the parser makes it possible to eliminate this
responsibility from the parser, once it can be relied upon on all code
paths.
diff --git a/src/encoding.rs b/src/encoding.rs
@@ -31,6 +31,8 @@ pub enum Utf8ValidationError {
     },
     /// Incomplete UTF-8 sequence at end of stream
     IncompleteSequence,
+    /// Non-UTF-8 encoding detected at start of stream
+    NonUtf8EncodingDetected(DetectedEncoding),
 }
 
 impl From<Utf8Error> for Utf8ValidationError {
@@ -50,6 +52,13 @@ impl std::fmt::Display for Utf8ValidationError {
             Self::IncompleteSequence => {
                 write!(f, "incomplete UTF-8 sequence at end of stream")
             }
+            Self::NonUtf8EncodingDetected(detected) => {
+                write!(
+                    f,
+                    "non-UTF-8 encoding detected at start of stream: {:?}",
+                    detected
+                )
+            }
         }
     }
 }
@@ -323,6 +332,7 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
 /// Possible scenarios for start-of-xml detection of encoding
 ///
 /// See the documentation of [`detect_encoding`]
+#[derive(Clone, Debug, PartialEq, Eq)]
 pub enum DetectedEncoding {
     /// Matches UTF-8 or some other ascii-compatible encoding
     AsciiCompatible,
@@ -417,6 +427,10 @@ impl<R: io::Read> io::BufRead for Utf8BytesReader<R> {
 /// that only valid UTF-8 bytes are written to the output buffer. Incomplete UTF-8
 /// sequences at read boundaries are buffered and combined with subsequent reads.
 ///
+/// Additionally, this reader checks the very beginning of the stream for encoding
+/// signatures (BOMs or XML declaration patterns) and rejects streams that appear to
+/// be encoded in UTF-16 or other non-UTF-8 encodings.
+///
 /// # Examples
 ///
 /// ```
@@ -434,6 +448,8 @@ pub struct Utf8ValidatingReader<R> {
     inner: R,
     /// Buffer to hold incomplete UTF-8 sequences from previous reads (max 3 bytes)
     buffer: Vec<u8>,
+    /// Whether we've checked for encoding at the start of the stream
+    encoding_checked: bool,
 }
 
 impl<R> Utf8ValidatingReader<R> {
@@ -442,6 +458,7 @@ impl<R> Utf8ValidatingReader<R> {
         Self {
             inner,
             buffer: Vec::with_capacity(4),
+            encoding_checked: false,
         }
     }
 
@@ -467,6 +484,49 @@ impl<R: Read> Read for Utf8ValidatingReader<R> {
             return Ok(0);
         }
 
+        // Check for encoding at the start of the stream
+        if !self.encoding_checked {
+            self.encoding_checked = true;
+
+            // Read initial data to detect encoding
+            // Read enough for encoding detection (4 bytes) plus fill up to caller's buffer size
+            let read_size = buf.len().max(64); // Read at least 64 bytes for efficiency
+            let mut temp = vec![0u8; read_size];
+            let n = self.inner.read(&mut temp)?;
+
+            if n > 0 {
+                self.buffer.extend_from_slice(&temp[..n]);
+
+                // Try to detect encoding if we have at least 4 bytes
+                if self.buffer.len() >= 4 {
+                    if let Some(detected) = detect_encoding(&self.buffer) {
+                        match detected {
+                            DetectedEncoding::Utf8Bom | DetectedEncoding::AsciiCompatible => {
+                                // Strip BOM if present
+                                let bom_len = detected.bom_len();
+                                if bom_len > 0 {
+                                    self.buffer.drain(..bom_len);
+                                }
+                            }
+                            DetectedEncoding::Utf16Le
+                            | DetectedEncoding::Utf16LeBom
+                            | DetectedEncoding::Utf16Be
+                            | DetectedEncoding::Utf16BeBom => {
+                                // Reject UTF-16 encodings
+                                return Err(io::Error::new(
+                                    io::ErrorKind::InvalidData,
+                                    EncodingError::Utf8(
+                                        Utf8ValidationError::NonUtf8EncodingDetected(detected),
+                                    ),
+                                ));
+                            }
+                        }
+                    }
+                }
+            }
+            // If we read 0 bytes or less than 4 bytes, assume UTF-8 and continue
+        }
+
         loop {
             // If we have buffered data, check if it's complete UTF-8
             if !self.buffer.is_empty() {
@@ -1214,4 +1274,88 @@ mod utf8_validating_reader_tests {
             }
         }
     }
+
+    mod encoding_detection {
+        use super::*;
+
+        #[test]
+        fn utf8_bom_stripped() {
+            // UTF-8 BOM (0xEF 0xBB 0xBF) followed by "Hello"
+            let data = b"\xEF\xBB\xBFHello";
+            let mut reader = Utf8ValidatingReader::new(&data[..]);
+            let mut buf = [0u8; 20];
+            let n = reader.read(&mut buf).unwrap();
+
+            // BOM should be stripped, only "Hello" should be returned
+            assert_eq!(&buf[..n], b"Hello");
+            assert_eq!(std::str::from_utf8(&buf[..n]).unwrap(), "Hello");
+        }
+
+        #[test]
+        fn utf16le_bom_rejected() {
+            // UTF-16 LE BOM (0xFF 0xFE)
+            let data = b"\xFF\xFE<?xml";
+            let mut reader = Utf8ValidatingReader::new(&data[..]);
+            let mut buf = [0u8; 20];
+
+            let result = reader.read(&mut buf);
+            assert!(result.is_err());
+            let err = result.unwrap_err();
+            assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+            // Verify the error can be downcast to EncodingError
+            let encoding_err = err
+                .get_ref()
+                .unwrap()
+                .downcast_ref::<EncodingError>()
+                .expect("Error should downcast to EncodingError");
+
+            // Verify it's the NonUtf8EncodingDetected error with the correct encoding
+            match encoding_err {
+                EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(detected)) => {
+                    assert_eq!(*detected, DetectedEncoding::Utf16LeBom);
+                }
+                other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other),
+            }
+        }
+
+        #[test]
+        fn utf16be_bom_rejected() {
+            // UTF-16 BE BOM (0xFE 0xFF)
+            let data = b"\xFE\xFF\x00<\x00?";
+            let mut reader = Utf8ValidatingReader::new(&data[..]);
+            let mut buf = [0u8; 20];
+
+            let result = reader.read(&mut buf);
+            assert!(result.is_err());
+            let err = result.unwrap_err();
+            assert_eq!(err.kind(), io::ErrorKind::InvalidData);
+
+            // Verify the error can be downcast to EncodingError
+            let encoding_err = err
+                .get_ref()
+                .unwrap()
+                .downcast_ref::<EncodingError>()
+                .expect("Error should downcast to EncodingError");
+
+            // Verify it's the NonUtf8EncodingDetected error with the correct encoding
+            match encoding_err {
+                EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(detected)) => {
+                    assert_eq!(*detected, DetectedEncoding::Utf16BeBom);
+                }
+                other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other),
+            }
+        }
+
+        #[test]
+        fn ascii_compatible_encoding_allowed() {
+            // ASCII-compatible XML declaration (no BOM)
+            let data = b"<?xml version=\"1.0\"?><root/>";
+            let mut reader = Utf8ValidatingReader::new(&data[..]);
+            let mut buf = [0u8; 50];
+
+            let n = reader.read(&mut buf).unwrap();
+            assert_eq!(&buf[..n], data);
+        }
+    }
 }
diff --git a/src/reader/state.rs b/src/reader/state.rs
@@ -280,6 +280,8 @@ impl ReaderState {
             if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
                 let event = BytesDecl::from_start(BytesStart::wrap(content, 3, self.decoder()));
 
+                // TODO: once we can assume that the parser is operating on UTF-8, then we can throw
+                // an error here if we see a non-UTF-8 encoding... if encoding/decoding is not enabled.
                 // Try getting encoding from the declaration event
                 #[cfg(feature = "encoding")]
                 if self.encoding.can_be_refined() {
diff --git a/tests/encodings.rs b/tests/encodings.rs