Skip to content

Commit 65aae52

Browse files
committed
Utf8ValidatingReader detects encodings and strips BOMs automatically
In cases where the input is sufficiently short and doesn't contain invalid sequences, Utf8ValidatingReader was unable to detect that the input was not UTF-8. We now call detect_encoding() during the first read() so that it can more effectively raise the appropriate errors. Doing this (and BOM stripping) upstream of the parser makes it possible to eliminate this responsibility from the parser, once it can be relied upon on all code paths.
1 parent bf93198 commit 65aae52

3 files changed

Lines changed: 282 additions & 26 deletions

File tree

src/encoding.rs

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ pub enum Utf8ValidationError {
3131
},
3232
/// Incomplete UTF-8 sequence at end of stream
3333
IncompleteSequence,
34+
/// Non-UTF-8 encoding detected at start of stream
35+
NonUtf8EncodingDetected(DetectedEncoding),
3436
}
3537

3638
impl From<Utf8Error> for Utf8ValidationError {
@@ -50,6 +52,13 @@ impl std::fmt::Display for Utf8ValidationError {
5052
Self::IncompleteSequence => {
5153
write!(f, "incomplete UTF-8 sequence at end of stream")
5254
}
55+
Self::NonUtf8EncodingDetected(detected) => {
56+
write!(
57+
f,
58+
"non-UTF-8 encoding detected at start of stream: {:?}",
59+
detected
60+
)
61+
}
5362
}
5463
}
5564
}
@@ -323,6 +332,7 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
323332
/// Possible scenarios for start-of-xml detection of encoding
324333
///
325334
/// See the documentation of [`detect_encoding`]
335+
#[derive(Clone, Debug, PartialEq, Eq)]
326336
pub enum DetectedEncoding {
327337
/// Matches UTF-8 or some other ascii-compatible encoding
328338
AsciiCompatible,
@@ -417,6 +427,10 @@ impl<R: io::Read> io::BufRead for Utf8BytesReader<R> {
417427
/// that only valid UTF-8 bytes are written to the output buffer. Incomplete UTF-8
418428
/// sequences at read boundaries are buffered and combined with subsequent reads.
419429
///
430+
/// Additionally, this reader checks the very beginning of the stream for encoding
431+
/// signatures (BOMs or XML declaration patterns) and rejects streams that appear to
432+
/// be encoded in UTF-16 or other non-UTF-8 encodings.
433+
///
420434
/// # Examples
421435
///
422436
/// ```
@@ -434,6 +448,8 @@ pub struct Utf8ValidatingReader<R> {
434448
inner: R,
435449
/// Buffer to hold incomplete UTF-8 sequences from previous reads (max 3 bytes)
436450
buffer: Vec<u8>,
451+
/// Whether we've checked for encoding at the start of the stream
452+
encoding_checked: bool,
437453
}
438454

439455
impl<R> Utf8ValidatingReader<R> {
@@ -442,6 +458,7 @@ impl<R> Utf8ValidatingReader<R> {
442458
Self {
443459
inner,
444460
buffer: Vec::with_capacity(4),
461+
encoding_checked: false,
445462
}
446463
}
447464

@@ -467,6 +484,49 @@ impl<R: Read> Read for Utf8ValidatingReader<R> {
467484
return Ok(0);
468485
}
469486

487+
// Check for encoding at the start of the stream
488+
if !self.encoding_checked {
489+
self.encoding_checked = true;
490+
491+
// Read initial data to detect encoding
492+
// Read enough for encoding detection (4 bytes) plus fill up to caller's buffer size
493+
let read_size = buf.len().max(64); // Read at least 64 bytes for efficiency
494+
let mut temp = vec![0u8; read_size];
495+
let n = self.inner.read(&mut temp)?;
496+
497+
if n > 0 {
498+
self.buffer.extend_from_slice(&temp[..n]);
499+
500+
// Try to detect encoding if we have at least 4 bytes
501+
if self.buffer.len() >= 4 {
502+
if let Some(detected) = detect_encoding(&self.buffer) {
503+
match detected {
504+
DetectedEncoding::Utf8Bom | DetectedEncoding::AsciiCompatible => {
505+
// Strip BOM if present
506+
let bom_len = detected.bom_len();
507+
if bom_len > 0 {
508+
self.buffer.drain(..bom_len);
509+
}
510+
}
511+
DetectedEncoding::Utf16Le
512+
| DetectedEncoding::Utf16LeBom
513+
| DetectedEncoding::Utf16Be
514+
| DetectedEncoding::Utf16BeBom => {
515+
// Reject UTF-16 encodings
516+
return Err(io::Error::new(
517+
io::ErrorKind::InvalidData,
518+
EncodingError::Utf8(
519+
Utf8ValidationError::NonUtf8EncodingDetected(detected),
520+
),
521+
));
522+
}
523+
}
524+
}
525+
}
526+
}
527+
// If we read 0 bytes or less than 4 bytes, assume UTF-8 and continue
528+
}
529+
470530
loop {
471531
// If we have buffered data, check if it's complete UTF-8
472532
if !self.buffer.is_empty() {
@@ -1214,4 +1274,88 @@ mod utf8_validating_reader_tests {
12141274
}
12151275
}
12161276
}
1277+
1278+
mod encoding_detection {
1279+
use super::*;
1280+
1281+
#[test]
1282+
fn utf8_bom_stripped() {
1283+
// UTF-8 BOM (0xEF 0xBB 0xBF) followed by "Hello"
1284+
let data = b"\xEF\xBB\xBFHello";
1285+
let mut reader = Utf8ValidatingReader::new(&data[..]);
1286+
let mut buf = [0u8; 20];
1287+
let n = reader.read(&mut buf).unwrap();
1288+
1289+
// BOM should be stripped, only "Hello" should be returned
1290+
assert_eq!(&buf[..n], b"Hello");
1291+
assert_eq!(std::str::from_utf8(&buf[..n]).unwrap(), "Hello");
1292+
}
1293+
1294+
#[test]
1295+
fn utf16le_bom_rejected() {
1296+
// UTF-16 LE BOM (0xFF 0xFE)
1297+
let data = b"\xFF\xFE<?xml";
1298+
let mut reader = Utf8ValidatingReader::new(&data[..]);
1299+
let mut buf = [0u8; 20];
1300+
1301+
let result = reader.read(&mut buf);
1302+
assert!(result.is_err());
1303+
let err = result.unwrap_err();
1304+
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
1305+
1306+
// Verify the error can be downcast to EncodingError
1307+
let encoding_err = err
1308+
.get_ref()
1309+
.unwrap()
1310+
.downcast_ref::<EncodingError>()
1311+
.expect("Error should downcast to EncodingError");
1312+
1313+
// Verify it's the NonUtf8EncodingDetected error with the correct encoding
1314+
match encoding_err {
1315+
EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(detected)) => {
1316+
assert_eq!(*detected, DetectedEncoding::Utf16LeBom);
1317+
}
1318+
other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other),
1319+
}
1320+
}
1321+
1322+
#[test]
1323+
fn utf16be_bom_rejected() {
1324+
// UTF-16 BE BOM (0xFE 0xFF)
1325+
let data = b"\xFE\xFF\x00<\x00?";
1326+
let mut reader = Utf8ValidatingReader::new(&data[..]);
1327+
let mut buf = [0u8; 20];
1328+
1329+
let result = reader.read(&mut buf);
1330+
assert!(result.is_err());
1331+
let err = result.unwrap_err();
1332+
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
1333+
1334+
// Verify the error can be downcast to EncodingError
1335+
let encoding_err = err
1336+
.get_ref()
1337+
.unwrap()
1338+
.downcast_ref::<EncodingError>()
1339+
.expect("Error should downcast to EncodingError");
1340+
1341+
// Verify it's the NonUtf8EncodingDetected error with the correct encoding
1342+
match encoding_err {
1343+
EncodingError::Utf8(Utf8ValidationError::NonUtf8EncodingDetected(detected)) => {
1344+
assert_eq!(*detected, DetectedEncoding::Utf16BeBom);
1345+
}
1346+
other => panic!("Expected NonUtf8EncodingDetected error, got: {:?}", other),
1347+
}
1348+
}
1349+
1350+
#[test]
1351+
fn ascii_compatible_encoding_allowed() {
1352+
// ASCII-compatible XML declaration (no BOM)
1353+
let data = b"<?xml version=\"1.0\"?><root/>";
1354+
let mut reader = Utf8ValidatingReader::new(&data[..]);
1355+
let mut buf = [0u8; 50];
1356+
1357+
let n = reader.read(&mut buf).unwrap();
1358+
assert_eq!(&buf[..n], data);
1359+
}
1360+
}
12171361
}

src/reader/state.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,8 @@ impl ReaderState {
280280
if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
281281
let event = BytesDecl::from_start(BytesStart::wrap(content, 3, self.decoder()));
282282

283+
// TODO: once we can assume that the parser is operating on UTF-8, then we can throw
284+
// an error here if we see a non-UTF-8 encoding... if encoding/decoding is not enabled.
283285
// Try getting encoding from the declaration event
284286
#[cfg(feature = "encoding")]
285287
if self.encoding.can_be_refined() {

0 commit comments

Comments
 (0)