Skip to content

Commit b0b6be1

Browse files
committed
Add Utf8BytesReader infrastructure
When the source bytes aren't UTF-8 (or aren't known to be UTF-8), they need to be decoded into UTF-8 first, or at least validated as UTF-8. Wrap `Read`ers in Utf8BytesReader to ensure this happens. The validation part is deferred for now.
1 parent 3848161 commit b0b6be1

File tree

10 files changed

+103
-87
lines changed

10 files changed

+103
-87
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ license = "MIT"
1414
[dependencies]
1515
document-features = { version = "0.2", optional = true }
1616
encoding_rs = { version = "0.8", optional = true }
17+
encoding_rs_io = { version = "0.1", optional = true }
1718
serde = { version = "1.0", optional = true }
1819
tokio = { version = "1.20", optional = true, default-features = false, features = ["io-util"] }
1920
memchr = "2.5"
@@ -102,7 +103,7 @@ async-tokio = ["tokio"]
102103
## [UTF-16LE]: encoding_rs::UTF_16LE
103104
## [ISO-2022-JP]: encoding_rs::ISO_2022_JP
104105
## [#158]: https://github.com/tafia/quick-xml/issues/158
105-
encoding = ["encoding_rs"]
106+
encoding = ["encoding_rs", "encoding_rs_io"]
106107

107108
## Enables support for recognizing all [HTML 5 entities] in [`unescape`] and
108109
## [`unescape_with`] functions. The full list of entities also can be found in

src/de/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ mod var;
110110

111111
pub use crate::errors::serialize::DeError;
112112
use crate::{
113-
encoding::Decoder,
113+
encoding::{Decoder, Utf8BytesReader},
114114
errors::Error,
115115
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
116116
name::QName,
@@ -592,7 +592,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> {
592592
}
593593
}
594594

595-
impl<'de, R> Deserializer<'de, IoReader<R>>
595+
impl<'de, R> Deserializer<'de, IoReader<Utf8BytesReader<R>>>
596596
where
597597
R: BufRead,
598598
{

src/encoding.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
//! A module for wrappers that encode / decode data.
22
33
use std::borrow::Cow;
4+
use std::io;
45

56
#[cfg(feature = "encoding")]
67
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
8+
#[cfg(feature = "encoding")]
9+
use encoding_rs_io::{DecodeReaderBytes, DecodeReaderBytesBuilder};
710

811
#[cfg(feature = "encoding")]
912
use crate::Error;
@@ -21,6 +24,57 @@ pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
2124
#[cfg(feature = "encoding")]
2225
pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
2326

27+
/// A struct for transparently decoding / validating bytes as UTF-8.
28+
#[derive(Debug)]
29+
pub struct Utf8BytesReader<R> {
30+
#[cfg(feature = "encoding")]
31+
reader: io::BufReader<DecodeReaderBytes<R, Vec<u8>>>,
32+
#[cfg(not(feature = "encoding"))]
33+
reader: io::BufReader<R>,
34+
}
35+
36+
impl<R: io::Read> Utf8BytesReader<R> {
37+
/// Build a new reader which decodes a stream of bytes in an unknown encoding into UTF-8.
38+
/// Note: The consumer is responsible for finding the correct character boundaries when
39+
/// treating a given range of bytes as UTF-8.
40+
#[cfg(feature = "encoding")]
41+
pub fn new(reader: R) -> Self {
42+
let decoder = DecodeReaderBytesBuilder::new()
43+
.bom_override(true)
44+
.build(reader);
45+
46+
Self {
47+
reader: io::BufReader::new(decoder),
48+
}
49+
}
50+
51+
/// Build a new reader which will (eventually) validate the bytes as UTF-8.
52+
/// Note: The consumer is responsible for finding the correct character boundaries when
53+
/// treating a given range of bytes as UTF-8.
54+
#[cfg(not(feature = "encoding"))]
55+
pub fn new(reader: R) -> Self {
56+
Self {
57+
reader: io::BufReader::new(reader),
58+
}
59+
}
60+
}
61+
62+
impl<R: io::Read> io::Read for Utf8BytesReader<R> {
63+
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
64+
self.reader.read(buf)
65+
}
66+
}
67+
68+
impl<R: io::Read> io::BufRead for Utf8BytesReader<R> {
69+
fn fill_buf(&mut self) -> io::Result<&[u8]> {
70+
self.reader.fill_buf()
71+
}
72+
73+
fn consume(&mut self, amt: usize) {
74+
self.reader.consume(amt)
75+
}
76+
}
77+
2478
/// Decoder of byte slices into strings.
2579
///
2680
/// If feature `encoding` is enabled, this encoding taken from the `"encoding"`

src/reader/buffered_reader.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
//! underlying byte stream.
33
44
use std::fs::File;
5-
use std::io::{self, BufRead, BufReader};
5+
use std::io::{self, BufRead};
66
use std::path::Path;
77

88
use memchr;
99

10+
use crate::encoding::Utf8BytesReader;
1011
use crate::errors::{Error, Result};
1112
use crate::events::Event;
1213
use crate::name::QName;
@@ -34,6 +35,7 @@ macro_rules! impl_buffered_source {
3435

3536
#[cfg(feature = "encoding")]
3637
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
38+
// TODO: broken because decoder sends UTF-8
3739
loop {
3840
break match self $(.$reader)? .fill_buf() $(.$await)? {
3941
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
@@ -395,15 +397,13 @@ impl<R: BufRead> Reader<R> {
395397
}
396398
}
397399

398-
impl Reader<BufReader<File>> {
400+
impl Reader<Utf8BytesReader<File>> {
399401
/// Creates an XML reader from a file path.
400402
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
401403
let file = File::open(path).map_err(Error::Io)?;
402-
let reader = BufReader::new(file);
403-
Ok(Self::from_reader(reader))
404+
Ok(Self::from_reader(file))
404405
}
405406
}
406-
407407
#[cfg(test)]
408408
mod test {
409409
use crate::reader::test::check;

src/reader/mod.rs

Lines changed: 13 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
//! Contains high-level interface for a pull-based XML parser.
22
3+
use std::io::Read;
4+
use std::ops::Range;
5+
36
#[cfg(feature = "encoding")]
47
use encoding_rs::Encoding;
5-
use std::ops::Range;
68

7-
use crate::encoding::Decoder;
9+
use crate::encoding::{Decoder, Utf8BytesReader};
810
use crate::errors::{Error, Result};
911
use crate::events::Event;
1012
use crate::reader::parser::Parser;
@@ -359,7 +361,7 @@ enum ParseState {
359361
/// BomDetected -- "encoding=..." --> XmlDetected
360362
/// ```
361363
#[cfg(feature = "encoding")]
362-
#[derive(Clone, Copy)]
364+
#[derive(Clone, Copy, Debug)]
363365
enum EncodingRef {
364366
/// Encoding was implicitly assumed to have a specified value. It can be refined
365367
/// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
@@ -459,73 +461,22 @@ pub struct Reader<R> {
459461
}
460462

461463
/// Builder methods
462-
impl<R> Reader<R> {
464+
impl<R: Read> Reader<Utf8BytesReader<R>> {
463465
/// Creates a `Reader` that reads from a given reader.
464466
pub fn from_reader(reader: R) -> Self {
465467
Self {
466-
reader,
468+
reader: Utf8BytesReader::new(reader),
467469
parser: Parser::default(),
468470
}
469471
}
470-
471-
configure_methods!();
472472
}
473473

474-
/// Getters
474+
/// Public implementation-independent functionality
475475
impl<R> Reader<R> {
476-
/// Consumes `Reader` returning the underlying reader
477-
///
478-
/// Can be used to compute line and column of a parsing error position
479-
///
480-
/// # Examples
481-
///
482-
/// ```
483-
/// # use pretty_assertions::assert_eq;
484-
/// use std::{str, io::Cursor};
485-
/// use quick_xml::events::Event;
486-
/// use quick_xml::reader::Reader;
487-
///
488-
/// let xml = r#"<tag1 att1 = "test">
489-
/// <tag2><!--Test comment-->Test</tag2>
490-
/// <tag3>Test 2</tag3>
491-
/// </tag1>"#;
492-
/// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
493-
/// let mut buf = Vec::new();
494-
///
495-
/// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
496-
/// let end_pos = reader.buffer_position();
497-
/// let mut cursor = reader.into_inner();
498-
/// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
499-
/// .expect("can't make a string");
500-
/// let mut line = 1;
501-
/// let mut column = 0;
502-
/// for c in s.chars() {
503-
/// if c == '\n' {
504-
/// line += 1;
505-
/// column = 0;
506-
/// } else {
507-
/// column += 1;
508-
/// }
509-
/// }
510-
/// (line, column)
511-
/// }
512-
///
513-
/// loop {
514-
/// match reader.read_event_into(&mut buf) {
515-
/// Ok(Event::Start(ref e)) => match e.name().as_ref() {
516-
/// b"tag1" | b"tag2" => (),
517-
/// tag => {
518-
/// assert_eq!(b"tag3", tag);
519-
/// assert_eq!((3, 22), into_line_and_column(reader));
520-
/// break;
521-
/// }
522-
/// },
523-
/// Ok(Event::Eof) => unreachable!(),
524-
/// _ => (),
525-
/// }
526-
/// buf.clear();
527-
/// }
528-
/// ```
476+
// Configuration setters
477+
configure_methods!();
478+
479+
/// Consumes `Reader` returning the underlying reader.
529480
pub fn into_inner(self) -> R {
530481
self.reader
531482
}
@@ -1612,7 +1563,7 @@ mod test {
16121563
/// character should be stripped for consistency
16131564
#[$test]
16141565
$($async)? fn bom_from_reader() {
1615-
let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
1566+
let mut reader = Reader::from_str("\u{feff}\u{feff}");
16161567

16171568
assert_eq!(
16181569
reader.$read_event($buf) $(.$await)? .unwrap(),

src/reader/ns_reader.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@
66
77
use std::borrow::Cow;
88
use std::fs::File;
9-
use std::io::{BufRead, BufReader};
9+
use std::io::{BufRead, Read};
1010
use std::ops::Deref;
1111
use std::path::Path;
1212

13+
use crate::encoding::Utf8BytesReader;
1314
use crate::errors::Result;
1415
use crate::events::Event;
1516
use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult};
1617
use crate::reader::{Reader, Span, XmlSource};
17-
1818
/// A low level encoding-agnostic XML event reader that performs namespace resolution.
1919
///
2020
/// Consumes a [`BufRead`] and streams XML `Event`s.
@@ -33,14 +33,12 @@ pub struct NsReader<R> {
3333
}
3434

3535
/// Builder methods
36-
impl<R> NsReader<R> {
36+
impl<R: Read> NsReader<Utf8BytesReader<R>> {
3737
/// Creates a `NsReader` that reads from a reader.
3838
#[inline]
3939
pub fn from_reader(reader: R) -> Self {
4040
Self::new(Reader::from_reader(reader))
4141
}
42-
43-
configure_methods!(reader);
4442
}
4543

4644
/// Private methods
@@ -118,8 +116,11 @@ impl<R> NsReader<R> {
118116
}
119117
}
120118

121-
/// Getters
119+
/// Public implementation-independent functionality
122120
impl<R> NsReader<R> {
121+
// Configuration setters
122+
configure_methods!(reader);
123+
123124
/// Consumes `NsReader` returning the underlying reader
124125
///
125126
/// See the [`Reader::into_inner`] for examples
@@ -528,7 +529,7 @@ impl<R: BufRead> NsReader<R> {
528529
}
529530
}
530531

531-
impl NsReader<BufReader<File>> {
532+
impl NsReader<Utf8BytesReader<File>> {
532533
/// Creates an XML reader from a file path.
533534
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
534535
Ok(Self::new(Reader::from_file(path)?))

src/reader/parser.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,6 @@ impl Parser {
171171
if len > 2 && buf[len - 1] == b'?' {
172172
if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
173173
let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
174-
175174
// Try getting encoding from the declaration event
176175
#[cfg(feature = "encoding")]
177176
if self.encoding.can_be_refined() {

src/reader/slice_reader.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,32 @@ use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, Xml
1616

1717
use memchr;
1818

19-
/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
20-
/// This implementation supports not using an intermediate buffer as the byte slice
21-
/// itself can be used to borrow from.
19+
use super::parser::Parser;
20+
21+
/// This is an implementation of [`Reader`] for reading from a `&[u8]` as
22+
/// underlying byte stream. This implementation supports not using an
23+
/// intermediate buffer as the byte slice itself can be used to borrow from.
2224
impl<'a> Reader<&'a [u8]> {
2325
/// Creates an XML reader from a string slice.
2426
pub fn from_str(s: &'a str) -> Self {
2527
// Rust strings are guaranteed to be UTF-8, so lock the encoding
2628
#[cfg(feature = "encoding")]
2729
{
28-
let mut reader = Self::from_reader(s.as_bytes());
29-
reader.parser.encoding = EncodingRef::Explicit(UTF_8);
30-
reader
30+
let mut parser = Parser::default();
31+
parser.encoding = EncodingRef::Explicit(UTF_8);
32+
Self {
33+
reader: s.as_bytes(),
34+
parser: parser,
35+
}
3136
}
3237

3338
#[cfg(not(feature = "encoding"))]
34-
Self::from_reader(s.as_bytes())
39+
{
40+
Self {
41+
reader: s.as_bytes(),
42+
parser: Parser::default(),
43+
}
44+
}
3545
}
3646

3747
/// Read an event that borrows from the input rather than a buffer.

tests/test.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ fn test_issue94() {
100100
let mut reader = Reader::from_reader(&data[..]);
101101
reader.trim_text(true);
102102
loop {
103-
match reader.read_event() {
103+
match reader.read_event_into(&mut Vec::new()) {
104104
Ok(Eof) | Err(..) => break,
105105
_ => (),
106106
}

tests/xmlrs_reader_tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ fn test_bytes(input: &[u8], output: &[u8], trim: bool) {
392392

393393
let mut decoder = reader.decoder();
394394
loop {
395-
let line = match reader.read_resolved_event() {
395+
let line = match reader.read_resolved_event_into(&mut Vec::new()) {
396396
Ok((_, Event::Decl(e))) => {
397397
// Declaration could change decoder
398398
decoder = reader.decoder();

0 commit comments

Comments
 (0)