Skip to content

Commit 8fa6f1e

Browse files
authored
Merge pull request #399 from Mingun/bom
Rework BOM handling and encoding API
2 parents 3b37c0e + d49a4b8 commit 8fa6f1e

File tree

15 files changed

+1002
-945
lines changed

15 files changed

+1002
-945
lines changed

Changelog.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
- [#393]: New module `name` with `QName`, `LocalName`, `Namespace`, `Prefix`
1818
and `PrefixDeclaration` wrappers around byte arrays and `ResolveResult` with
1919
the result of namespace resolution
20+
- [#180]: Make `Decoder` struct public. You already had access to it via the
21+
`Reader::decoder()` method, but could not name it in the code. Now the preferred
22+
way to access decoding functionality is via this struct
23+
- [#191]: New event variant `StartText` emitted for bytes before the XML declaration
24+
or the start of a comment or a tag. For streams with a BOM, this event will contain the BOM
2025

2126
### Bug Fixes
2227

@@ -56,6 +61,33 @@
5661
- [#393]: Now `BytesStart::name()` and `BytesEnd::name()` return `QName`, and
5762
`BytesStart::local_name()` and `BytesEnd::local_name()` return `LocalName`
5863

64+
- [#191]: Remove unused `reader.decoder().decode_owned()`. If you ever used it,
65+
use `String::from_utf8` instead (which that function did)
66+
- [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless.
67+
Use the same-named methods without that suffix instead. Attribute values cannot contain BOM
68+
- [#191]: Remove `Reader::decode()` and `Reader::decode_without_bom()`, they are replaced by
69+
`Decoder::decode()` and `Decoder::decode_with_bom_removal()`.
70+
Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now.
71+
`Reader::encoding()` is replaced by `Decoder::encoding()` as well
72+
- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and
73+
`BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked
74+
as expected, this was only due to good luck. They were replaced by
75+
`BytesStartText::decode_with_bom_removal()`:
76+
- conceptually, you should decode BOM only for the first `Text` event from the
77+
reader (since now `StartText` event is emitted instead for this)
78+
- text before the first tag is not XML content at all, so it is meaningless
79+
to try to unescape something in it
80+
81+
- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
82+
disabled. Signatures of functions are now the same regardless of whether or not the feature is
83+
enabled, and an error will be returned instead of performing replacements for invalid characters
84+
in both cases.
85+
86+
Previously, if the `encoding` feature was enabled, decoding functions would return `Result<Cow<&str>>`
87+
while without this feature they would return `Result<&str>`. With this change, only `Result<Cow<&str>>`
88+
is returned regardless of the status of the feature.
89+
- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`
90+
5991
### New Tests
6092

6193
- [#9]: Added tests for incorrect nested tags in input
@@ -66,6 +98,8 @@
6698

6799
[#8]: https://github.com/Mingun/fast-xml/pull/8
68100
[#9]: https://github.com/Mingun/fast-xml/pull/9
101+
[#180]: https://github.com/tafia/quick-xml/issues/180
102+
[#191]: https://github.com/tafia/quick-xml/issues/191
69103
[#363]: https://github.com/tafia/quick-xml/issues/363
70104
[#387]: https://github.com/tafia/quick-xml/pull/387
71105
[#391]: https://github.com/tafia/quick-xml/pull/391

benches/bench.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use criterion::{self, criterion_group, criterion_main, Criterion};
22
use pretty_assertions::assert_eq;
33
use quick_xml::events::Event;
4+
use quick_xml::name::QName;
45
use quick_xml::Reader;
56

67
static SAMPLE: &[u8] = include_bytes!("../tests/sample_rss.xml");
@@ -173,15 +174,15 @@ fn bytes_text_unescaped(c: &mut Criterion) {
173174
/// Benchmarks how fast an individual event is parsed
174175
fn one_event(c: &mut Criterion) {
175176
let mut group = c.benchmark_group("One event");
176-
group.bench_function("Text", |b| {
177+
group.bench_function("StartText", |b| {
177178
let src = "Hello world!".repeat(512 / 12).into_bytes();
178179
let mut buf = Vec::with_capacity(1024);
179180
b.iter(|| {
180181
let mut r = Reader::from_reader(src.as_ref());
181182
let mut nbtxt = criterion::black_box(0);
182183
r.check_end_names(false).check_comments(false);
183184
match r.read_event(&mut buf) {
184-
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
185+
Ok(Event::StartText(e)) => nbtxt += e.unescaped().unwrap().len(),
185186
something_else => panic!("Did not expect {:?}", something_else),
186187
};
187188

@@ -310,7 +311,7 @@ fn attributes(c: &mut Criterion) {
310311
let mut buf = Vec::new();
311312
loop {
312313
match r.read_event(&mut buf) {
313-
Ok(Event::Empty(e)) if e.name() == b"player" => {
314+
Ok(Event::Empty(e)) if e.name() == QName(b"player") => {
314315
for name in ["num", "status", "avg"] {
315316
if let Some(_attr) = e.try_get_attribute(name).unwrap() {
316317
count += 1

src/de/escape.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,8 @@ macro_rules! deserialize_num {
4545
where
4646
V: Visitor<'de>,
4747
{
48-
#[cfg(not(feature = "encoding"))]
4948
let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;
5049

51-
#[cfg(feature = "encoding")]
52-
let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;
53-
5450
visitor.$visit(value)
5551
}
5652
};
@@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
7167
V: Visitor<'de>,
7268
{
7369
let unescaped = self.unescaped()?;
74-
#[cfg(not(feature = "encoding"))]
7570
let value = self.decoder.decode(&unescaped)?;
7671

77-
#[cfg(feature = "encoding")]
78-
let value = self.decoder.decode(&unescaped);
7972
visitor.visit_str(&value)
8073
}
8174

src/de/mod.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ where
337337
{
338338
#[cfg(feature = "encoding")]
339339
{
340-
let value = decoder.decode(value);
340+
let value = decoder.decode(value)?;
341341
// No need to unescape because valid boolean representations cannot be escaped
342342
match value.as_ref() {
343343
"true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {
@@ -624,7 +624,7 @@ where
624624
allow_start: bool,
625625
) -> Result<BytesCData<'de>, DeError> {
626626
match self.next()? {
627-
DeEvent::Text(e) if unescape => e.unescape().map_err(|e| DeError::InvalidXml(e.into())),
627+
DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into),
628628
DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
629629
DeEvent::CData(e) => Ok(e),
630630
DeEvent::Start(e) if allow_start => {
@@ -952,6 +952,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
952952
let event = loop {
953953
let e = self.reader.read_event(&mut self.buf)?;
954954
match e {
955+
//TODO: Probably not the best idea to treat StartText as usual text
956+
// Usually this event will represent a BOM
957+
// Changing this requires review of the serde-de::top_level::one_element test
958+
Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())),
955959
Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())),
956960
Event::End(e) => break Ok(DeEvent::End(e.into_owned())),
957961
Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())),
@@ -992,6 +996,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
992996
loop {
993997
let e = self.reader.read_event_unbuffered()?;
994998
match e {
999+
//TODO: Probably not the best idea to treat StartText as usual text
1000+
// Usually this event will represent a BOM
1001+
// Changing this requires review of the serde-de::top_level::one_element test
1002+
Event::StartText(e) => break Ok(DeEvent::Text(e.into())),
9951003
Event::Start(e) => break Ok(DeEvent::Start(e)),
9961004
Event::End(e) => break Ok(DeEvent::End(e)),
9971005
Event::Text(e) => break Ok(DeEvent::Text(e)),

src/de/seq.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
22
use crate::events::BytesStart;
33
use crate::reader::Decoder;
44
use serde::de::{DeserializeSeed, SeqAccess};
5-
#[cfg(not(feature = "encoding"))]
6-
use std::borrow::Cow;
75

86
/// Check if tag `start` is included in the `fields` list. `decoder` is used to
97
/// get a string representation of a tag.
@@ -14,11 +12,7 @@ pub fn not_in(
1412
start: &BytesStart,
1513
decoder: Decoder,
1614
) -> Result<bool, DeError> {
17-
#[cfg(not(feature = "encoding"))]
18-
let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);
19-
20-
#[cfg(feature = "encoding")]
21-
let tag = decoder.decode(start.name().into_inner());
15+
let tag = decoder.decode(start.name().into_inner())?;
2216

2317
Ok(fields.iter().all(|&field| field != tag.as_ref()))
2418
}

src/errors.rs

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,16 @@ use crate::escape::EscapeError;
44
use crate::events::attributes::AttrError;
55
use crate::utils::write_byte_string;
66
use std::str::Utf8Error;
7+
use std::string::FromUtf8Error;
78

89
/// The error type used by this crate.
910
#[derive(Debug)]
1011
pub enum Error {
1112
/// IO error
1213
Io(::std::io::Error),
13-
/// Utf8 error
14-
Utf8(Utf8Error),
14+
/// Input decoding error. If `encoding` feature is disabled, contains `None`,
15+
/// otherwise contains the UTF-8 decoding error
16+
NonDecodable(Option<Utf8Error>),
1517
/// Unexpected End of File
1618
UnexpectedEof(String),
1719
/// End event mismatch
@@ -46,10 +48,18 @@ impl From<::std::io::Error> for Error {
4648
}
4749

4850
impl From<Utf8Error> for Error {
49-
/// Creates a new `Error::Utf8` from the given error
51+
/// Creates a new `Error::NonDecodable` from the given error
5052
#[inline]
5153
fn from(error: Utf8Error) -> Error {
52-
Error::Utf8(error)
54+
Error::NonDecodable(Some(error))
55+
}
56+
}
57+
58+
impl From<FromUtf8Error> for Error {
59+
/// Creates a new `Error::NonDecodable` from the given error
60+
#[inline]
61+
fn from(error: FromUtf8Error) -> Error {
62+
error.utf8_error().into()
5363
}
5464
}
5565

@@ -77,7 +87,8 @@ impl std::fmt::Display for Error {
7787
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
7888
match self {
7989
Error::Io(e) => write!(f, "I/O error: {}", e),
80-
Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
90+
Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
91+
Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
8192
Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
8293
Error::EndEventMismatch { expected, found } => {
8394
write!(f, "Expecting </{}> found </{}>", expected, found)
@@ -109,7 +120,7 @@ impl std::error::Error for Error {
109120
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
110121
match self {
111122
Error::Io(e) => Some(e),
112-
Error::Utf8(e) => Some(e),
123+
Error::NonDecodable(Some(e)) => Some(e),
113124
Error::InvalidAttr(e) => Some(e),
114125
Error::EscapeError(e) => Some(e),
115126
_ => None,
@@ -227,6 +238,7 @@ pub mod serialize {
227238
}
228239

229240
impl From<Error> for DeError {
241+
#[inline]
230242
fn from(e: Error) -> Self {
231243
Self::InvalidXml(e)
232244
}
@@ -239,15 +251,17 @@ pub mod serialize {
239251
}
240252
}
241253

242-
impl From<ParseIntError> for DeError {
243-
fn from(e: ParseIntError) -> Self {
244-
Self::InvalidInt(e)
254+
impl From<Utf8Error> for DeError {
255+
#[inline]
256+
fn from(e: Utf8Error) -> Self {
257+
Self::InvalidXml(e.into())
245258
}
246259
}
247260

248-
impl From<ParseFloatError> for DeError {
249-
fn from(e: ParseFloatError) -> Self {
250-
Self::InvalidFloat(e)
261+
impl From<FromUtf8Error> for DeError {
262+
#[inline]
263+
fn from(e: FromUtf8Error) -> Self {
264+
Self::InvalidXml(e.into())
251265
}
252266
}
253267

@@ -257,4 +271,18 @@ pub mod serialize {
257271
Self::InvalidXml(e.into())
258272
}
259273
}
274+
275+
impl From<ParseIntError> for DeError {
276+
#[inline]
277+
fn from(e: ParseIntError) -> Self {
278+
Self::InvalidInt(e)
279+
}
280+
}
281+
282+
impl From<ParseFloatError> for DeError {
283+
#[inline]
284+
fn from(e: ParseFloatError) -> Self {
285+
Self::InvalidFloat(e)
286+
}
287+
}
260288
}

0 commit comments

Comments
 (0)