Skip to content

Commit d49a4b8

Browse files
Mingundralley
andcommitted
Fix #180: Eliminated the differences in the decoding API when feature encoding enabled and when it is disabled
Co-authored-by: Daniel Alley <[email protected]>
1 parent be9ec0c commit d49a4b8

File tree

9 files changed

+65
-84
lines changed

9 files changed

+65
-84
lines changed

Changelog.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,16 @@
7878
- text before the first tag is not an XML content at all, so it is meaningless
7979
to try to unescape something in it
8080

81+
- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
82+
disabled. Signatures of functions are now the same regardless of whether or not the feature is
83+
enabled, and an error will be returned instead of performing replacements for invalid characters
84+
in both cases.
85+
86+
Previously, if the `encoding` feature was enabled, decoding functions would return `Result<Cow<&str>>`
87+
while without this feature they would return `Result<&str>`. With this change, only `Result<Cow<&str>>`
88+
is returned regardless of the status of the feature.
89+
- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable`
90+
8191
### New Tests
8292

8393
- [#9]: Added tests for incorrect nested tags in input

src/de/escape.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,8 @@ macro_rules! deserialize_num {
4545
where
4646
V: Visitor<'de>,
4747
{
48-
#[cfg(not(feature = "encoding"))]
4948
let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?;
5049

51-
#[cfg(feature = "encoding")]
52-
let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?;
53-
5450
visitor.$visit(value)
5551
}
5652
};
@@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> {
7167
V: Visitor<'de>,
7268
{
7369
let unescaped = self.unescaped()?;
74-
#[cfg(not(feature = "encoding"))]
7570
let value = self.decoder.decode(&unescaped)?;
7671

77-
#[cfg(feature = "encoding")]
78-
let value = self.decoder.decode(&unescaped);
7972
visitor.visit_str(&value)
8073
}
8174

src/de/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ where
337337
{
338338
#[cfg(feature = "encoding")]
339339
{
340-
let value = decoder.decode(value);
340+
let value = decoder.decode(value)?;
341341
// No need to unescape because valid boolean representations cannot be escaped
342342
match value.as_ref() {
343343
"true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => {

src/de/seq.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead};
22
use crate::events::BytesStart;
33
use crate::reader::Decoder;
44
use serde::de::{DeserializeSeed, SeqAccess};
5-
#[cfg(not(feature = "encoding"))]
6-
use std::borrow::Cow;
75

86
/// Check if tag `start` is included in the `fields` list. `decoder` is used to
97
/// get a string representation of a tag.
@@ -14,11 +12,7 @@ pub fn not_in(
1412
start: &BytesStart,
1513
decoder: Decoder,
1614
) -> Result<bool, DeError> {
17-
#[cfg(not(feature = "encoding"))]
18-
let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?);
19-
20-
#[cfg(feature = "encoding")]
21-
let tag = decoder.decode(start.name().into_inner());
15+
let tag = decoder.decode(start.name().into_inner())?;
2216

2317
Ok(fields.iter().all(|&field| field != tag.as_ref()))
2418
}

src/errors.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ use std::string::FromUtf8Error;
1111
pub enum Error {
1212
/// IO error
1313
Io(::std::io::Error),
14-
/// Utf8 error
15-
Utf8(Utf8Error),
14+
/// Input decoding error. If `encoding` feature is disabled, contains `None`,
15+
/// otherwise contains the UTF-8 decoding error
16+
NonDecodable(Option<Utf8Error>),
1617
/// Unexpected End of File
1718
UnexpectedEof(String),
1819
/// End event mismatch
@@ -47,10 +48,10 @@ impl From<::std::io::Error> for Error {
4748
}
4849

4950
impl From<Utf8Error> for Error {
50-
/// Creates a new `Error::Utf8` from the given error
51+
/// Creates a new `Error::NonDecodable` from the given error
5152
#[inline]
5253
fn from(error: Utf8Error) -> Error {
53-
Error::Utf8(error)
54+
Error::NonDecodable(Some(error))
5455
}
5556
}
5657

@@ -86,7 +87,8 @@ impl std::fmt::Display for Error {
8687
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
8788
match self {
8889
Error::Io(e) => write!(f, "I/O error: {}", e),
89-
Error::Utf8(e) => write!(f, "UTF8 error: {}", e),
90+
Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"),
91+
Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e),
9092
Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e),
9193
Error::EndEventMismatch { expected, found } => {
9294
write!(f, "Expecting </{}> found </{}>", expected, found)
@@ -118,7 +120,7 @@ impl std::error::Error for Error {
118120
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
119121
match self {
120122
Error::Io(e) => Some(e),
121-
Error::Utf8(e) => Some(e),
123+
Error::NonDecodable(Some(e)) => Some(e),
122124
Error::InvalidAttr(e) => Some(e),
123125
Error::EscapeError(e) => Some(e),
124126
_ => None,

src/events/attributes.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,6 @@ impl<'a> Attribute<'a> {
113113
reader: &Reader<B>,
114114
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
115115
) -> XmlResult<String> {
116-
#[cfg(feature = "encoding")]
117-
let decoded = reader.decoder().decode(&*self.value);
118-
119-
#[cfg(not(feature = "encoding"))]
120116
let decoded = reader.decoder().decode(&*self.value)?;
121117

122118
let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;

src/events/mod.rs

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,6 @@ impl<'a> BytesStartText<'a> {
8585
/// appeared in the BOM or in the text before the first tag.
8686
pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result<String> {
8787
//TODO: Fix lifetime issue - it should be possible to borrow string
88-
#[cfg(feature = "encoding")]
89-
let decoded = decoder.decode_with_bom_removal(&*self);
90-
91-
#[cfg(not(feature = "encoding"))]
9288
let decoded = decoder.decode_with_bom_removal(&*self)?;
9389

9490
Ok(decoded.to_string())
@@ -317,10 +313,6 @@ impl<'a> BytesStart<'a> {
317313
reader: &Reader<B>,
318314
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
319315
) -> Result<String> {
320-
#[cfg(feature = "encoding")]
321-
let decoded = reader.decoder().decode(&*self);
322-
323-
#[cfg(not(feature = "encoding"))]
324316
let decoded = reader.decoder().decode(&*self)?;
325317

326318
let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
@@ -880,10 +872,6 @@ impl<'a> BytesText<'a> {
880872
reader: &Reader<B>,
881873
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
882874
) -> Result<String> {
883-
#[cfg(feature = "encoding")]
884-
let decoded = reader.decoder().decode(&*self);
885-
886-
#[cfg(not(feature = "encoding"))]
887875
let decoded = reader.decoder().decode(&*self)?;
888876

889877
let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
@@ -1000,21 +988,8 @@ impl<'a> BytesCData<'a> {
1000988
#[cfg(feature = "serialize")]
1001989
pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result<Cow<'a, str>> {
1002990
Ok(match &self.content {
1003-
Cow::Borrowed(bytes) => {
1004-
#[cfg(feature = "encoding")]
1005-
{
1006-
decoder.decode(bytes)
1007-
}
1008-
#[cfg(not(feature = "encoding"))]
1009-
{
1010-
decoder.decode(bytes)?.into()
1011-
}
1012-
}
991+
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
1013992
Cow::Owned(bytes) => {
1014-
#[cfg(feature = "encoding")]
1015-
let decoded = decoder.decode(bytes).into_owned();
1016-
1017-
#[cfg(not(feature = "encoding"))]
1018993
let decoded = decoder.decode(bytes)?.to_string();
1019994

1020995
decoded.into()

src/reader.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
//! A module to handle `Reader`
22
3-
#[cfg(feature = "encoding")]
43
use std::borrow::Cow;
54
use std::io::{self, BufRead, BufReader};
65
use std::{fs::File, path::Path, str::from_utf8};
@@ -1472,8 +1471,9 @@ impl Decoder {
14721471
/// Returns an error in case of malformed sequences in the `bytes`.
14731472
///
14741473
/// If you instead want to use XML declared encoding, use the `encoding` feature
1475-
pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> {
1476-
Ok(from_utf8(bytes)?)
1474+
#[inline]
1475+
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
1476+
Ok(Cow::Borrowed(from_utf8(bytes)?))
14771477
}
14781478

14791479
/// Decodes a slice regardless of XML declaration with BOM removal if
@@ -1482,7 +1482,7 @@ impl Decoder {
14821482
/// Returns an error in case of malformed sequences in the `bytes`.
14831483
///
14841484
/// If you instead want to use XML declared encoding, use the `encoding` feature
1485-
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> {
1485+
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
14861486
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
14871487
&bytes[3..]
14881488
} else {
@@ -1504,12 +1504,18 @@ impl Decoder {
15041504
}
15051505

15061506
/// Decodes specified bytes using encoding, declared in the XML, if it was
1507-
/// declared there, or UTF-8 otherwise
1507+
/// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
1508+
/// in the `bytes`.
15081509
///
1509-
/// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the
1510-
/// `U+FFFD REPLACEMENT CHARACTER`.
1511-
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
1512-
self.encoding.decode(bytes).0
1510+
/// Returns an error in case of malformed sequences in the `bytes`.
1511+
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
1512+
match self
1513+
.encoding
1514+
.decode_without_bom_handling_and_without_replacement(bytes)
1515+
{
1516+
None => Err(Error::NonDecodable(None)),
1517+
Some(s) => Ok(s),
1518+
}
15131519
}
15141520

15151521
/// Decodes a slice with BOM removal if it is present in the `bytes` using
@@ -1520,9 +1526,26 @@ impl Decoder {
15201526
///
15211527
/// If XML declaration is absent in the XML, UTF-8 is used.
15221528
///
1523-
/// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`.
1524-
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> {
1525-
self.encoding.decode_with_bom_removal(bytes).0
1529+
/// Returns an error in case of malformed sequences in the `bytes`.
1530+
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
1531+
self.decode(self.remove_bom(bytes))
1532+
}
1533+
/// Copied from [`Encoding::decode_with_bom_removal`]
1534+
#[inline]
1535+
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
1536+
use encoding_rs::*;
1537+
1538+
if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
1539+
return &bytes[3..];
1540+
}
1541+
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
1542+
return &bytes[2..];
1543+
}
1544+
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
1545+
return &bytes[2..];
1546+
}
1547+
1548+
bytes
15261549
}
15271550
}
15281551

tests/xmlrs_reader_tests.rs

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use quick_xml::events::{BytesStart, Event};
22
use quick_xml::name::{QName, ResolveResult};
3-
use quick_xml::{Reader, Result};
4-
use std::borrow::Cow;
3+
use quick_xml::{Decoder, Reader, Result};
54
use std::str::from_utf8;
65

76
#[test]
@@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
381380
loop {
382381
buf.clear();
383382
let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer);
384-
let line = xmlrs_display(event, &reader);
383+
let line = xmlrs_display(event, reader.decoder());
385384
if let Some((n, spec)) = spec_lines.next() {
386385
if spec.trim() == "EndDocument" {
387386
break;
@@ -420,8 +419,8 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) {
420419
}
421420
}
422421

423-
fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String {
424-
let name = decode(name.as_ref(), reader);
422+
fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String {
423+
let name = decoder.decode(name.as_ref()).unwrap();
425424
match n {
426425
// Produces string '{namespace}prefixed_name'
427426
ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name),
@@ -448,44 +447,33 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result<String, String> {
448447
Ok(atts.join(", "))
449448
}
450449

451-
// FIXME: The public API differs based on the "encoding" feature
452-
fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> {
453-
#[cfg(feature = "encoding")]
454-
let decoded = reader.decoder().decode(text);
455-
456-
#[cfg(not(feature = "encoding"))]
457-
let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap());
458-
459-
decoded
460-
}
461-
462-
fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String {
450+
fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String {
463451
match opt_event {
464452
Ok((_, Event::StartText(_))) => "StartText".to_string(),
465453
Ok((n, Event::Start(ref e))) => {
466-
let name = namespace_name(n, e.name(), reader);
454+
let name = namespace_name(n, e.name(), decoder);
467455
match make_attrs(e) {
468456
Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name),
469457
Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs),
470458
Err(e) => format!("StartElement({}, attr-error: {})", &name, &e),
471459
}
472460
}
473461
Ok((n, Event::Empty(ref e))) => {
474-
let name = namespace_name(n, e.name(), reader);
462+
let name = namespace_name(n, e.name(), decoder);
475463
match make_attrs(e) {
476464
Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name),
477465
Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs),
478466
Err(e) => format!("EmptyElement({}, attr-error: {})", &name, &e),
479467
}
480468
}
481469
Ok((n, Event::End(ref e))) => {
482-
let name = namespace_name(n, e.name(), reader);
470+
let name = namespace_name(n, e.name(), decoder);
483471
format!("EndElement({})", name)
484472
}
485473
Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()),
486474
Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()),
487475
Ok((_, Event::Text(ref e))) => match e.unescaped() {
488-
Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()),
476+
Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()),
489477
Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err),
490478
},
491479
Ok((_, Event::Decl(ref e))) => {

0 commit comments

Comments
 (0)