Skip to content

Commit f6d5e99

Browse files
authored
Merge pull request #403 from Mingun/lock-encoding
Always use UTF-8 for a `Reader` and a `Deserializer` created from a string
2 parents b686d3d + d9f7772 commit f6d5e99

File tree

4 files changed

+163
-39
lines changed

4 files changed

+163
-39
lines changed

Changelog.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
- [#191]: New event variant `StartText` emitted for bytes before the XML declaration
2424
or a start comment or a tag. For streams with BOM this event will contain a BOM
2525
- [#395]: Add support for XML Schema `xs:list`
26+
- [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
27+
the XML declared encoding and always use UTF-8
2628

2729
### Bug Fixes
2830

@@ -92,6 +94,8 @@
9294
- [#118]: Remove `BytesStart::unescaped*` set of methods because they could return wrong results
9395
Use methods on `Attribute` instead
9496

97+
- [#403]: Remove deprecated `quick_xml::de::from_bytes` and `Deserializer::from_borrowing_reader`
98+
9599
### New Tests
96100

97101
- [#9]: Added tests for incorrect nested tags in input
@@ -104,11 +108,13 @@
104108
[#9]: https://github.com/Mingun/fast-xml/pull/9
105109
[#180]: https://github.com/tafia/quick-xml/issues/180
106110
[#191]: https://github.com/tafia/quick-xml/issues/191
111+
[#324]: https://github.com/tafia/quick-xml/issues/324
107112
[#363]: https://github.com/tafia/quick-xml/issues/363
108113
[#387]: https://github.com/tafia/quick-xml/pull/387
109114
[#391]: https://github.com/tafia/quick-xml/pull/391
110115
[#393]: https://github.com/tafia/quick-xml/pull/393
111116
[#395]: https://github.com/tafia/quick-xml/pull/395
117+
[#403]: https://github.com/tafia/quick-xml/pull/403
112118

113119
## 0.23.0 -- 2022-05-08
114120

src/de/mod.rs

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -297,16 +297,8 @@ pub fn from_str<'de, T>(s: &'de str) -> Result<T, DeError>
297297
where
298298
T: Deserialize<'de>,
299299
{
300-
from_slice(s.as_bytes())
301-
}
302-
303-
/// Deserialize an instance of type `T` from bytes of XML text.
304-
#[deprecated = "Use `from_slice` instead"]
305-
pub fn from_bytes<'de, T>(s: &'de [u8]) -> Result<T, DeError>
306-
where
307-
T: Deserialize<'de>,
308-
{
309-
from_slice(s)
300+
let mut de = Deserializer::from_str(s);
301+
T::deserialize(&mut de)
310302
}
311303

312304
/// Deserialize an instance of type `T` from bytes of XML text.
@@ -400,12 +392,6 @@ where
400392
}
401393
}
402394

403-
/// Get a new deserializer from a regular BufRead
404-
#[deprecated = "Use `Deserializer::new` instead"]
405-
pub fn from_borrowing_reader(reader: R) -> Self {
406-
Self::new(reader)
407-
}
408-
409395
/// Set the maximum number of events that could be skipped during deserialization
410396
/// of sequences.
411397
///
@@ -711,12 +697,17 @@ where
711697
impl<'de> Deserializer<'de, SliceReader<'de>> {
712698
/// Create new deserializer that will borrow data from the specified string
713699
pub fn from_str(s: &'de str) -> Self {
714-
Self::from_slice(s.as_bytes())
700+
Self::from_borrowing_reader(Reader::from_str(s))
715701
}
716702

717703
/// Create new deserializer that will borrow data from the specified byte array
718704
pub fn from_slice(bytes: &'de [u8]) -> Self {
719-
let mut reader = Reader::from_bytes(bytes);
705+
Self::from_borrowing_reader(Reader::from_bytes(bytes))
706+
}
707+
708+
/// Create new deserializer that will borrow data from the specified borrowing reader
709+
#[inline]
710+
fn from_borrowing_reader(mut reader: Reader<&'de [u8]>) -> Self {
720711
reader
721712
.expand_empty_elements(true)
722713
.check_end_names(true)

src/reader.rs

Lines changed: 126 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,55 @@ enum TagState {
5656
Exit,
5757
}
5858

59+
/// A reference to an encoding together with information about how it was retrieved.
60+
///
61+
/// The state transition diagram:
62+
///
63+
/// ```mermaid
64+
/// flowchart LR
65+
/// Implicit -- from_str --> Explicit
66+
/// Implicit -- BOM --> BomDetected
67+
/// Implicit -- "encoding=..." --> XmlDetected
68+
/// BomDetected -- "encoding=..." --> XmlDetected
69+
/// ```
70+
#[cfg(feature = "encoding")]
71+
#[derive(Clone, Copy)]
72+
enum EncodingRef {
73+
/// Encoding was implicitly assumed to have a specified value. It can be refined
74+
/// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
75+
Implicit(&'static Encoding),
76+
/// Encoding was explicitly set to the desired value. It cannot be changed
77+
/// nor by BOM, nor by parsing XML declaration (`<?xml encoding=... ?>`)
78+
Explicit(&'static Encoding),
79+
/// Encoding was detected from a byte order mark (BOM) or by the first bytes
80+
/// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
81+
BomDetected(&'static Encoding),
82+
/// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
83+
/// It can no longer change
84+
XmlDetected(&'static Encoding),
85+
}
86+
#[cfg(feature = "encoding")]
87+
impl EncodingRef {
88+
#[inline]
89+
fn encoding(&self) -> &'static Encoding {
90+
match self {
91+
Self::Implicit(e) => e,
92+
Self::Explicit(e) => e,
93+
Self::BomDetected(e) => e,
94+
Self::XmlDetected(e) => e,
95+
}
96+
}
97+
#[inline]
98+
fn can_be_refined(&self) -> bool {
99+
match self {
100+
Self::Implicit(_) | Self::BomDetected(_) => true,
101+
Self::Explicit(_) | Self::XmlDetected(_) => false,
102+
}
103+
}
104+
}
105+
106+
////////////////////////////////////////////////////////////////////////////////////////////////////
107+
59108
/// A low level encoding-agnostic XML event reader.
60109
///
61110
/// Consumes a `BufRead` and streams XML `Event`s.
@@ -144,11 +193,8 @@ pub struct Reader<R: BufRead> {
144193
pending_pop: bool,
145194

146195
#[cfg(feature = "encoding")]
147-
/// the encoding specified in the xml, defaults to utf8
148-
encoding: &'static Encoding,
149-
#[cfg(feature = "encoding")]
150-
/// check if quick-rs could find out the encoding
151-
is_encoding_set: bool,
196+
/// Reference to the encoding used to read an XML
197+
encoding: EncodingRef,
152198
}
153199

154200
/// Builder methods
@@ -172,9 +218,7 @@ impl<R: BufRead> Reader<R> {
172218
pending_pop: false,
173219

174220
#[cfg(feature = "encoding")]
175-
encoding: ::encoding_rs::UTF_8,
176-
#[cfg(feature = "encoding")]
177-
is_encoding_set: false,
221+
encoding: EncodingRef::Implicit(UTF_8),
178222
}
179223
}
180224

@@ -412,7 +456,7 @@ impl<R: BufRead> Reader<R> {
412456
pub fn decoder(&self) -> Decoder {
413457
Decoder {
414458
#[cfg(feature = "encoding")]
415-
encoding: self.encoding,
459+
encoding: self.encoding.encoding(),
416460
}
417461
}
418462
}
@@ -683,10 +727,9 @@ impl<R: BufRead> Reader<R> {
683727
{
684728
Ok(Some(bytes)) => {
685729
#[cfg(feature = "encoding")]
686-
if first {
730+
if first && self.encoding.can_be_refined() {
687731
if let Some(encoding) = detect_encoding(bytes) {
688-
self.encoding = encoding;
689-
self.is_encoding_set = true;
732+
self.encoding = EncodingRef::BomDetected(encoding);
690733
}
691734
}
692735

@@ -843,9 +886,10 @@ impl<R: BufRead> Reader<R> {
843886

844887
// Try getting encoding from the declaration event
845888
#[cfg(feature = "encoding")]
846-
if let Some(enc) = event.encoder() {
847-
self.encoding = enc;
848-
self.is_encoding_set = true;
889+
if self.encoding.can_be_refined() {
890+
if let Some(encoding) = event.encoder() {
891+
self.encoding = EncodingRef::XmlDetected(encoding);
892+
}
849893
}
850894

851895
Ok(Event::Decl(event))
@@ -905,6 +949,15 @@ impl Reader<BufReader<File>> {
905949
impl<'a> Reader<&'a [u8]> {
906950
/// Creates an XML reader from a string slice.
907951
pub fn from_str(s: &'a str) -> Self {
952+
// Rust strings are guaranteed to be UTF-8, so lock the encoding
953+
#[cfg(feature = "encoding")]
954+
{
955+
let mut reader = Self::from_reader(s.as_bytes());
956+
reader.encoding = EncodingRef::Explicit(UTF_8);
957+
reader
958+
}
959+
960+
#[cfg(not(feature = "encoding"))]
908961
Self::from_reader(s.as_bytes())
909962
}
910963

@@ -1533,8 +1586,6 @@ impl Decoder {
15331586
/// Copied from [`Encoding::decode_with_bom_removal`]
15341587
#[inline]
15351588
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
1536-
use encoding_rs::*;
1537-
15381589
if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
15391590
return &bytes[3..];
15401591
}
@@ -1556,15 +1607,13 @@ impl Decoder {
15561607
pub(crate) fn utf8() -> Self {
15571608
Decoder {
15581609
#[cfg(feature = "encoding")]
1559-
encoding: encoding_rs::UTF_8,
1610+
encoding: UTF_8,
15601611
}
15611612
}
15621613

15631614
#[cfg(feature = "encoding")]
15641615
pub(crate) fn utf16() -> Self {
1565-
Decoder {
1566-
encoding: encoding_rs::UTF_16LE,
1567-
}
1616+
Decoder { encoding: UTF_16LE }
15681617
}
15691618
}
15701619

@@ -2480,6 +2529,62 @@ mod test {
24802529
);
24812530
}
24822531
}
2532+
2533+
#[cfg(feature = "encoding")]
2534+
mod encoding {
2535+
use crate::events::Event;
2536+
use crate::reader::Reader;
2537+
use encoding_rs::{UTF_8, UTF_16LE, WINDOWS_1251};
2538+
use pretty_assertions::assert_eq;
2539+
2540+
mod bytes {
2541+
use super::*;
2542+
use pretty_assertions::assert_eq;
2543+
2544+
/// Checks that encoding is detected by BOM and changed after XML declaration
2545+
#[test]
2546+
fn bom_detected() {
2547+
let mut reader = Reader::from_bytes(b"\xFF\xFE<?xml encoding='windows-1251'?>");
2548+
2549+
assert_eq!(reader.decoder().encoding(), UTF_8);
2550+
reader.read_event_buffered($buf).unwrap();
2551+
assert_eq!(reader.decoder().encoding(), UTF_16LE);
2552+
2553+
reader.read_event_buffered($buf).unwrap();
2554+
assert_eq!(reader.decoder().encoding(), WINDOWS_1251);
2555+
2556+
assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof);
2557+
}
2558+
2559+
/// Checks that encoding is changed by XML declaration, but only once
2560+
#[test]
2561+
fn xml_declaration() {
2562+
let mut reader = Reader::from_bytes(b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>");
2563+
2564+
assert_eq!(reader.decoder().encoding(), UTF_8);
2565+
reader.read_event_buffered($buf).unwrap();
2566+
assert_eq!(reader.decoder().encoding(), UTF_16LE);
2567+
2568+
reader.read_event_buffered($buf).unwrap();
2569+
assert_eq!(reader.decoder().encoding(), UTF_16LE);
2570+
2571+
assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof);
2572+
}
2573+
}
2574+
2575+
/// Checks that XML declaration cannot change the encoding from UTF-8 if
2576+
/// a `Reader` was created using `from_str` method
2577+
#[test]
2578+
fn str_always_has_utf8() {
2579+
let mut reader = Reader::from_str("<?xml encoding='UTF-16'?>");
2580+
2581+
assert_eq!(reader.decoder().encoding(), UTF_8);
2582+
reader.read_event_buffered($buf).unwrap();
2583+
assert_eq!(reader.decoder().encoding(), UTF_8);
2584+
2585+
assert_eq!(reader.read_event_buffered($buf).unwrap(), Event::Eof);
2586+
}
2587+
}
24832588
};
24842589
}
24852590

tests/serde-de.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4765,3 +4765,25 @@ mod xml_schema_lists {
47654765
}
47664766
}
47674767
}
4768+
4769+
/// Test for https://github.com/tafia/quick-xml/issues/324
4770+
#[test]
4771+
fn from_str_should_ignore_encoding() {
4772+
let xml = r#"
4773+
<?xml version="1.0" encoding="windows-1252" ?>
4774+
<A a="€" />
4775+
"#;
4776+
4777+
#[derive(Debug, PartialEq, Deserialize)]
4778+
struct A {
4779+
a: String,
4780+
}
4781+
4782+
let a: A = from_str(xml).unwrap();
4783+
assert_eq!(
4784+
a,
4785+
A {
4786+
a: "€".to_string()
4787+
}
4788+
);
4789+
}

0 commit comments

Comments
 (0)