Skip to content

Commit 5ab2d4a

Browse files
committed
Properly normalize EOL characters in BytesText::decode, BytesCData::decode and BytesRef::decode methods
1 parent 2f864b1 commit 5ab2d4a

File tree

4 files changed

+78
-8
lines changed

4 files changed

+78
-8
lines changed

Changelog.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,12 @@
2525

2626
### Bug Fixes
2727

28+
- [#806]: Properly normalize EOL characters in `BytesText::decode`, `BytesCData::decode`
29+
and `BytesRef::decode` methods.
30+
2831
### Misc Changes
2932

33+
[#806]: https://github.com/tafia/quick-xml/issues/806
3034
[#878]: https://github.com/tafia/quick-xml/pull/878
3135
[#882]: https://github.com/tafia/quick-xml/pull/882
3236

src/encoding.rs

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ use std::str::Utf8Error;
66
#[cfg(feature = "encoding")]
77
use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
88

9+
use crate::escape::normalize_eols;
10+
911
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
1012
/// See <https://unicode.org/faq/utf_bom.html#bom1>
1113
pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
@@ -145,9 +147,20 @@ impl Decoder {
145147
bytes: &Cow<'b, [u8]>,
146148
) -> Result<Cow<'b, str>, EncodingError> {
147149
match bytes {
148-
Cow::Borrowed(bytes) => self.decode(bytes),
150+
Cow::Borrowed(bytes) => {
151+
let text = self.decode(bytes)?;
152+
match normalize_eols(&text) {
153+
// If the text is still borrowed after normalization, it was not changed
154+
Cow::Borrowed(_) => Ok(text),
155+
Cow::Owned(s) => Ok(Cow::Owned(s)),
156+
}
157+
}
149158
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
150-
Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
159+
Cow::Owned(bytes) => {
160+
let text = self.decode(bytes)?;
161+
let text = normalize_eols(&text);
162+
Ok(text.into_owned().into())
163+
}
151164
}
152165
}
153166
}

src/events/mod.rs

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -577,8 +577,24 @@ impl<'a> BytesText<'a> {
577577

578578
/// Decodes the content of the event.
579579
///
580-
/// This will allocate if the value contains any escape sequences or in
581-
/// non-UTF-8 encoding.
580+
/// This will allocate if the value contains any escape sequences, is in a non-UTF-8
581+
/// encoding, or [EOL normalization] is required.
582+
///
583+
/// Note, although you may use this library to parse HTML, you cannot use this
584+
/// method to get HTML content, because it returns a normalized value: the following
585+
/// sequences are translated into a single `\n` (U+000a) character:
586+
///
587+
/// - `\r\n`
588+
/// - `\r\x85`
589+
/// - `\r`
590+
/// - `\x85`
591+
/// - `\u{2028}`
592+
///
593+
/// The text in HTML normally is not normalized in any way; normalization is
594+
/// performed only in limited contexts and [only for] `\r\n` and `\r`.
595+
///
596+
/// [EOL normalization]: https://www.w3.org/TR/xml11/#sec-line-ends
597+
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
582598
pub fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
583599
self.decoder.decode_cow(&self.content)
584600
}
@@ -827,6 +843,25 @@ impl<'a> BytesCData<'a> {
827843
/// When this event produced by the XML reader, it uses the encoding information
828844
/// associated with that reader to interpret the raw bytes contained within this
829845
/// CDATA event.
846+
///
847+
/// This will allocate if the value is in a non-UTF-8 encoding, or [EOL normalization]
848+
/// is required.
849+
///
850+
/// Note, although you may use this library to parse HTML, you cannot use this
851+
/// method to get HTML content, because it returns a normalized value: the following
852+
/// sequences are translated into a single `\n` (U+000a) character:
853+
///
854+
/// - `\r\n`
855+
/// - `\r\x85`
856+
/// - `\r`
857+
/// - `\x85`
858+
/// - `\u{2028}`
859+
///
860+
/// The text in HTML normally is not normalized in any way; normalization is
861+
/// performed only in limited contexts and [only for] `\r\n` and `\r`.
862+
///
863+
/// [EOL normalization]: https://www.w3.org/TR/xml11/#sec-line-ends
864+
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
830865
pub fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
831866
Ok(self.decoder.decode_cow(&self.content)?)
832867
}
@@ -1437,8 +1472,24 @@ impl<'a> BytesRef<'a> {
14371472

14381473
/// Decodes the content of the event.
14391474
///
1440-
/// This will allocate if the value contains any escape sequences or in
1441-
/// non-UTF-8 encoding.
1475+
/// This will allocate if the value is in a non-UTF-8 encoding, or [EOL normalization]
1476+
/// is required.
1477+
///
1478+
/// Note, although you may use this library to parse HTML, you cannot use this
1479+
/// method to get HTML content, because it returns a normalized value: the following
1480+
/// sequences are translated into a single `\n` (U+000a) character:
1481+
///
1482+
/// - `\r\n`
1483+
/// - `\r\x85`
1484+
/// - `\r`
1485+
/// - `\x85`
1486+
/// - `\u{2028}`
1487+
///
1488+
/// The text in HTML normally is not normalized in any way; normalization is
1489+
/// performed only in limited contexts and [only for] `\r\n` and `\r`.
1490+
///
1491+
/// [EOL normalization]: https://www.w3.org/TR/xml11/#sec-line-ends
1492+
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
14421493
pub fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
14431494
self.decoder.decode_cow(&self.content)
14441495
}

tests/serde-se.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1897,9 +1897,11 @@ mod with_root {
18971897
<root>3</root>");
18981898
serialize_as!(tuple:
18991899
// Use to_string() to get owned type that is required for deserialization
1900-
("<\"&'>".to_string(), "with\t\r\n spaces", 3usize)
1900+
// NOTE: do not use \r, because it is normalized to \n during deserialization
1901+
// but is written as is during serialization
1902+
("<\"&'>".to_string(), "with\t\n spaces", 3usize)
19011903
=> "<root>&lt;\"&amp;'&gt;</root>\
1902-
<root>with\t\r\n spaces</root>\
1904+
<root>with\t\n spaces</root>\
19031905
<root>3</root>");
19041906
serialize_as!(tuple_struct:
19051907
Tuple(42.0, "answer")

0 commit comments

Comments
 (0)