Properly normalize EOL characters in BytesText::decode, BytesCData::decode and BytesRef::decode methods

Mingun · Mingun · commit 5ab2d4ac03b2 · 2025-07-26T01:53:33.000+05:00
diff --git a/Changelog.md b/Changelog.md
@@ -25,8 +25,12 @@
 
 ### Bug Fixes
 
+- [#806]: Properly normalize EOL characters in `BytesText::decode`, `BytesCData::decode`
+  and `BytesRef::decode` methods.
+
 ### Misc Changes
 
+[#806]: https://github.com/tafia/quick-xml/issues/806
 [#878]: https://github.com/tafia/quick-xml/pull/878
 [#882]: https://github.com/tafia/quick-xml/pull/882
 
diff --git a/src/encoding.rs b/src/encoding.rs
@@ -6,6 +6,8 @@ use std::str::Utf8Error;
 #[cfg(feature = "encoding")]
 use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
 
+use crate::escape::normalize_eols;
+
 /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
 /// See <https://unicode.org/faq/utf_bom.html#bom1>
 pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
@@ -145,9 +147,20 @@ impl Decoder {
         bytes: &Cow<'b, [u8]>,
     ) -> Result<Cow<'b, str>, EncodingError> {
         match bytes {
-            Cow::Borrowed(bytes) => self.decode(bytes),
+            Cow::Borrowed(bytes) => {
+                let text = self.decode(bytes)?;
+                match normalize_eols(&text) {
+                    // If the text is still borrowed after normalization, it was not changed
+                    Cow::Borrowed(_) => Ok(text),
+                    Cow::Owned(s) => Ok(Cow::Owned(s)),
+                }
+            }
             // Convert to owned, because otherwise Cow will be bound with wrong lifetime
-            Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
+            Cow::Owned(bytes) => {
+                let text = self.decode(bytes)?;
+                let text = normalize_eols(&text);
+                Ok(text.into_owned().into())
+            }
         }
     }
 }
diff --git a/src/events/mod.rs b/src/events/mod.rs
@@ -577,8 +577,24 @@ impl<'a> BytesText<'a> {
 
     /// Decodes the content of the event.
     ///
-    /// This will allocate if the value contains any escape sequences or in
-    /// non-UTF-8 encoding.
+    /// This will allocate if the value contains any escape sequences, is in a
+    /// non-UTF-8 encoding, or if [EOL normalization] is required.
+    ///
+    /// Note, although you may use this library to parse HTML, you cannot use this
+    /// method to get HTML content, because it returns a normalized value: the following
+    /// sequences are translated into a single `\n` (U+000a) character:
+    ///
+    /// - `\r\n`
+    /// - `\r\x85`
+    /// - `\r`
+    /// - `\x85`
+    /// - `\u{2028}`
+    ///
+    /// The text in HTML normally is not normalized in any way; normalization is
+    /// performed only in limited contexts and [only for] `\r\n` and `\r`.
+    ///
+    /// [EOL normalization]: https://www.w3.org/TR/xml11/#sec-line-ends
+    /// [only for]: https://html.spec.whatwg.org/#normalize-newlines
     pub fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
         self.decoder.decode_cow(&self.content)
     }
@@ -827,6 +843,25 @@ impl<'a> BytesCData<'a> {
     /// When this event produced by the XML reader, it uses the encoding information
     /// associated with that reader to interpret the raw bytes contained within this
     /// CDATA event.
+    ///
+    /// This will allocate if the value is in a non-UTF-8 encoding, or if
+    /// [EOL normalization] is required.
+    ///
+    /// Note, although you may use this library to parse HTML, you cannot use this
+    /// method to get HTML content, because it returns a normalized value: the following
+    /// sequences are translated into a single `\n` (U+000a) character:
+    ///
+    /// - `\r\n`
+    /// - `\r\x85`
+    /// - `\r`
+    /// - `\x85`
+    /// - `\u{2028}`
+    ///
+    /// The text in HTML normally is not normalized in any way; normalization is
+    /// performed only in limited contexts and [only for] `\r\n` and `\r`.
+    ///
+    /// [EOL normalization]: https://www.w3.org/TR/xml11/#sec-line-ends
+    /// [only for]: https://html.spec.whatwg.org/#normalize-newlines
     pub fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
         Ok(self.decoder.decode_cow(&self.content)?)
     }
@@ -1437,8 +1472,24 @@ impl<'a> BytesRef<'a> {
 
     /// Decodes the content of the event.
     ///
-    /// This will allocate if the value contains any escape sequences or in
-    /// non-UTF-8 encoding.
+    /// This will allocate if the value is in a non-UTF-8 encoding, or if
+    /// [EOL normalization] is required.
+    ///
+    /// Note, although you may use this library to parse HTML, you cannot use this
+    /// method to get HTML content, because it returns a normalized value: the following
+    /// sequences are translated into a single `\n` (U+000a) character:
+    ///
+    /// - `\r\n`
+    /// - `\r\x85`
+    /// - `\r`
+    /// - `\x85`
+    /// - `\u{2028}`
+    ///
+    /// The text in HTML normally is not normalized in any way; normalization is
+    /// performed only in limited contexts and [only for] `\r\n` and `\r`.
+    ///
+    /// [EOL normalization]: https://www.w3.org/TR/xml11/#sec-line-ends
+    /// [only for]: https://html.spec.whatwg.org/#normalize-newlines
     pub fn decode(&self) -> Result<Cow<'a, str>, EncodingError> {
         self.decoder.decode_cow(&self.content)
     }
diff --git a/tests/serde-se.rs b/tests/serde-se.rs
@@ -1897,9 +1897,11 @@ mod with_root {
             <root>3</root>");
     serialize_as!(tuple:
         // Use to_string() to get owned type that is required for deserialization
-        ("<\"&'>".to_string(), "with\t\r\n spaces", 3usize)
+        // NOTE: do not use \r, because it is normalized to \n during deserialization
+        // but is written as is during serialization
+        ("<\"&'>".to_string(), "with\t\n spaces", 3usize)
         => "<root>&lt;\"&amp;'&gt;</root>\
-            <root>with\t\r\n spaces</root>\
+            <root>with\t\n spaces</root>\
             <root>3</root>");
     serialize_as!(tuple_struct:
         Tuple(42.0, "answer")