diff --git a/Changelog.md b/Changelog.md index 7b82419f..0a2f4b9d 100644 --- a/Changelog.md +++ b/Changelog.md @@ -18,8 +18,16 @@ ### Bug Fixes +- [#895]: Fix incorrect normalization of `\rX` EOL sequences where `X` is a char which is + UTF-8 encoded as [c2 xx], except [c2 85]. + ### Misc Changes +- [#895]: Add new `xml10_content()` and `xml11_content()` methods which behaves the same as + `html_content()` and `xml_content()` methods, but express intention more clearly. + +[#895]: https://github.com/tafia/quick-xml/pull/895 + ## 0.38.2 -- 2025-08-19 diff --git a/src/escape.rs b/src/escape.rs index 87b95497..b7816d9d 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -305,7 +305,7 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// // TODO: It would be better to reuse buffer after decoding if possible -pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> { +pub(crate) fn normalize_xml11_eols<'input>(text: &'input str) -> Cow<'input, str> { let bytes = text.as_bytes(); // The following sequences of UTF-8 encoded input should be translated into @@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> // we are sure that index within string normalized.push_str(&text[0..i]); - let mut pos = normalize_xml_eol_step(&mut normalized, text, i, '\n'); + let mut pos = normalize_xml11_eol_step(&mut normalized, text, i, '\n'); while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) { let index = pos + i; // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because // we are sure that index within string normalized.push_str(&text[pos..index]); - pos = normalize_xml_eol_step(&mut normalized, text, index, '\n'); + pos = normalize_xml11_eol_step(&mut normalized, text, index, '\n'); } if let Some(rest) = text.get(pos..) 
{ normalized.push_str(rest); @@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> /// /// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends /// [only for]: https://html.spec.whatwg.org/#normalize-newlines -fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize { +fn normalize_xml11_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize { let input = text.as_bytes(); match input[index] { b'\r' => { @@ -388,15 +388,15 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: normalized.push(ch); return index + 2; // skip \r\n } - // Because input is correct UTF-8 and in UTF-8 every character has - // an unique prefix, byte C2 means only start of #x85 character if next == 0xC2 { + // UTF-8 encoding of #x85 character is [c2 85] if index + 2 < input.len() && input[index + 2] == 0x85 { normalized.push(ch); } else { + normalized.push(ch); // NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because // we are sure that index within string - normalized.push_str(&text[index..index + 3]); + normalized.push_str(&text[index + 1..index + 3]); } return index + 3; // skip \r + UTF-8 encoding of character (c2 xx) } @@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: //////////////////////////////////////////////////////////////////////////////////////////////////// // TODO: It would be better to reuse buffer after decoding if possible -pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str> { +pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str> { let bytes = text.as_bytes(); // The following sequences of UTF-8 encoded input should be translated into @@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str> // we are sure that index within string normalized.push_str(&text[0..i]); - 
let mut pos = normalize_html_eol_step(&mut normalized, bytes, i, '\n'); + let mut pos = normalize_xml10_eol_step(&mut normalized, bytes, i, '\n'); while let Some(i) = memchr(b'\r', &bytes[pos..]) { let index = pos + i; // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because // we are sure that index within string normalized.push_str(&text[pos..index]); - pos = normalize_html_eol_step(&mut normalized, bytes, index, '\n'); + pos = normalize_xml10_eol_step(&mut normalized, bytes, index, '\n'); } if let Some(rest) = text.get(pos..) { normalized.push_str(rest); @@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str> /// - `ch`: a character that should be put to the string instead of newline sequence /// /// [only for]: https://html.spec.whatwg.org/#normalize-newlines -fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize { +fn normalize_xml10_eol_step( + normalized: &mut String, + input: &[u8], + index: usize, + ch: char, +) -> usize { match input[index] { b'\r' => { normalized.push(ch); @@ -2062,32 +2067,35 @@ mod normalization { mod eol { use super::*; - mod xml { + mod xml11 { use super::*; use pretty_assertions::assert_eq; #[test] fn empty() { - assert_eq!(normalize_xml_eols(""), ""); + assert_eq!(normalize_xml11_eols(""), ""); } #[test] fn already_normalized() { assert_eq!( - normalize_xml_eols("\nalready \n\n normalized\n"), + normalize_xml11_eols("\nalready \n\n normalized\n"), "\nalready \n\n normalized\n", ); } #[test] fn cr_lf() { - assert_eq!(normalize_xml_eols("\r\nsome\r\n\r\ntext"), "\nsome\n\ntext"); + assert_eq!( + normalize_xml11_eols("\r\nsome\r\n\r\ntext"), + "\nsome\n\ntext" + ); } #[test] fn cr_u0085() { assert_eq!( - normalize_xml_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"), + normalize_xml11_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"), "\nsome\n\ntext", ); } @@ -2095,7 +2103,7 @@ mod normalization { #[test] fn u0085() { assert_eq!( - 
normalize_xml_eols("\u{0085}some\u{0085}\u{0085}text"), + normalize_xml11_eols("\u{0085}some\u{0085}\u{0085}text"), "\nsome\n\ntext", ); } @@ -2103,7 +2111,7 @@ mod normalization { #[test] fn u2028() { assert_eq!( - normalize_xml_eols("\u{2028}some\u{2028}\u{2028}text"), + normalize_xml11_eols("\u{2028}some\u{2028}\u{2028}text"), "\nsome\n\ntext", ); } @@ -2111,7 +2119,7 @@ mod normalization { #[test] fn mixed() { assert_eq!( - normalize_xml_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"), + normalize_xml11_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"), "\n\n\n\n\n\nsome\n\n\ntext", ); } @@ -2138,9 +2146,9 @@ mod normalization { dbg!((input, &description)); if ch == '\u{0085}' { - assert_eq!(normalize_xml_eols(input), "\n", "{}", description); + assert_eq!(normalize_xml11_eols(input), "\n", "{}", description); } else { - assert_eq!(normalize_xml_eols(input), input, "{}", description); + assert_eq!(normalize_xml11_eols(input), input, "{}", description); } } assert_eq!((first..=last).count(), 64); @@ -2171,9 +2179,12 @@ mod normalization { dbg!((input, &description)); if ch == '\u{0085}' { - assert_eq!(normalize_xml_eols(input), "\n", "{}", description); + assert_eq!(normalize_xml11_eols(input), "\n", "{}", description); } else { - assert_eq!(normalize_xml_eols(input), input, "{}", description); + let mut expected = utf8.clone(); + expected[0] = b'\n'; + let expected = std::str::from_utf8(&expected).expect(&description); + assert_eq!(normalize_xml11_eols(input), expected, "{}", description); } } assert_eq!((first..=last).count(), 64); @@ -2204,28 +2215,28 @@ mod normalization { dbg!((input, &description)); if ch == '\u{2028}' { - assert_eq!(normalize_xml_eols(input), "\n", "{}", description); + assert_eq!(normalize_xml11_eols(input), "\n", "{}", description); } else { - assert_eq!(normalize_xml_eols(input), input, "{}", description); + assert_eq!(normalize_xml11_eols(input), input, "{}", description); } } assert_eq!((first..=last).count(), 4096); 
} } - mod html { + mod xml10 { use super::*; use pretty_assertions::assert_eq; #[test] fn empty() { - assert_eq!(normalize_html_eols(""), ""); + assert_eq!(normalize_xml10_eols(""), ""); } #[test] fn already_normalized() { assert_eq!( - normalize_html_eols("\nalready \n\n normalized\n"), + normalize_xml10_eols("\nalready \n\n normalized\n"), "\nalready \n\n normalized\n", ); } @@ -2233,7 +2244,7 @@ mod normalization { #[test] fn cr_lf() { assert_eq!( - normalize_html_eols("\r\nsome\r\n\r\ntext"), + normalize_xml10_eols("\r\nsome\r\n\r\ntext"), "\nsome\n\ntext" ); } @@ -2241,7 +2252,7 @@ mod normalization { #[test] fn cr_u0085() { assert_eq!( - normalize_html_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"), + normalize_xml10_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"), "\n\u{0085}some\n\u{0085}\n\u{0085}text", ); } @@ -2249,7 +2260,7 @@ mod normalization { #[test] fn u0085() { assert_eq!( - normalize_html_eols("\u{0085}some\u{0085}\u{0085}text"), + normalize_xml10_eols("\u{0085}some\u{0085}\u{0085}text"), "\u{0085}some\u{0085}\u{0085}text", ); } @@ -2257,7 +2268,7 @@ mod normalization { #[test] fn u2028() { assert_eq!( - normalize_html_eols("\u{2028}some\u{2028}\u{2028}text"), + normalize_xml10_eols("\u{2028}some\u{2028}\u{2028}text"), "\u{2028}some\u{2028}\u{2028}text", ); } @@ -2265,7 +2276,7 @@ mod normalization { #[test] fn mixed() { assert_eq!( - normalize_html_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"), + normalize_xml10_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"), "\n\n\n\u{2028}\n\nsome\n\u{0085}\n\u{0085}text", ); } diff --git a/src/events/mod.rs b/src/events/mod.rs index ed0a8c60..a0546d91 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -49,8 +49,8 @@ use std::str::from_utf8; use crate::encoding::{Decoder, EncodingError}; use crate::errors::{Error, IllFormedError}; use crate::escape::{ - escape, minimal_escape, normalize_html_eols, normalize_xml_eols, parse_number, partial_escape, - EscapeError, + escape, 
minimal_escape, normalize_xml10_eols, normalize_xml11_eols, parse_number, + partial_escape, EscapeError, }; use crate::name::{LocalName, QName}; use crate::utils::{name_len, trim_xml_end, trim_xml_start, write_cow_string, Bytes}; @@ -591,7 +591,7 @@ impl<'a> BytesText<'a> { self.decoder.decode_cow(&self.content) } - /// Decodes the content of the XML event. + /// Decodes the content of the XML 1.0 or HTML event. /// /// When this event produced by the reader, it uses the encoding information /// associated with that reader to interpret the raw bytes contained within @@ -600,18 +600,19 @@ impl<'a> BytesText<'a> { /// This will allocate if the value contains any escape sequences or in non-UTF-8 /// encoding, or EOL normalization is required. /// - /// Note, that this method should be used only if event represents XML content, - /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// Note that this method should be used only if the event represents XML 1.0 or HTML content, + /// because the rules for normalizing EOLs for [XML 1.0] / [HTML] and [XML 1.1] differ. /// - /// To get HTML content use [`html_content()`](Self::html_content). + /// This method can also be used to get HTML content, because the rules are the same. /// - /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [XML 1.0]: https://www.w3.org/TR/xml/#sec-line-ends + /// [XML 1.1]: https://www.w3.org/TR/xml11/#sec-line-ends /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines - pub fn xml_content(&self) -> Result, EncodingError> { - self.decoder.content(&self.content, normalize_xml_eols) + pub fn xml10_content(&self) -> Result, EncodingError> { + self.decoder.content(&self.content, normalize_xml10_eols) } - /// Decodes the content of the HTML event. + /// Decodes the content of the XML 1.1 event. 
/// /// When this event produced by the reader, it uses the encoding information /// associated with that reader to interpret the raw bytes contained within @@ -620,15 +621,28 @@ impl<'a> BytesText<'a> { /// This will allocate if the value contains any escape sequences or in non-UTF-8 /// encoding, or EOL normalization is required. /// - /// Note, that this method should be used only if event represents HTML content, - /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// Note that this method should be used only if the event represents XML 1.1 content, + /// because the rules for normalizing EOLs for [XML 1.0] / [HTML] and [XML 1.1] differ. /// - /// To get XML content use [`xml_content()`](Self::xml_content). + /// To get XML 1.0 or HTML content use [`xml10_content()`](Self::xml10_content). /// - /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [XML 1.0]: https://www.w3.org/TR/xml/#sec-line-ends + /// [XML 1.1]: https://www.w3.org/TR/xml11/#sec-line-ends /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn xml11_content(&self) -> Result, EncodingError> { + self.decoder.content(&self.content, normalize_xml11_eols) + } + + /// Alias for [`xml11_content()`](Self::xml11_content). + #[inline] + pub fn xml_content(&self) -> Result, EncodingError> { + self.xml11_content() + } + + /// Alias for [`xml10_content()`](Self::xml10_content). + #[inline] pub fn html_content(&self) -> Result, EncodingError> { - self.decoder.content(&self.content, normalize_html_eols) + self.xml10_content() } /// Removes leading XML whitespace bytes from text content. @@ -884,8 +898,8 @@ impl<'a> BytesCData<'a> { self.decoder.decode_cow(&self.content) } - /// Decodes the raw input byte content of the CDATA section of the XML event - /// into a string. + /// Decodes the raw input byte content of the CDATA section of the XML 1.0 or + /// HTML event into a string. 
/// /// When this event produced by the reader, it uses the encoding information /// associated with that reader to interpret the raw bytes contained within @@ -894,18 +908,19 @@ impl<'a> BytesCData<'a> { /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization /// is required. /// - /// Note, that this method should be used only if event represents XML content, - /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// Note that this method should be used only if the event represents XML 1.0 or HTML content, + /// because the rules for normalizing EOLs for [XML 1.0] / [HTML] and [XML 1.1] differ. /// - /// To get HTML content use [`html_content()`](Self::html_content). + /// This method can also be used to get HTML content, because the rules are the same. /// - /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [XML 1.0]: https://www.w3.org/TR/xml/#sec-line-ends + /// [XML 1.1]: https://www.w3.org/TR/xml11/#sec-line-ends /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines - pub fn xml_content(&self) -> Result, EncodingError> { - self.decoder.content(&self.content, normalize_xml_eols) + pub fn xml10_content(&self) -> Result, EncodingError> { + self.decoder.content(&self.content, normalize_xml10_eols) } - /// Decodes the raw input byte content of the CDATA section of the HTML event + /// Decodes the raw input byte content of the CDATA section of the XML 1.1 event /// into a string. /// /// When this event produced by the reader, it uses the encoding information @@ -915,15 +930,28 @@ impl<'a> BytesCData<'a> { /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization /// is required. /// - /// Note, that this method should be used only if event represents HTML content, - /// because rules for normalizing EOLs for [XML] and [HTML] differs. 
+ /// Note that this method should be used only if the event represents XML 1.1 content, + /// because the rules for normalizing EOLs for [XML 1.0] / [HTML] and [XML 1.1] differ. /// - /// To get XML content use [`xml_content()`](Self::xml_content). + /// To get XML 1.0 or HTML content use [`xml10_content()`](Self::xml10_content). /// - /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [XML 1.0]: https://www.w3.org/TR/xml/#sec-line-ends + /// [XML 1.1]: https://www.w3.org/TR/xml11/#sec-line-ends /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn xml11_content(&self) -> Result, EncodingError> { + self.decoder.content(&self.content, normalize_xml11_eols) + } + + /// Alias for [`xml11_content()`](Self::xml11_content). + #[inline] + pub fn xml_content(&self) -> Result, EncodingError> { + self.xml11_content() + } + + /// Alias for [`xml10_content()`](Self::xml10_content). + #[inline] pub fn html_content(&self) -> Result, EncodingError> { - self.decoder.content(&self.content, normalize_html_eols) + self.xml10_content() } } @@ -1543,7 +1571,7 @@ impl<'a> BytesRef<'a> { self.decoder.decode_cow(&self.content) } - /// Decodes the content of the XML event. + /// Decodes the content of the XML 1.0 or HTML event. /// /// When this event produced by the reader, it uses the encoding information /// associated with that reader to interpret the raw bytes contained within @@ -1552,18 +1580,19 @@ impl<'a> BytesRef<'a> { /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization /// is required. /// - /// Note, that this method should be used only if event represents XML content, - /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// Note that this method should be used only if the event represents XML 1.0 or HTML content, + /// because the rules for normalizing EOLs for [XML 1.0] / [HTML] and [XML 1.1] differ. /// - /// To get HTML content use [`html_content()`](Self::html_content). 
+ /// This method can also be used to get HTML content, because the rules are the same. /// - /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [XML 1.0]: https://www.w3.org/TR/xml/#sec-line-ends + /// [XML 1.1]: https://www.w3.org/TR/xml11/#sec-line-ends /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines - pub fn xml_content(&self) -> Result, EncodingError> { - self.decoder.content(&self.content, normalize_xml_eols) + pub fn xml10_content(&self) -> Result, EncodingError> { + self.decoder.content(&self.content, normalize_xml10_eols) } - /// Decodes the content of the HTML event. + /// Decodes the content of the XML 1.1 event. /// /// When this event produced by the reader, it uses the encoding information /// associated with that reader to interpret the raw bytes contained within @@ -1572,15 +1601,28 @@ impl<'a> BytesRef<'a> { /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization /// is required. /// - /// Note, that this method should be used only if event represents HTML content, - /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// Note that this method should be used only if the event represents XML 1.1 content, + /// because the rules for normalizing EOLs for [XML 1.0] / [HTML] and [XML 1.1] differ. /// - /// To get XML content use [`xml_content()`](Self::xml_content). + /// To get XML 1.0 or HTML content use [`xml10_content()`](Self::xml10_content). /// - /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [XML 1.0]: https://www.w3.org/TR/xml/#sec-line-ends + /// [XML 1.1]: https://www.w3.org/TR/xml11/#sec-line-ends /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn xml11_content(&self) -> Result, EncodingError> { + self.decoder.content(&self.content, normalize_xml11_eols) + } + + /// Alias for [`xml11_content()`](Self::xml11_content). + #[inline] + pub fn xml_content(&self) -> Result, EncodingError> { + self.xml11_content() + } + + /// Alias for [`xml10_content()`](Self::xml10_content). 
+ #[inline] pub fn html_content(&self) -> Result, EncodingError> { - self.decoder.content(&self.content, normalize_html_eols) + self.xml10_content() } /// Returns `true` if the specified reference represents the character reference