Skip to content

Commit 2f864b1

Browse files
committed
Implement EOL normalization procedure as described in "2.11 End-of-Line Handling" section of XML 1.1 spec
https://www.w3.org/TR/xml11/#sec-line-ends
1 parent dee4b1b commit 2f864b1

File tree

1 file changed

+180
-1
lines changed

1 file changed

+180
-1
lines changed

src/escape.rs

Lines changed: 180 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//! Manage xml character escapes
22
3-
use memchr::memchr2_iter;
3+
use memchr::{memchr2_iter, memchr3};
44
use std::borrow::Cow;
55
use std::num::ParseIntError;
66
use std::ops::Range;
@@ -302,6 +302,123 @@ where
302302
}
303303
}
304304

305+
////////////////////////////////////////////////////////////////////////////////////////////////////
306+
307+
// TODO: It would be better to reuse buffer after decoding if possible
308+
pub(crate) fn normalize_eols<'input>(text: &'input str) -> Cow<'input, str> {
309+
let bytes = text.as_bytes();
310+
311+
// The following sequences of UTF-8 encoded input should be translated into
312+
// a single `\n` (U+000a) character to normalize EOLs:
313+
//
314+
// |UTF-8 |String|
315+
// |--------|------|
316+
// |0d 0a |\r\n |
317+
// |0d c2 85|\r\x85|
318+
// |0d |\r |
319+
// |c2 85 |\x85 |
320+
// |e2 80 a8|\u2028|
321+
if let Some(i) = memchr3(b'\r', 0xC2, 0xE2, bytes) {
322+
// We found a character that requires normalization, so create new normalized
323+
// string, put the prefix as is and then put normalized character
324+
let mut normalized = String::with_capacity(text.len());
325+
// NOTE: unsafe { text.get_unchecked(0..i) } could be used because
326+
// we are sure that index within string
327+
normalized.push_str(&text[0..i]);
328+
329+
let mut pos = normalize_eol_step(&mut normalized, bytes, i, '\n');
330+
while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
331+
let index = pos + i;
332+
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333+
// we are sure that index within string
334+
normalized.push_str(&text[pos..index]);
335+
pos = normalize_eol_step(&mut normalized, bytes, index, '\n');
336+
}
337+
if let Some(rest) = text.get(pos..) {
338+
normalized.push_str(rest);
339+
}
340+
return normalized.into();
341+
}
342+
Cow::Borrowed(text)
343+
}
344+
345+
/// Performs one step of end-of-line normalization: inspects the character that
/// starts at `input[index]`, appends its normalized form to `normalized` and
/// returns the position from which processing should continue.
///
/// All line breaks MUST have been normalized on input to #xA as described
/// in [2.11 End-of-Line Handling][eof], so the rest of this algorithm operates
/// on text normalized in this way.
///
/// To simplify the tasks of applications, the XML processor MUST behave
/// as if it normalized all line breaks in external parsed entities
/// (including the document entity) on input, before parsing, by translating
/// all of the following to a single #xA character (_which attribute normalization
/// routine will replace by #x20 character_):
///
/// 1. the two-character sequence #xD #xA
/// 2. the two-character sequence #xD #x85
/// 3. the single character #x85
/// 4. the single character #x2028
/// 5. any #xD character that is not immediately followed by #xA or #x85.
///
/// The characters #x85 and #x2028 cannot be reliably recognized and translated
/// until an entity's encoding declaration (if present) has been read.
/// Therefore, it is a fatal error to use them within the XML declaration or text declaration.
///
/// The bytes `c2` and `e2` are only the *lead* bytes of #x85 and #x2028: `c2`
/// starts every character in the range U+0080..=U+00BF and `e2` starts every
/// character in the range U+2000..=U+2FFF. The continuation bytes are therefore
/// checked as well, and any other character that starts with one of these bytes
/// is copied to `normalized` unchanged.
///
/// Note, that this function cannot be used to normalize HTML values. The text in HTML
/// normally is not normalized in any way; normalization is performed only in limited
/// contexts and [only for] `\r\n` and `\r`.
///
/// # Parameters
///
/// - `normalized`: the string with the result of normalization
/// - `input`: UTF-8 bytes of the string to be normalized
/// - `index`: a byte index into `input` of character which is processed right now.
///   It always points to the first byte of character in UTF-8 encoding
/// - `ch`: a character that should be put to the string instead of newline sequence
///
/// Returns the index of next unprocessed byte in the `input`.
///
/// # Panics
///
/// Panics if `index` is out of bounds, if `input[index]` is not one of `\n`,
/// `\r`, `0xC2` or `0xE2`, or if `input` is not valid UTF-8 at `index`.
///
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
    /// Appends the `width`-byte character that starts at `index` unchanged
    /// and returns the index of the byte right after it.
    fn copy_char(normalized: &mut String, input: &[u8], index: usize, width: usize) -> usize {
        let end = index + width;
        let original = input
            .get(index..end)
            .and_then(|encoded| std::str::from_utf8(encoded).ok())
            .expect("`input` must be valid UTF-8");
        normalized.push_str(original);
        end
    }

    // `input[index]` exists (checked by the `match` below), so `index + 1`
    // is at most `input.len()` and slicing from it cannot panic
    match input[index] {
        b'\r' => {
            normalized.push(ch);
            let rest = &input[index + 1..];
            if rest.starts_with(b"\n") {
                index + 2 // skip \r\n
            } else if rest.starts_with(&[0xC2, 0x85]) {
                index + 3 // skip UTF-8 encoding of #xD #x85 characters (0d c2 85)
            } else {
                // A lone \r. If it is followed by some other character that
                // starts with c2, that character is not part of the line break
                // and will be processed by the next step
                index + 1 // skip \r
            }
        }
        b'\n' => {
            normalized.push(ch);
            index + 1 // skip \n
        }
        // Possible start of UTF-8 encoding of #x85 character (c2 85)
        0xC2 => {
            if input[index + 1..].starts_with(&[0x85]) {
                normalized.push(ch);
                index + 2 // skip UTF-8 encoding of #x85 character (c2 85)
            } else {
                // Some other character from U+0080..=U+00BF, always 2 bytes long
                copy_char(normalized, input, index, 2)
            }
        }
        // Possible start of UTF-8 encoding of #x2028 character (e2 80 a8)
        0xE2 => {
            if input[index + 1..].starts_with(&[0x80, 0xA8]) {
                normalized.push(ch);
                index + 3 // skip UTF-8 encoding of #x2028 character (e2 80 a8)
            } else {
                // Some other character from U+2000..=U+2FFF, always 3 bytes long
                copy_char(normalized, input, index, 3)
            }
        }

        x => unreachable!(
            "at {}: expected '\\n', '\\r', '\\xC2', or '\\xE2', found '{}' / {} / `0x{:X}`",
            index, x as char, x, x
        ),
    }
}
419+
420+
////////////////////////////////////////////////////////////////////////////////////////////////////
421+
305422
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
306423
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
307424
///
@@ -1844,3 +1961,65 @@ fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
18441961
_ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
18451962
}
18461963
}
1964+
1965+
////////////////////////////////////////////////////////////////////////////////////////////////////
1966+
1967+
/// Tests of the normalization routines of this module.
#[cfg(test)]
mod normalization {
    use super::*;

    /// Checks of `normalize_eols`: every recognized end-of-line sequence
    /// must come out as a single `\n`.
    mod eol {
        use super::*;
        use pretty_assertions::assert_eq;

        /// An empty input stays empty
        #[test]
        fn empty() {
            let normalized = normalize_eols("");
            assert_eq!(normalized, "");
        }

        /// Text that contains only `\n` line breaks is returned as is
        #[test]
        fn already_normalized() {
            let normalized = normalize_eols("\nalready \n\n normalized\n");
            assert_eq!(normalized, "\nalready \n\n normalized\n");
        }

        /// `\r\n` collapses to one `\n`
        #[test]
        fn cr_lf() {
            let normalized = normalize_eols("\r\nsome\r\n\r\ntext");
            assert_eq!(normalized, "\nsome\n\ntext");
        }

        /// `\r` followed by U+0085 collapses to one `\n`
        #[test]
        fn cr_u0085() {
            let normalized = normalize_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text");
            assert_eq!(normalized, "\nsome\n\ntext");
        }

        /// A standalone U+0085 becomes `\n`
        #[test]
        fn u0085() {
            let normalized = normalize_eols("\u{0085}some\u{0085}\u{0085}text");
            assert_eq!(normalized, "\nsome\n\ntext");
        }

        /// A standalone U+2028 becomes `\n`
        #[test]
        fn u2028() {
            let normalized = normalize_eols("\u{2028}some\u{2028}\u{2028}text");
            assert_eq!(normalized, "\nsome\n\ntext");
        }

        /// All kinds of line breaks next to each other, including lone `\r`
        /// and `\n` that is already in normalized form
        #[test]
        fn mixed() {
            let normalized =
                normalize_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text");
            assert_eq!(normalized, "\n\n\n\n\n\nsome\n\n\ntext");
        }
    }
}

0 commit comments

Comments
 (0)