|
1 | 1 | //! Manage xml character escapes |
2 | 2 |
|
3 | | -use memchr::memchr2_iter; |
| 3 | +use memchr::{memchr2_iter, memchr3}; |
4 | 4 | use std::borrow::Cow; |
5 | 5 | use std::num::ParseIntError; |
6 | 6 | use std::ops::Range; |
@@ -302,6 +302,123 @@ where |
302 | 302 | } |
303 | 303 | } |
304 | 304 |
|
| 305 | +//////////////////////////////////////////////////////////////////////////////////////////////////// |
| 306 | + |
| 307 | +// TODO: It would be better to reuse buffer after decoding if possible |
| 308 | +pub(crate) fn normalize_eols<'input>(text: &'input str) -> Cow<'input, str> { |
| 309 | + let bytes = text.as_bytes(); |
| 310 | + |
| 311 | + // The following sequences of UTF-8 encoded input should be translated into |
| 312 | + // a single `\n` (U+000a) character to normalize EOLs: |
| 313 | + // |
| 314 | + // |UTF-8 |String| |
| 315 | + // |--------|------| |
| 316 | + // |0d 0a |\r\n | |
| 317 | + // |0d c2 85|\r\x85| |
| 318 | + // |0d |\r | |
| 319 | + // |c2 85 |\x85 | |
| 320 | + // |e2 80 a8|\u2028| |
| 321 | + if let Some(i) = memchr3(b'\r', 0xC2, 0xE2, bytes) { |
| 322 | + // We found a character that requires normalization, so create new normalized |
| 323 | + // string, put the prefix as is and then put normalized character |
| 324 | + let mut normalized = String::with_capacity(text.len()); |
| 325 | + // NOTE: unsafe { text.get_unchecked(0..i) } could be used because |
| 326 | + // we are sure that index within string |
| 327 | + normalized.push_str(&text[0..i]); |
| 328 | + |
| 329 | + let mut pos = normalize_eol_step(&mut normalized, bytes, i, '\n'); |
| 330 | + while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) { |
| 331 | + let index = pos + i; |
| 332 | + // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because |
| 333 | + // we are sure that index within string |
| 334 | + normalized.push_str(&text[pos..index]); |
| 335 | + pos = normalize_eol_step(&mut normalized, bytes, index, '\n'); |
| 336 | + } |
| 337 | + if let Some(rest) = text.get(pos..) { |
| 338 | + normalized.push_str(rest); |
| 339 | + } |
| 340 | + return normalized.into(); |
| 341 | + } |
| 342 | + Cow::Borrowed(text) |
| 343 | +} |
| 344 | + |
/// Processes one potential end-of-line sequence that starts at `index`.
///
/// All line breaks MUST have been normalized on input to #xA as described
/// in [2.11 End-of-Line Handling][eof], so the rest of this algorithm operates
/// on text normalized in this way.
///
/// To simplify the tasks of applications, the XML processor MUST behave
/// as if it normalized all line breaks in external parsed entities
/// (including the document entity) on input, before parsing, by translating
/// all of the following to a single #xA character (_which attribute normalization
/// routine will replace by #x20 character_):
///
/// 1. the two-character sequence #xD #xA
/// 2. the two-character sequence #xD #x85
/// 3. the single character #x85
/// 4. the single character #x2028
/// 5. any #xD character that is not immediately followed by #xA or #x85.
///
/// The characters #x85 and #x2028 cannot be reliably recognized and translated
/// until an entity's encoding declaration (if present) has been read.
/// Therefore, it is a fatal error to use them within the XML declaration or text declaration.
///
/// Note, that this function cannot be used to normalize HTML values. The text in HTML
/// normally is not normalized in any way; normalization is performed only in limited
/// contexts and [only for] `\r\n` and `\r`.
///
/// The bytes `c2` and `e2` are only the *lead* bytes of #x85 and #x2028: the same
/// lead bytes start every character in the ranges #x80-#xBF and #x2000-#x2FFF
/// (for example, `©`, `—` or `€`). Therefore the continuation bytes are checked
/// as well, and a character that is not a line break is copied to `normalized`
/// unchanged instead of being replaced by `ch`.
///
/// # Parameters
///
/// - `normalized`: the string with the result of normalization
/// - `input`: UTF-8 bytes of the string to be normalized
/// - `index`: a byte index into `input` of character which is processed right now.
///   It always points to the first byte of character in UTF-8 encoding
/// - `ch`: a character that should be put to the string instead of newline sequence
///
/// Returns the index of next unprocessed byte in the `input`. The returned index
/// is always greater than `index`, so the caller always makes progress.
///
/// # Panics
///
/// Panics if `index` is out of bounds of `input` or if the byte at `index`
/// is not one of `\n`, `\r`, `c2` or `e2`.
///
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
    match input[index] {
        b'\r' => {
            normalized.push(ch);
            // `index + 1 <= input.len()` holds because `input[index]` exists
            let rest = &input[index + 1..];
            if rest.starts_with(b"\n") {
                index + 2 // skip \r\n
            } else if rest.starts_with(&[0xC2, 0x85]) {
                index + 3 // skip UTF-8 encoding of #xD #x85 characters (0d c2 85)
            } else {
                // A lone \r. Whatever follows (including other characters that
                // start with c2) will be processed on its own
                index + 1 // skip \r
            }
        }
        b'\n' => {
            normalized.push(ch);
            index + 1 // skip \n
        }
        // Possible start of UTF-8 encoding of #x85 character (c2 85)
        0xC2 => {
            if input[index..].starts_with(&[0xC2, 0x85]) {
                normalized.push(ch);
                index + 2 // skip UTF-8 encoding of #x85 character (c2 85)
            } else {
                // Some other character from #x80-#xBF range, keep it
                push_verbatim(normalized, input, index, 2)
            }
        }
        // Possible start of UTF-8 encoding of #x2028 character (e2 80 a8)
        0xE2 => {
            if input[index..].starts_with(&[0xE2, 0x80, 0xA8]) {
                normalized.push(ch);
                index + 3 // skip UTF-8 encoding of #x2028 character (e2 80 a8)
            } else {
                // Some other character from #x2000-#x2FFF range, keep it
                push_verbatim(normalized, input, index, 3)
            }
        }

        x => unreachable!(
            "at {}: expected ''\\n', '\\r', '\\xC2', or '\\xE2', found '{}' / {} / `0x{:X}`",
            index, x as char, x, x
        ),
    }
}

/// Appends to `normalized` the character that occupies `len` bytes of `input`
/// starting at `index` and returns the index of the byte right after it.
///
/// If `input` ends earlier than `index + len`, or the bytes are not valid UTF-8
/// (which should not happen when `input` was obtained from a `str`), the available
/// bytes are converted lossy, so this function never panics on malformed data.
fn push_verbatim(normalized: &mut String, input: &[u8], index: usize, len: usize) -> usize {
    let end = input.len().min(index + len);
    normalized.push_str(&String::from_utf8_lossy(&input[index..end]));
    end
}
| 419 | + |
| 420 | +//////////////////////////////////////////////////////////////////////////////////////////////////// |
| 421 | + |
305 | 422 | /// Resolves predefined XML entities or all HTML5 entities depending on the feature |
306 | 423 | /// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html). |
307 | 424 | /// |
@@ -1844,3 +1961,65 @@ fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> { |
1844 | 1961 | _ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber), |
1845 | 1962 | } |
1846 | 1963 | } |
| 1964 | + |
| 1965 | +//////////////////////////////////////////////////////////////////////////////////////////////////// |
| 1966 | + |
#[cfg(test)]
mod normalization {
    use super::*;

    /// Checks that `normalize_eols` turns every supported end-of-line
    /// sequence into a single `\n`.
    mod eol {
        use super::*;
        use pretty_assertions::assert_eq;

        /// An empty input stays empty.
        #[test]
        fn empty() {
            let input = "";
            let expected = "";
            assert_eq!(normalize_eols(input), expected);
        }

        /// Text that contains only `\n` line breaks is returned unchanged.
        #[test]
        fn already_normalized() {
            let input = "\nalready \n\n normalized\n";
            let expected = "\nalready \n\n normalized\n";
            assert_eq!(normalize_eols(input), expected);
        }

        /// The sequence #xD #xA becomes one `\n`.
        #[test]
        fn cr_lf() {
            let input = "\r\nsome\r\n\r\ntext";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// The sequence #xD #x85 becomes one `\n`.
        #[test]
        fn cr_u0085() {
            let input = "\r\u{0085}some\r\u{0085}\r\u{0085}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// The single character #x85 becomes `\n`.
        #[test]
        fn u0085() {
            let input = "\u{0085}some\u{0085}\u{0085}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// The single character #x2028 becomes `\n`.
        #[test]
        fn u2028() {
            let input = "\u{2028}some\u{2028}\u{2028}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// Different kinds of line breaks that follow each other are
        /// normalized independently.
        #[test]
        fn mixed() {
            let input = "\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text";
            let expected = "\n\n\n\n\n\nsome\n\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }
    }
}
0 commit comments