|
| 1 | +//! Contains a parser for an XML element. |
| 2 | +
|
/// A parser that searches for a `>` symbol in a slice, skipping quoted regions.
///
/// Two kinds of quoted regions are recognized: double-quoted (`"..."`) and
/// single-quoted (`'...'`). A `>` found inside such a region is not reported
/// as a match. Each region starts and ends with its own quote symbol, which
/// cannot be escaped (it can be encoded as an XML character reference or a
/// named entity, but that encoding never contains a literal quote).
///
/// To use the parser, create an instance and [`feed`] data into it. On
/// a successful search the parser returns [`Some`] with the position of the
/// found symbol; otherwise it returns [`None`]. Since a match is normally
/// expected sooner or later, keep feeding new data until you get one.
///
/// After a successful match the parser state is not guaranteed to be ready
/// for the next parse. If you need to parse another element, create a new
/// parser; it is cheap.
///
/// # Example
///
/// ```
/// # use quick_xml::reader::ElementParser;
/// # use pretty_assertions::assert_eq;
/// let mut parser = ElementParser::default();
///
/// // Parse `<my-element  with = 'some > inside'>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<my-element"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some >"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8));
/// //                       ^       ^
/// //                       0       8
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementParser {
    /// The initial state: inside the element, but not inside an attribute value.
    Outside,
    /// Inside a region delimited by single quotes (`'...'`).
    SingleQ,
    /// Inside a region delimited by double quotes (`"..."`).
    DoubleQ,
}
| 49 | + |
| 50 | +impl ElementParser { |
| 51 | + /// Returns number of consumed bytes or `None` if `>` was not found in `bytes`. |
| 52 | + #[inline] |
| 53 | + pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> { |
| 54 | + for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) { |
| 55 | + // multi-match (*self, bytes[i]) are little less efficient |
| 56 | + match bytes[i] { |
| 57 | + b'"' => match self { |
| 58 | + Self::Outside => *self = Self::DoubleQ, |
| 59 | + Self::DoubleQ => *self = Self::Outside, |
| 60 | + Self::SingleQ => {} |
| 61 | + }, |
| 62 | + b'\'' => match self { |
| 63 | + Self::Outside => *self = Self::SingleQ, |
| 64 | + Self::SingleQ => *self = Self::Outside, |
| 65 | + Self::DoubleQ => {} |
| 66 | + }, |
| 67 | + b'>' => match self { |
| 68 | + // Only allowed to match `>` while we are in state `Outside` |
| 69 | + Self::Outside => return Some(i), |
| 70 | + Self::SingleQ | Self::DoubleQ => {} |
| 71 | + }, |
| 72 | + _ => {} |
| 73 | + } |
| 74 | + } |
| 75 | + None |
| 76 | + } |
| 77 | +} |
| 78 | + |
| 79 | +impl Default for ElementParser { |
| 80 | + #[inline] |
| 81 | + fn default() -> Self { |
| 82 | + Self::Outside |
| 83 | + } |
| 84 | +} |
| 85 | + |
#[test]
fn test() {
    use pretty_assertions::assert_eq;
    use ElementParser::*;

    /// Outcome of feeding one chunk: `Ok(pos)` with the position of the `>`
    /// that ends the element, or `Err(state)` with the parser state reached
    /// when the chunk does not finish the element.
    type Outcome = Result<usize, ElementParser>;

    /// Feeds `bytes` into a parser that starts in the state `start`.
    fn run(bytes: &[u8], start: ElementParser) -> Outcome {
        let mut parser = start;
        let found = parser.feed(bytes);
        // `parser` is read after `feed`, so it carries the final state
        found.ok_or(parser)
    }

    /// Outcomes for `bytes` when starting from `Outside`, `SingleQ` and
    /// `DoubleQ`, in that order.
    fn from_each_state(bytes: &[u8]) -> [Outcome; 3] {
        [
            run(bytes, ElementParser::Outside),
            run(bytes, ElementParser::SingleQ),
            run(bytes, ElementParser::DoubleQ),
        ]
    }

    // Empty input never changes the state
    assert_eq!(
        from_each_state(b""),
        [Err(Outside), Err(SingleQ), Err(DoubleQ)]
    );
    // A single quote toggles only the single-quoted region
    assert_eq!(
        from_each_state(b"'"),
        [Err(SingleQ), Err(Outside), Err(DoubleQ)]
    );
    // A double quote toggles only the double-quoted region
    assert_eq!(
        from_each_state(b"\""),
        [Err(DoubleQ), Err(SingleQ), Err(Outside)]
    );
    // `>` is matched only outside of quoted regions
    assert_eq!(
        from_each_state(b">"),
        [Ok(0), Err(SingleQ), Err(DoubleQ)]
    );
    // An empty quoted value followed by `>`
    assert_eq!(
        from_each_state(b"''>"),
        [Ok(2), Err(SingleQ), Err(DoubleQ)]
    );
}
0 commit comments