Skip to content

Commit 6231ac0

Browse files
committed
End processing instruction only after ?> instead of just >
Fixes (3): reader::async_tokio::test::read_event_into_async::processing_instruction reader::buffered_reader::test::read_event_impl::processing_instruction reader::slice_reader::test::read_event_impl::processing_instruction
1 parent 9634cd2 commit 6231ac0

File tree

5 files changed

+194
-7
lines changed

5 files changed

+194
-7
lines changed

Changelog.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,12 @@ resolve predefined entities.
3737
it can handle every attribute that does not match existing cases within an enum variant.
3838
- [#722]: Allow to pass owned strings to `Writer::create_element`. This is breaking change!
3939
- [#275]: Added `ElementWriter::new_line()` which enables pretty printing elements with multiple attributes.
40-
- [#743]: Add `Deserializer::get_ref()` to get XML Reader from serde Deserializer
41-
- [#734]: Add helper functions to resolve predefined XML and HTML5 entities:
40+
- [#743]: Added `Deserializer::get_ref()` to get XML Reader from serde Deserializer
41+
- [#734]: Added helper functions to resolve predefined XML and HTML5 entities:
4242
- `quick_xml::escape::resolve_predefined_entity`
4343
- `quick_xml::escape::resolve_xml_entity`
4444
- `quick_xml::escape::resolve_html5_entity`
45+
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.
4546

4647
### Bug Fixes
4748

@@ -50,6 +51,7 @@ resolve predefined entities.
5051
- [#684]: Fix incorrect position reported for `Error::IllFormed(MissingDoctypeName)`.
5152
- [#704]: Fix empty tags with attributes not being expanded when `expand_empty_elements` is set to true.
5253
- [#683]: Use local tag name when check tag name against possible names for field.
54+
- [#753]: Correctly determine end of processing instructions and XML declaration.
5355

5456
### Misc Changes
5557

@@ -98,6 +100,7 @@ resolve predefined entities.
98100
[#738]: https://github.com/tafia/quick-xml/pull/738
99101
[#743]: https://github.com/tafia/quick-xml/pull/743
100102
[#748]: https://github.com/tafia/quick-xml/pull/748
103+
[#753]: https://github.com/tafia/quick-xml/pull/753
101104
[`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html
102105
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
103106
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html

src/reader/buffered_reader.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,49 @@ macro_rules! impl_buffered_source {
9191
Ok((&buf[start..], done))
9292
}
9393

94+
$($async)? fn read_pi $(<$lf>)? (
95+
&mut self,
96+
buf: &'b mut Vec<u8>,
97+
position: &mut usize,
98+
) -> Result<(&'b [u8], bool)> {
99+
let mut parser = super::PiParser::default();
100+
101+
let mut read = 0;
102+
let mut done = false;
103+
let start = buf.len();
104+
while !done {
105+
let used = {
106+
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
107+
Ok(n) if n.is_empty() => break,
108+
Ok(n) => n,
109+
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
110+
Err(e) => {
111+
*position += read;
112+
return Err(Error::Io(e.into()));
113+
}
114+
};
115+
116+
match parser.feed(available) {
117+
Some(i) => {
118+
// We do not include `>` in data
119+
buf.extend_from_slice(&available[..i - 1]);
120+
done = true;
121+
i
122+
}
123+
None => {
124+
buf.extend_from_slice(available);
125+
available.len()
126+
}
127+
}
128+
};
129+
self $(.$reader)? .consume(used);
130+
read += used;
131+
}
132+
*position += read;
133+
134+
Ok((&buf[start..], done))
135+
}
136+
94137
$($async)? fn read_bang_element $(<$lf>)? (
95138
&mut self,
96139
buf: &'b mut Vec<u8>,

src/reader/mod.rs

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ macro_rules! read_until_close {
361361
},
362362
// `<?` - processing instruction
363363
Ok(Some(b'?')) => match $reader
364-
.read_bytes_until(b'>', $buf, &mut $self.state.offset)
364+
.read_pi($buf, &mut $self.state.offset)
365365
$(.$await)?
366366
{
367367
Ok((bytes, true)) => $self.state.emit_question_mark(bytes),
@@ -428,10 +428,12 @@ macro_rules! read_to_end {
428428
mod async_tokio;
429429
mod buffered_reader;
430430
mod ns_reader;
431+
mod pi;
431432
mod slice_reader;
432433
mod state;
433434

434435
pub use ns_reader::NsReader;
436+
pub use pi::PiParser;
435437

436438
/// Range of input in bytes, that corresponds to some piece of XML
437439
pub type Span = Range<usize>;
@@ -816,12 +818,29 @@ trait XmlSource<'r, B> {
816818
position: &mut usize,
817819
) -> Result<(&'r [u8], bool)>;
818820

819-
/// Read input until comment, CDATA or processing instruction is finished.
821+
/// Read input until processing instruction is finished.
822+
///
823+
/// This method expects that `<?` has already been read.
824+
///
825+
/// Returns a slice of data read up to the end of the processing instruction (`>`),
826+
/// which is not included in the result (the `?` at the end is included).
827+
///
828+
/// If input (`Self`) is exhausted before the end of the processing instruction
/// is found, returns the data read so far together with `false`.
829+
///
830+
/// # Parameters
831+
/// - `buf`: Buffer that could be filled from an input (`Self`) and
832+
/// from which [events] could borrow their data
833+
/// - `position`: Will be increased by amount of bytes consumed
834+
///
835+
/// [events]: crate::events::Event
836+
fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<(&'r [u8], bool)>;
837+
838+
/// Read input until comment or CDATA is finished.
820839
///
821840
/// This method expects that `<` has already been read.
822841
///
823-
/// Returns a slice of data read up to end of comment, CDATA or processing
824-
/// instruction (`>`), which does not include into result.
842+
/// Returns a slice of data read up to the end of the comment or CDATA (`>`),
843+
/// which is not included in the result.
825844
///
826845
/// If input (`Self`) is exhausted and nothing was read, returns `None`.
827846
///

src/reader/pi.rs

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
//! Contains a parser for an XML processing instruction.
2+
3+
/// A parser that searches for a `?>` sequence in the slice.
4+
///
5+
/// To use the parser, create an instance of it and [`feed`] data into it.
6+
/// After successful search the parser will return [`Some`] with position where
7+
/// processing instruction is ended (the position after `?>`). If search was
8+
/// unsuccessful, a [`None`] will be returned. You typically would expect positive
9+
/// result of search, so you should feed new data until you get it.
10+
///
11+
/// NOTE: after a successful match the parser does not return to the initial
12+
/// state and should not be used anymore. Create a new parser if you want to perform
13+
/// new search.
14+
///
15+
/// # Example
16+
///
17+
/// ```
18+
/// # use quick_xml::reader::PiParser;
19+
/// # use pretty_assertions::assert_eq;
20+
/// let mut parser = PiParser::default();
21+
///
22+
/// // Parse `<?instruction with = 'some > and ?' inside?>and the text follow...`
23+
/// // split into three chunks
24+
/// assert_eq!(parser.feed(b"<?instruction"), None);
25+
/// // ...get new chunk of data
26+
/// assert_eq!(parser.feed(b" with = 'some > and ?"), None);
27+
/// // ...get another chunk of data
28+
/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(10));
29+
/// // ^ ^
30+
/// // 0 10
31+
/// ```
32+
///
33+
/// [`feed`]: Self::feed()
34+
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
35+
pub struct PiParser(
36+
/// A flag that indicates whether the `bytes` in the previous attempt to find the
37+
/// end ended with `?`.
38+
pub bool,
39+
);
40+
41+
impl PiParser {
42+
/// Determines the end position of a processing instruction in the provided slice.
43+
/// Processing instruction ends on the first occurrence of `?>` which cannot be
44+
/// escaped.
45+
///
46+
/// Returns position after the `?>` or `None` if such sequence was not found.
47+
///
48+
/// [Section 2.6]: Parameter entity references MUST NOT be recognized within
49+
/// processing instructions, so the parser does not search for them.
50+
///
51+
/// # Parameters
52+
/// - `bytes`: a slice to find the end of a processing instruction.
53+
/// Should contain text in ASCII-compatible encoding
54+
///
55+
/// [Section 2.6]: https://www.w3.org/TR/xml11/#sec-pi
56+
pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
57+
for i in memchr::memchr_iter(b'>', bytes) {
58+
match i {
59+
// +1 for `>` which should be included in event
60+
0 if self.0 => return Some(1),
61+
// If the previous byte is `?`, then we found `?>`
62+
// +1 for `>` which should be included in event
63+
i if i > 0 && bytes[i - 1] == b'?' => return Some(i + 1),
64+
_ => {}
65+
}
66+
}
67+
self.0 = bytes.last().copied() == Some(b'?');
68+
None
69+
}
70+
}
71+
72+
#[test]
73+
fn pi() {
74+
use pretty_assertions::assert_eq;
75+
76+
/// Returns `Ok(pos)` with the position in the buffer where processing
77+
/// instruction is ended.
78+
///
79+
/// Returns `Err(internal_state)` if parsing is not done yet.
80+
fn parse_pi(bytes: &[u8], had_question_mark: bool) -> Result<usize, bool> {
81+
let mut parser = PiParser(had_question_mark);
82+
match parser.feed(bytes) {
83+
Some(i) => Ok(i),
84+
None => Err(parser.0),
85+
}
86+
}
87+
88+
// Comments show which character was seen last before calling `feed`.
89+
// `x` means any character, pipe denotes start of the buffer that is passed to `feed`
90+
91+
assert_eq!(parse_pi(b"", false), Err(false)); // x|
92+
assert_eq!(parse_pi(b"", true), Err(false)); // ?|
93+
94+
assert_eq!(parse_pi(b"?", false), Err(true)); // x|?
95+
assert_eq!(parse_pi(b"?", true), Err(true)); // ?|?
96+
97+
assert_eq!(parse_pi(b">", false), Err(false)); // x|>
98+
assert_eq!(parse_pi(b">", true), Ok(1)); // ?|>
99+
100+
assert_eq!(parse_pi(b"?>", false), Ok(2)); // x|?>
101+
assert_eq!(parse_pi(b"?>", true), Ok(2)); // ?|?>
102+
103+
assert_eq!(parse_pi(b">?>", false), Ok(3)); // x|>?>
104+
assert_eq!(parse_pi(b">?>", true), Ok(1)); // ?|>?>
105+
}

src/reader/slice_reader.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8};
1212
use crate::errors::{Error, Result, SyntaxError};
1313
use crate::events::Event;
1414
use crate::name::QName;
15-
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
15+
use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource};
1616

1717
/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
1818
/// This implementation supports not using an intermediate buffer as the byte slice
@@ -275,6 +275,23 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
275275
}
276276
}
277277

278+
fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<(&'a [u8], bool)> {
279+
let mut parser = PiParser::default();
280+
281+
if let Some(i) = parser.feed(self) {
282+
*position += i;
283+
// We do not include `>` in data
284+
let bytes = &self[..i - 1];
285+
*self = &self[i..];
286+
Ok((bytes, true))
287+
} else {
288+
*position += self.len();
289+
let bytes = &self[..];
290+
*self = &[];
291+
Ok((bytes, false))
292+
}
293+
}
294+
278295
fn read_bang_element(
279296
&mut self,
280297
_buf: (),

0 commit comments

Comments
 (0)