Skip to content

Commit 53e4ccc

Browse files
committed
Refactor xml5ever to not own the input.
1 parent ba98072 commit 53e4ccc

File tree

6 files changed

+184
-143
lines changed

xml5ever/examples/simple_xml_tokenizer.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@
77
//! [dependencies]
88
//! xml5ever = "0.1.1"
99
//! tendril = "0.1.3"
10+
//! markup5ever = "0.7.4"
1011
//! ```
1112
extern crate xml5ever;
13+
extern crate markup5ever;
1214

1315
use std::io;
1416
use std::default::Default;
1517

18+
use markup5ever::buffer_queue::BufferQueue;
1619
use xml5ever::tendril::{ByteTendril, ReadExt};
1720
use xml5ever::tokenizer::{TokenSink, Token, XmlTokenizer, ParseError};
1821
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
@@ -57,13 +60,17 @@ fn main() {
5760

5861
// We need a ByteTendril to read a file
5962
let mut input = ByteTendril::new();
63+
6064
// Using SliceExt.read_to_tendril we can read stdin
6165
io::stdin().read_to_tendril(&mut input).unwrap();
6266
// For xml5ever we need StrTendril, so we reinterpret it
6367
// into StrTendril.
64-
let input = input.try_reinterpret().unwrap();
68+
69+
// Load input into BufferQueue
70+
let mut input_buffer = BufferQueue::new();
71+
input_buffer.push_back(input.try_reinterpret().unwrap());
6572
// Here we create and run tokenizer
6673
let mut tok = XmlTokenizer::new(sink, Default::default());
67-
tok.feed(input);
74+
tok.feed(&mut input_buffer);
6875
tok.end();
6976
}

xml5ever/examples/xml_tokenizer.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@
77
//! [dependencies]
88
//! xml5ever = "0.2.0"
99
//! tendril = "0.1.3"
10+
//! markup5ever = "0.7.4"
1011
//! ```
1112
extern crate xml5ever;
13+
extern crate markup5ever;
1214

1315
use std::io::{self};
1416
use std::default::Default;
1517

18+
use markup5ever::buffer_queue::BufferQueue;
1619
use xml5ever::tendril::{ByteTendril, ReadExt};
1720
use xml5ever::tokenizer::{TokenSink, Token, XmlTokenizer, XmlTokenizerOpts, ParseError};
1821
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
@@ -89,13 +92,15 @@ fn main() {
8992
};
9093
let mut input = ByteTendril::new();
9194
io::stdin().read_to_tendril(&mut input).unwrap();
92-
let input = input.try_reinterpret().unwrap();
95+
let mut input_buffer = BufferQueue::new();
96+
input_buffer.push_back(input.try_reinterpret().unwrap());
97+
9398
let mut tok = XmlTokenizer::new(sink, XmlTokenizerOpts {
9499
profile: true,
95100
exact_errors: true,
96101
.. Default::default()
97102
});
98-
tok.feed(input);
103+
tok.feed(&mut input_buffer);
99104
tok.end();
100105
sink.is_char(false);
101106
}

xml5ever/src/driver.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use std::borrow::Cow;
1515
use tendril;
1616
use tendril::StrTendril;
1717
use tendril::stream::{TendrilSink, Utf8LossyDecoder};
18+
use markup5ever::buffer_queue::BufferQueue;
1819

1920
/// All-encompassing parser setting structure.
2021
#[derive(Clone, Default)]
@@ -38,22 +39,25 @@ pub fn parse_document<Sink>(sink: Sink, opts: XmlParseOpts) -> XmlParser<Sink>
3839

3940
let tb = XmlTreeBuilder::new(sink, opts.tree_builder);
4041
let tok = XmlTokenizer::new(tb, opts.tokenizer);
41-
XmlParser { tokenizer: tok}
42+
XmlParser { tokenizer: tok, input_buffer: BufferQueue::new() }
4243
}
4344

4445
/// An XML parser,
4546
/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
4647
pub struct XmlParser<Sink> where Sink: TreeSink {
4748
/// Tokenizer used by XmlParser.
4849
pub tokenizer: XmlTokenizer<XmlTreeBuilder<Sink::Handle, Sink>>,
50+
/// Input used by XmlParser.
51+
pub input_buffer: BufferQueue,
4952
}
5053

5154
impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink> {
5255

5356
type Output = Sink::Output;
5457

5558
fn process(&mut self, t: StrTendril) {
56-
self.tokenizer.feed(t)
59+
self.input_buffer.push_back(t);
60+
self.tokenizer.feed(&mut self.input_buffer);
5761
}
5862

5963
// FIXME: Is it too noisy to report every character decoding error?

xml5ever/src/tokenizer/char_ref/mod.rs

Lines changed: 45 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use std::borrow::Cow::Borrowed;
1414
use super::{XmlTokenizer, TokenSink};
1515
use tendril::StrTendril;
1616
use util::{is_ascii_alnum};
17+
use markup5ever::buffer_queue::BufferQueue;
1718

1819
pub use self::Status::*;
1920
use self::State::*;
@@ -112,31 +113,35 @@ impl CharRefTokenizer {
112113
}
113114

114115
impl CharRefTokenizer {
115-
pub fn step<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
116+
pub fn step<Sink: TokenSink>(
117+
&mut self,
118+
tokenizer: &mut XmlTokenizer<Sink>,
119+
input: &mut BufferQueue)
120+
-> Status {
116121
if self.result.is_some() {
117122
return Done;
118123
}
119124

120125
debug!("char ref tokenizer stepping in state {:?}", self.state);
121126
match self.state {
122-
Begin => self.do_begin(tokenizer),
123-
Octothorpe => self.do_octothorpe(tokenizer),
124-
Numeric(base) => self.do_numeric(tokenizer, base),
125-
NumericSemicolon => self.do_numeric_semicolon(tokenizer),
126-
Named => self.do_named(tokenizer),
127-
BogusName => self.do_bogus_name(tokenizer),
127+
Begin => self.do_begin(tokenizer, input),
128+
Octothorpe => self.do_octothorpe(tokenizer, input),
129+
Numeric(base) => self.do_numeric(tokenizer, base, input),
130+
NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
131+
Named => self.do_named(tokenizer, input),
132+
BogusName => self.do_bogus_name(tokenizer, input),
128133
}
129134
}
130135

131-
fn do_begin<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
132-
match unwrap_or_return!(tokenizer.peek(), Stuck) {
136+
fn do_begin<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) -> Status {
137+
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
133138
'\t' | '\n' | '\x0C' | ' ' | '<' | '&'
134139
=> self.finish_none(),
135140
c if Some(c) == self.addnl_allowed
136141
=> self.finish_none(),
137142

138143
'#' => {
139-
tokenizer.discard_char();
144+
tokenizer.discard_char(input);
140145
self.state = Octothorpe;
141146
Progress
142147
}
@@ -149,11 +154,11 @@ impl CharRefTokenizer {
149154
}
150155
}
151156

152-
fn do_octothorpe<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
153-
let c = unwrap_or_return!(tokenizer.peek(), Stuck);
157+
fn do_octothorpe<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) -> Status {
158+
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
154159
match c {
155160
'x' | 'X' => {
156-
tokenizer.discard_char();
161+
tokenizer.discard_char(input);
157162
self.hex_marker = Some(c);
158163
self.state = Numeric(16);
159164
}
@@ -166,11 +171,11 @@ impl CharRefTokenizer {
166171
Progress
167172
}
168173

169-
fn do_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, base: u32) -> Status {
170-
let c = unwrap_or_return!(tokenizer.peek(), Stuck);
174+
fn do_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, base: u32, input: &mut BufferQueue) -> Status {
175+
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
171176
match c.to_digit(base) {
172177
Some(n) => {
173-
tokenizer.discard_char();
178+
tokenizer.discard_char(input);
174179
self.num = self.num.wrapping_mul(base);
175180
if self.num > 0x10FFFF {
176181
// We might overflow, and the character is definitely invalid.
@@ -182,7 +187,7 @@ impl CharRefTokenizer {
182187
Progress
183188
}
184189

185-
None if !self.seen_digit => self.unconsume_numeric(tokenizer),
190+
None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
186191

187192
None => {
188193
self.state = NumericSemicolon;
@@ -191,22 +196,22 @@ impl CharRefTokenizer {
191196
}
192197
}
193198

194-
fn do_numeric_semicolon<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
195-
match unwrap_or_return!(tokenizer.peek(), Stuck) {
196-
';' => tokenizer.discard_char(),
199+
fn do_numeric_semicolon<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) -> Status {
200+
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
201+
';' => tokenizer.discard_char(input),
197202
_ => tokenizer.emit_error(Borrowed("Semicolon missing after numeric character reference")),
198203
};
199204
self.finish_numeric(tokenizer)
200205
}
201206

202-
fn unconsume_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
207+
fn unconsume_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) -> Status {
203208
let mut unconsume = StrTendril::from_char('#');
204209
match self.hex_marker {
205210
Some(c) => unconsume.push_char(c),
206211
None => (),
207212
}
208213

209-
tokenizer.unconsume(unconsume);
214+
tokenizer.unconsume(input, unconsume);
210215
tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
211216
self.finish_none()
212217
}
@@ -244,8 +249,8 @@ impl CharRefTokenizer {
244249
self.finish_one(c)
245250
}
246251

247-
fn do_named<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
248-
let c = unwrap_or_return!(tokenizer.get_char(), Stuck);
252+
fn do_named<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) -> Status {
253+
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
249254
self.name_buf_mut().push_char(c);
250255
match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
251256
// We have either a full match or a prefix of one.
@@ -260,7 +265,7 @@ impl CharRefTokenizer {
260265
}
261266

262267
// Can't continue the match.
263-
None => self.finish_named(tokenizer, Some(c)),
268+
None => self.finish_named(tokenizer, Some(c), input),
264269
}
265270
}
266271

@@ -271,13 +276,14 @@ impl CharRefTokenizer {
271276
tokenizer.emit_error(msg);
272277
}
273278

274-
fn unconsume_name<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
275-
tokenizer.unconsume(self.name_buf_opt.take().unwrap());
279+
fn unconsume_name<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) {
280+
tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
276281
}
277282

278283
fn finish_named<Sink: TokenSink>(&mut self,
279284
tokenizer: &mut XmlTokenizer<Sink>,
280-
end_char: Option<char>) -> Status {
285+
end_char: Option<char>,
286+
input: &mut BufferQueue) -> Status {
281287
match self.name_match {
282288
None => {
283289
match end_char {
@@ -294,7 +300,7 @@ impl CharRefTokenizer {
294300

295301
_ => (),
296302
}
297-
self.unconsume_name(tokenizer);
303+
self.unconsume_name(tokenizer, input);
298304
self.finish_none()
299305
}
300306

@@ -343,10 +349,10 @@ impl CharRefTokenizer {
343349
};
344350

345351
if unconsume_all {
346-
self.unconsume_name(tokenizer);
352+
self.unconsume_name(tokenizer, input);
347353
self.finish_none()
348354
} else {
349-
tokenizer.unconsume(StrTendril::from_slice(&self.name_buf()[name_len..]));
355+
tokenizer.unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
350356
self.result = Some(CharRef {
351357
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
352358
num_chars: if c2 == 0 { 1 } else { 2 },
@@ -357,40 +363,40 @@ impl CharRefTokenizer {
357363
}
358364
}
359365

360-
fn do_bogus_name<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
361-
let c = unwrap_or_return!(tokenizer.get_char(), Stuck);
366+
fn do_bogus_name<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) -> Status {
367+
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
362368
self.name_buf_mut().push_char(c);
363369
match c {
364370
_ if is_ascii_alnum(c) => return Progress,
365371
';' => self.emit_name_error(tokenizer),
366372
_ => ()
367373
}
368-
self.unconsume_name(tokenizer);
374+
self.unconsume_name(tokenizer, input);
369375
self.finish_none()
370376
}
371377

372-
pub fn end_of_file<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
378+
pub fn end_of_file<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue) {
373379
while self.result.is_none() {
374380
match self.state {
375381
Begin => drop(self.finish_none()),
376382

377383
Numeric(_) if !self.seen_digit
378-
=> drop(self.unconsume_numeric(tokenizer)),
384+
=> drop(self.unconsume_numeric(tokenizer, input)),
379385

380386
Numeric(_) | NumericSemicolon => {
381387
tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
382388
self.finish_numeric(tokenizer);
383389
}
384390

385-
Named => drop(self.finish_named(tokenizer, None)),
391+
Named => drop(self.finish_named(tokenizer, None, input)),
386392

387393
BogusName => {
388-
self.unconsume_name(tokenizer);
394+
self.unconsume_name(tokenizer, input);
389395
self.finish_none();
390396
}
391397

392398
Octothorpe => {
393-
tokenizer.unconsume(StrTendril::from_slice("#"));
399+
tokenizer.unconsume(input, StrTendril::from_slice("#"));
394400
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
395401
self.finish_none();
396402
}

0 commit comments

Comments (0)