
Commit 7861cad

Authored by Simon Wülker
Simplify the way character references are resolved (#643)
* Simplify the way character references are resolved

Signed-off-by: Simon Wülker <[email protected]>

* Remove CharRefTokenizer::finish_none

Signed-off-by: Simon Wülker <[email protected]>

---------

Signed-off-by: Simon Wülker <[email protected]>
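In practical terms, the simplification is that Status::Done now carries the resolved CharRef with it: the result: Option<CharRef> field, the get_result() accessor, and the finish_none() helper all disappear, and a CharRef::EMPTY constant stands in for "no character reference". A minimal standalone sketch of that shape, using simplified stand-in types rather than the actual definitions in char_ref/mod.rs:

// Stand-in types; the real CharRef/Status live in html5ever/src/tokenizer/char_ref/mod.rs.
#[derive(Clone, Copy, Debug)]
struct CharRef {
    chars: [char; 2],
    num_chars: u8,
}

impl CharRef {
    // Plays the role of the removed finish_none(): "no character reference resolved".
    const EMPTY: CharRef = CharRef {
        chars: ['\0', '\0'],
        num_chars: 0,
    };
}

#[allow(dead_code)] // Stuck/Progress are not constructed in this tiny example
enum Status {
    Stuck,         // need more input
    Progress,      // consumed something, keep stepping
    Done(CharRef), // finished; the result travels with the status itself
}

// After the change, a finish helper simply builds the value and returns it,
// instead of storing it in a result field for a later get_result() call.
fn finish_one(c: char) -> Status {
    Status::Done(CharRef {
        chars: [c, '\0'],
        num_chars: 1,
    })
}

fn main() {
    if let Status::Done(char_ref) = finish_one('<') {
        println!("resolved {} char(s): {:?}", char_ref.num_chars, char_ref.chars);
    }
    assert_eq!(CharRef::EMPTY.num_chars, 0);
}

Because the payload is attached to Done, a "finished but result not yet fetched" state can no longer exist, which is what lets step() drop its old result.is_some() early return (see the first hunk of the step() diff below).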
1 parent 4641184 commit 7861cad

File tree

2 files changed: +53 -62 lines


html5ever/src/tokenizer/char_ref/mod.rs

Lines changed: 36 additions & 43 deletions
@@ -28,7 +28,7 @@ pub(super) struct CharRef {
 pub(super) enum Status {
     Stuck,
     Progress,
-    Done,
+    Done(CharRef),
 }
 
 #[derive(Debug)]
@@ -43,7 +43,6 @@ enum State {
 
 pub(super) struct CharRefTokenizer {
     state: State,
-    result: Option<CharRef>,
     is_consumed_in_attribute: bool,
 
     num: u32,
@@ -56,12 +55,18 @@ pub(super) struct CharRefTokenizer {
     name_len: usize,
 }
 
+impl CharRef {
+    const EMPTY: CharRef = CharRef {
+        chars: ['\0', '\0'],
+        num_chars: 0,
+    };
+}
+
 impl CharRefTokenizer {
     pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
         CharRefTokenizer {
             is_consumed_in_attribute,
             state: State::Begin,
-            result: None,
             num: 0,
             num_too_big: false,
             seen_digit: false,
@@ -72,12 +77,6 @@ impl CharRefTokenizer {
         }
     }
 
-    // A CharRefTokenizer can only tokenize one character reference,
-    // so this method consumes the tokenizer.
-    pub(super) fn get_result(self) -> CharRef {
-        self.result.expect("get_result called before done")
-    }
-
     fn name_buf(&self) -> &StrTendril {
         self.name_buf_opt
             .as_ref()
@@ -90,20 +89,11 @@ impl CharRefTokenizer {
             .expect("name_buf missing in named character reference")
     }
 
-    fn finish_none(&mut self) -> Status {
-        self.result = Some(CharRef {
-            chars: ['\0', '\0'],
-            num_chars: 0,
-        });
-        Status::Done
-    }
-
     fn finish_one(&mut self, c: char) -> Status {
-        self.result = Some(CharRef {
+        Status::Done(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
-        });
-        Status::Done
+        })
     }
 }
 
@@ -113,10 +103,6 @@ impl CharRefTokenizer {
         tokenizer: &Tokenizer<Sink>,
         input: &BufferQueue,
     ) -> Status {
-        if self.result.is_some() {
-            return Status::Done;
-        }
-
         debug!("char ref tokenizer stepping in state {:?}", self.state);
         match self.state {
             State::Begin => self.do_begin(tokenizer, input),
@@ -144,7 +130,7 @@ impl CharRefTokenizer {
                 self.state = State::Octothorpe;
                 Status::Progress
             },
-            Some(_) => self.finish_none(),
+            Some(_) => Status::Done(CharRef::EMPTY),
             None => Status::Stuck,
         }
     }
@@ -228,7 +214,7 @@ impl CharRefTokenizer {
 
         input.push_front(unconsume);
         tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
-        self.finish_none()
+        Status::Done(CharRef::EMPTY)
    }
 
     fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
@@ -331,7 +317,7 @@ impl CharRefTokenizer {
                     _ => (),
                 }
                 self.unconsume_name(input);
-                self.finish_none()
+                Status::Done(CharRef::EMPTY)
             },
 
             Some((c1, c2)) => {
@@ -379,15 +365,14 @@ impl CharRefTokenizer {
 
                 if unconsume_all {
                     self.unconsume_name(input);
-                    self.finish_none()
+                    Status::Done(CharRef::EMPTY)
                 } else {
                     input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                     tokenizer.ignore_lf.set(false);
-                    self.result = Some(CharRef {
+                    Status::Done(CharRef {
                         chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                         num_chars: if c2 == 0 { 1 } else { 2 },
-                    });
-                    Status::Done
+                    })
                 }
             },
         }
@@ -411,34 +396,42 @@ impl CharRefTokenizer {
             _ => (),
         }
         self.unconsume_name(input);
-        self.finish_none()
+        Status::Done(CharRef::EMPTY)
     }
 
     pub(super) fn end_of_file<Sink: TokenSink>(
         &mut self,
         tokenizer: &Tokenizer<Sink>,
         input: &BufferQueue,
-    ) {
-        while self.result.is_none() {
-            match self.state {
-                State::Begin => drop(self.finish_none()),
-                State::Numeric(_) if !self.seen_digit => {
-                    self.unconsume_numeric(tokenizer, input);
-                },
+    ) -> CharRef {
+        loop {
+            let status = match self.state {
+                State::Begin => Status::Done(CharRef::EMPTY),
+                State::Numeric(_) if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
                 State::Numeric(_) | State::NumericSemicolon => {
                     tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
-                    self.finish_numeric(tokenizer);
+                    self.finish_numeric(tokenizer)
                 },
-                State::Named => drop(self.finish_named(tokenizer, input, None)),
+                State::Named => self.finish_named(tokenizer, input, None),
                 State::BogusName => {
                     self.unconsume_name(input);
-                    self.finish_none();
+                    Status::Done(CharRef::EMPTY)
                },
                 State::Octothorpe => {
                     input.push_front(StrTendril::from_slice("#"));
                     tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
-                    self.finish_none();
+                    Status::Done(CharRef::EMPTY)
+                },
+            };
+
+            match status {
+                Status::Done(char_ref) => {
+                    return char_ref;
+                },
+                Status::Stuck => {
+                    return CharRef::EMPTY;
                 },
+                Status::Progress => {},
            }
        }
    }
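The reworked end_of_file above is the other half of the change: instead of looping while self.result.is_none() and having each handler write into that field, it now loops until a handler returns a terminal Status and hands the payload straight back to the caller. A self-contained sketch of that drive-until-done loop, where the step closure is a hypothetical stand-in for the per-state handlers (do_begin, finish_numeric, finish_named, ...):

// Toy CharRef/Status mirroring the shape used in char_ref/mod.rs.
struct CharRef {
    chars: [char; 2],
    num_chars: u8,
}

impl CharRef {
    const EMPTY: CharRef = CharRef {
        chars: ['\0', '\0'],
        num_chars: 0,
    };
}

enum Status {
    Stuck,
    Progress,
    Done(CharRef),
}

// The same loop shape as the new end_of_file: keep stepping until a
// terminal status arrives, then return its payload directly.
fn resolve(mut step: impl FnMut() -> Status) -> CharRef {
    loop {
        match step() {
            Status::Done(char_ref) => return char_ref, // finished: return the result
            Status::Stuck => return CharRef::EMPTY,    // out of input: nothing resolved
            Status::Progress => {},                    // keep looping
        }
    }
}

fn main() {
    // Two Progress steps, then Done; the loop absorbs the progress
    // and returns the final CharRef.
    let mut remaining = 2;
    let result = resolve(|| {
        if remaining > 0 {
            remaining -= 1;
            Status::Progress
        } else {
            Status::Done(CharRef {
                chars: ['<', '\0'],
                num_chars: 1,
            })
        }
    });
    assert_eq!(result.num_chars, 1);
    println!("resolved to {:?}", &result.chars[..result.num_chars as usize]);

    // A resolver that immediately reports Stuck yields the EMPTY reference.
    assert_eq!(resolve(|| Status::Stuck).num_chars, 0);
}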

html5ever/src/tokenizer/mod.rs

Lines changed: 17 additions & 19 deletions
@@ -108,7 +108,7 @@ pub struct Tokenizer<Sink> {
 
     /// Tokenizer for character references, if we're tokenizing
     /// one at the moment.
-    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,
+    char_ref_tokenizer: RefCell<Option<CharRefTokenizer>>,
 
     /// Current input character. Just consumed, may reconsume.
     current_char: Cell<char>,
@@ -558,11 +558,14 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         }
     }
 
-    fn consume_char_ref(&self) {
-        *self.char_ref_tokenizer.borrow_mut() = Some(Box::new(CharRefTokenizer::new(matches!(
-            self.state.get(),
-            states::AttributeValue(_)
-        ))));
+    fn start_consuming_character_reference(&self) {
+        debug_assert!(
+            self.char_ref_tokenizer.borrow().is_none(),
+            "Nested character references are impossible"
+        );
+
+        let is_in_attribute = matches!(self.state.get(), states::AttributeValue(_));
+        *self.char_ref_tokenizer.borrow_mut() = Some(CharRefTokenizer::new(is_in_attribute));
     }
 
     fn emit_eof(&self) {
@@ -651,7 +654,7 @@ macro_rules! go (
     ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
     ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });
 
-    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(); return ProcessResult::Continue; });
+    ( $me:ident : consume_char_ref ) => ({ $me.start_consuming_character_reference(); return ProcessResult::Continue; });
 
     // We have a default next state after emitting a tag, but the sink can override.
     ( $me:ident : emit_tag $s:ident ) => ({
@@ -1660,22 +1663,18 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
     }
 
     fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
-        // FIXME HACK: Take and replace the tokenizer so we don't
-        // double-mut-borrow self. This is why it's boxed.
-        let mut tok = self.char_ref_tokenizer.take().unwrap();
-        let outcome = tok.step(self, input);
-
-        let progress = match outcome {
-            char_ref::Status::Done => {
-                self.process_char_ref(tok.get_result());
+        let mut char_ref_tokenizer = self.char_ref_tokenizer.borrow_mut();
+        let progress = match char_ref_tokenizer.as_mut().unwrap().step(self, input) {
+            char_ref::Status::Done(char_ref) => {
+                self.process_char_ref(char_ref);
+                *char_ref_tokenizer = None;
                 return ProcessResult::Continue;
             },
 
             char_ref::Status::Stuck => ProcessResult::Suspend,
             char_ref::Status::Progress => ProcessResult::Continue,
         };
 
-        *self.char_ref_tokenizer.borrow_mut() = Some(tok);
         progress
     }
 
@@ -1712,9 +1711,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
         let input = BufferQueue::default();
         match self.char_ref_tokenizer.take() {
             None => (),
-            Some(mut tok) => {
-                tok.end_of_file(self, &input);
-                self.process_char_ref(tok.get_result());
+            Some(mut tokenizer) => {
+                self.process_char_ref(tokenizer.end_of_file(self, &input));
             },
         }
 
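The diff above also retires the old "FIXME HACK" in step_char_ref_tokenizer: rather than take()-ing the boxed sub-tokenizer out of the RefCell and putting it back afterwards, the driver now holds one borrow_mut() on the RefCell<Option<CharRefTokenizer>> slot for the duration of a step and clears the slot in place once the reference is resolved. A toy model of that slot pattern, with illustrative Driver/SubTokenizer names rather than html5ever's types:

use std::cell::RefCell;

struct CharRef {
    num_chars: u8,
}

enum Status {
    Progress,
    Done(CharRef),
}

// Stand-in for CharRefTokenizer: finishes after a fixed number of steps.
struct SubTokenizer {
    steps_left: u32,
}

impl SubTokenizer {
    fn step(&mut self) -> Status {
        if self.steps_left > 0 {
            self.steps_left -= 1;
            Status::Progress
        } else {
            Status::Done(CharRef { num_chars: 1 })
        }
    }
}

// Stand-in for Tokenizer: owns the optional sub-tokenizer behind a RefCell.
struct Driver {
    sub: RefCell<Option<SubTokenizer>>,
}

impl Driver {
    // Returns true while the sub-tokenizer still needs more steps.
    fn step_sub(&self) -> bool {
        // A single borrow_mut() covers the whole step; no take-and-put-back.
        let mut slot = self.sub.borrow_mut();
        match slot.as_mut().expect("no sub-tokenizer running").step() {
            Status::Done(char_ref) => {
                println!("resolved {} char(s)", char_ref.num_chars);
                *slot = None; // finished: drop the sub-tokenizer in place
                false
            },
            Status::Progress => true,
        }
    }
}

fn main() {
    let driver = Driver {
        sub: RefCell::new(Some(SubTokenizer { steps_left: 2 })),
    };
    while driver.step_sub() {}
    assert!(driver.sub.borrow().is_none());
}

The pattern stays panic-free only if nothing reached from step() re-borrows the same RefCell slot; the new debug_assert! in start_consuming_character_reference documents the related invariant that character references never nest.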
0 commit comments
