Skip to content

Commit b49d28c

Browse files
committed
Skip linear search over DAFSA node children at first layer
Signed-off-by: Simon Wülker <[email protected]>
1 parent 38bdcf8 commit b49d28c

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

html5ever/src/tokenizer/char_ref/codegen.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,16 @@ pub(crate) fn resolve_unique_hash_value(value: usize) -> CharRef {
114114
num_chars,
115115
}
116116
}
117+
118+
pub(crate) fn compute_index_of_first_character(c: char) -> Option<usize> {
119+
debug_assert!(c.is_ascii_alphabetic());
120+
121+
let index = c as u32 as u8;
122+
if c.is_ascii_uppercase() {
123+
Some((index - b'A') as usize)
124+
} else if c.is_ascii_lowercase() {
125+
Some((index - b'a') as usize)
126+
} else {
127+
None
128+
}
129+
}

html5ever/src/tokenizer/char_ref/named.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
use super::codegen::{resolve_unique_hash_value, Node, DAFSA_NODES};
1111
use super::{CharRef, Status};
12+
use crate::tokenizer::char_ref::codegen::compute_index_of_first_character;
1213
use crate::tokenizer::TokenSink;
1314
use crate::tokenizer::Tokenizer;
1415
use markup5ever::buffer_queue::BufferQueue;
@@ -121,6 +122,23 @@ impl NamedReferenceTokenizerState {
121122
tokenizer: &Tokenizer<Sink>,
122123
input: &BufferQueue,
123124
) -> Result<Status, StrTendril> {
125+
// Optimization: If this is the first character in the named reference then
126+
// we know the index of the child node, and we also know that we can't possibly be done yet.
127+
// This saves us from having to do a linear search over all children of the root node.
128+
if !self.has_consumed_characters() {
129+
let Some(c) = tokenizer.peek(input) else {
130+
return Ok(Status::Stuck);
131+
};
132+
tokenizer.discard_char(input);
133+
self.name_buffer.push_char(c);
134+
let Some(first_index) = compute_index_of_first_character(c) else {
135+
// First character not in the range a-z or A-Z
136+
return Err(mem::take(&mut self.name_buffer));
137+
};
138+
139+
self.current_node = &DAFSA_NODES[first_index];
140+
}
141+
124142
loop {
125143
let Some(c) = tokenizer.peek(input) else {
126144
return Ok(Status::Stuck);
@@ -196,6 +214,10 @@ impl NamedReferenceTokenizerState {
196214
}
197215
char_ref
198216
}
217+
218+
fn has_consumed_characters(&self) -> bool {
219+
!self.name_buffer.is_empty() || !self.last_match.is_none()
220+
}
199221
}
200222

201223
pub(crate) fn emit_name_error<Sink: TokenSink>(name: StrTendril, tokenizer: &Tokenizer<Sink>) {

0 commit comments

Comments
 (0)