|
9 | 9 |
|
10 | 10 | use super::codegen::{resolve_unique_hash_value, Node, DAFSA_NODES};
|
11 | 11 | use super::{CharRef, Status};
|
| 12 | +use crate::tokenizer::char_ref::codegen::compute_index_of_first_character; |
12 | 13 | use crate::tokenizer::TokenSink;
|
13 | 14 | use crate::tokenizer::Tokenizer;
|
14 | 15 | use markup5ever::buffer_queue::BufferQueue;
|
@@ -121,6 +122,23 @@ impl NamedReferenceTokenizerState {
|
121 | 122 | tokenizer: &Tokenizer<Sink>,
|
122 | 123 | input: &BufferQueue,
|
123 | 124 | ) -> Result<Status, StrTendril> {
|
| 125 | + // Optimization: If this is the first character in the named reference then |
| 126 | + // we know the index of the child node, and we also know that we can't possibly be done yet. |
| 127 | + // This saves us from having to do a linear search over all children of the root node. |
| 128 | + if !self.has_consumed_characters() { |
| 129 | + let Some(c) = tokenizer.peek(input) else { |
| 130 | + return Ok(Status::Stuck); |
| 131 | + }; |
| 132 | + tokenizer.discard_char(input); |
| 133 | + self.name_buffer.push_char(c); |
| 134 | + let Some(first_index) = compute_index_of_first_character(c) else { |
| 135 | + // First character not in the range a-z or A-Z |
| 136 | + return Err(mem::take(&mut self.name_buffer)); |
| 137 | + }; |
| 138 | + |
| 139 | + self.current_node = &DAFSA_NODES[first_index]; |
| 140 | + } |
| 141 | + |
124 | 142 | loop {
|
125 | 143 | let Some(c) = tokenizer.peek(input) else {
|
126 | 144 | return Ok(Status::Stuck);
|
@@ -196,6 +214,10 @@ impl NamedReferenceTokenizerState {
|
196 | 214 | }
|
197 | 215 | char_ref
|
198 | 216 | }
|
| 217 | + |
| 218 | + fn has_consumed_characters(&self) -> bool { |
| 219 | + !self.name_buffer.is_empty() || !self.last_match.is_none() |
| 220 | + } |
199 | 221 | }
|
200 | 222 |
|
201 | 223 | pub(crate) fn emit_name_error<Sink: TokenSink>(name: StrTendril, tokenizer: &Tokenizer<Sink>) {
|
|
0 commit comments