Tokenize named character references using a DAFSA #645
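For context, a DAFSA (deterministic acyclic finite state automaton) stores the entity-name dictionary as a compact node graph and supports the longest-prefix matching that named character references require: as the removed comments below note, `&notit` cannot continue past the valid prefix `&noti` and must fall back to the shorter full match `&not`. The sketch below shows that lookup over a toy two-entity dictionary; the node layout and helper are hypothetical illustrations, not the encoding this PR actually ships.

```rust
// Minimal sketch of longest-match lookup over a DAFSA-like node table.
// The layout (flat sibling lists, child indices) and the toy dictionary
// {"not", "notin"} are hypothetical, not this PR's real encoding.

#[derive(Clone, Copy)]
struct Node {
    ch: u8,            // character on the edge into this node
    is_match: bool,    // does the path to this node spell a full entity name?
    end_of_list: bool, // last sibling in this child list?
    first_child: u16,  // index of first child; 0 means no children
}

const NODES: &[Node] = &[
    Node { ch: 0,    is_match: false, end_of_list: true, first_child: 1 }, // root
    Node { ch: b'n', is_match: false, end_of_list: true, first_child: 2 },
    Node { ch: b'o', is_match: false, end_of_list: true, first_child: 3 },
    Node { ch: b't', is_match: true,  end_of_list: true, first_child: 4 }, // "not"
    Node { ch: b'i', is_match: false, end_of_list: true, first_child: 5 },
    Node { ch: b'n', is_match: true,  end_of_list: true, first_child: 0 }, // "notin"
];

/// Walk the automaton one byte at a time, remembering the length of the
/// longest full match seen so far.
fn longest_match(input: &str) -> Option<usize> {
    let mut node = 0usize;
    let mut longest = None;
    for (i, b) in input.bytes().enumerate() {
        let mut child = NODES[node].first_child as usize;
        if child == 0 {
            break; // no outgoing edges: the match cannot be extended
        }
        loop {
            if NODES[child].ch == b {
                node = child;
                if NODES[child].is_match {
                    longest = Some(i + 1);
                }
                break;
            }
            if NODES[child].end_of_list {
                return longest; // no edge for this byte
            }
            child += 1;
        }
    }
    longest
}

fn main() {
    assert_eq!(longest_match("notit"), Some(3)); // falls back to "not"
    assert_eq!(longest_match("notin"), Some(5));
    assert_eq!(longest_match("nope"), None);
}
```

Because all states are shared and acyclic, a DAFSA is far smaller than a per-name hash table like the `phf`-based `NAMED_ENTITIES` map this PR removes, while still answering "full match, prefix, or dead end" in one step per character.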

Open · wants to merge 12 commits into main
7 changes: 3 additions & 4 deletions Cargo.toml
@@ -7,7 +7,7 @@ members = [
"rcdom",
"xml5ever",
"match_token"
]
, "html_named_entities"]

[workspace.package]
version = "0.35.0"
@@ -24,6 +24,7 @@ match_token = { version = "0.35.0", path = "match_token" }
markup5ever = { version = "0.35.0", path = "markup5ever" }
xml5ever = { version = "0.35.0", path = "xml5ever" }
html5ever = { version = "0.35.0", path = "html5ever" }
html_named_entities = { version = "0.1", path = "html_named_entities"}

# External dependencies
syn = { version = "2", features = ["full"] }
@@ -40,9 +41,7 @@ phf_codegen = "0.11"
# Dev dependencies
criterion = "0.6"
libtest-mimic = "0.8.1"
serde = { version = "1", features = ["derive"] }
serde_json = "1.0"
env_logger = "0.10"
typed-arena = "2.0.2"



7 changes: 7 additions & 0 deletions RELEASING.MD
@@ -21,6 +21,13 @@ The **web_atoms** crate is on a separate cycle as it needs frequent releases but
- Publish the new version of **web_atoms**
- Optionally: publish a new version of the other crates to match

## Making a release of **html_named_entities**:

- Bump the version in `html_named_entities/Cargo.toml`
- Update the version of **html_named_entities** in the workspace `Cargo.toml`'s `[workspace.dependencies]` section to match
- Publish the new version of **html_named_entities**
- Optionally: publish a new version of the other crates to match

## Making a release of all other crates

In the workspace `Cargo.toml`:
1 change: 1 addition & 0 deletions html5ever/Cargo.toml
@@ -18,6 +18,7 @@ trace_tokenizer = []
[dependencies]
markup5ever = { workspace = true }
match_token = { workspace = true }
html_named_entities = { workspace = true }
log = { workspace = true }

[dev-dependencies]
1 change: 1 addition & 0 deletions html5ever/benches/html5ever.rs
@@ -80,6 +80,7 @@ fn html5ever_benchmark(c: &mut Criterion) {
run_bench(c, "small-fragment.html");
run_bench(c, "tiny-fragment.html");
run_bench(c, "strong.html");
run_bench(c, "entities.html");
}

criterion_group!(benches, html5ever_benchmark);
1 change: 1 addition & 0 deletions html5ever/data/bench/entities.html

Large diffs are not rendered by default.

250 changes: 57 additions & 193 deletions html5ever/src/tokenizer/char_ref/mod.rs
@@ -1,4 +1,4 @@
// Copyright 2014-2017 The html5ever Project Developers. See the
// Copyright 2014-2025 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
@@ -12,18 +12,14 @@ use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use html_named_entities::{
format_name_error, CharRef, NamedReferenceTokenizationResult, NamedReferenceTokenizerState,
};

use log::debug;
use std::borrow::Cow::{self, Borrowed};
use std::char::from_u32;

//§ tokenizing-character-references
pub(super) struct CharRef {
/// The resulting character(s)
pub(super) chars: [char; 2],

/// How many slots in `chars` are valid?
pub(super) num_chars: u8,
}
use std::mem;

pub(super) enum Status {
Stuck,
@@ -37,8 +33,8 @@ enum State {
Octothorpe,
Numeric(u32), // base
NumericSemicolon,
Named,
BogusName,
Named(NamedReferenceTokenizerState),
BogusName(StrTendril),
}

pub(super) struct CharRefTokenizer {
@@ -49,17 +45,6 @@ pub(super) struct CharRefTokenizer {
num_too_big: bool,
seen_digit: bool,
hex_marker: Option<char>,

name_buf_opt: Option<StrTendril>,
name_match: Option<(u32, u32)>,
name_len: usize,
}

impl CharRef {
const EMPTY: CharRef = CharRef {
chars: ['\0', '\0'],
num_chars: 0,
};
}

impl CharRefTokenizer {
@@ -71,24 +56,9 @@ impl CharRefTokenizer {
num_too_big: false,
seen_digit: false,
hex_marker: None,
name_buf_opt: None,
name_match: None,
name_len: 0,
}
}

fn name_buf(&self) -> &StrTendril {
self.name_buf_opt
.as_ref()
.expect("name_buf missing in named character reference")
}

fn name_buf_mut(&mut self) -> &mut StrTendril {
self.name_buf_opt
.as_mut()
.expect("name_buf missing in named character reference")
}

fn finish_one(&mut self, c: char) -> Status {
Status::Done(CharRef {
chars: [c, '\0'],
@@ -109,8 +79,40 @@ impl CharRefTokenizer {
State::Octothorpe => self.do_octothorpe(tokenizer, input),
State::Numeric(base) => self.do_numeric(tokenizer, input, base),
State::NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
State::Named => self.do_named(tokenizer, input),
State::BogusName => self.do_bogus_name(tokenizer, input),
State::Named(ref mut named_tokenizer) => loop {
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);

match named_tokenizer.feed_character(c, input, |error| tokenizer.emit_error(error))
{
NamedReferenceTokenizationResult::Success(reference) => {
return Status::Done(reference);
},
NamedReferenceTokenizationResult::Failed(characters) => {
self.state = State::BogusName(characters.into());
return Status::Progress;
},
NamedReferenceTokenizationResult::Continue => {},
}
},
State::BogusName(ref mut invalid_name) => {
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);
invalid_name.push_char(c);
match c {
_ if c.is_ascii_alphanumeric() => return Status::Progress,
';' => {
tokenizer.emit_error(Cow::from(format_name_error(&invalid_name)));
},
_ => (),
}
input.push_front(mem::take(invalid_name));
Status::Done(CharRef::EMPTY)
},
}
}

@@ -121,8 +123,9 @@
) -> Status {
match tokenizer.peek(input) {
Some('a'..='z' | 'A'..='Z' | '0'..='9') => {
self.state = State::Named;
self.name_buf_opt = Some(StrTendril::new());
self.state = State::Named(NamedReferenceTokenizerState::new(
self.is_consumed_in_attribute,
));
Status::Progress
},
Some('#') => {
@@ -253,175 +256,36 @@ impl CharRefTokenizer {
self.finish_one(c)
}

fn do_named<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
) -> Status {
// peek + discard skips over newline normalization, therefore making it easier to
// un-consume
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);
self.name_buf_mut().push_char(c);
match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
// We have either a full match or a prefix of one.
Some(&m) => {
if m.0 != 0 {
// We have a full match, but there might be a longer one to come.
self.name_match = Some(m);
self.name_len = self.name_buf().len();
}
// Otherwise we just have a prefix match.
Status::Progress
},

// Can't continue the match.
None => self.finish_named(tokenizer, input, Some(c)),
}
}

fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
let msg = if tokenizer.opts.exact_errors {
Cow::from(format!("Invalid character reference &{}", self.name_buf()))
} else {
Cow::from("Invalid character reference")
};
tokenizer.emit_error(msg);
}

fn unconsume_name(&mut self, input: &BufferQueue) {
input.push_front(self.name_buf_opt.take().unwrap());
}

fn finish_named<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
end_char: Option<char>,
) -> Status {
match self.name_match {
None => {
match end_char {
Some(c) if c.is_ascii_alphanumeric() => {
// Keep looking for a semicolon, to determine whether
// we emit a parse error.
self.state = State::BogusName;
return Status::Progress;
},

// Check length because &; is not a parse error.
Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

_ => (),
}
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
},

Some((c1, c2)) => {
// We have a complete match, but we may have consumed
// additional characters into self.name_buf. Usually
// at least one, but several in cases like
//
// &not => match for U+00AC
// &noti => valid prefix for &notin
// &notit => can't continue match

let name_len = self.name_len;
assert!(name_len > 0);
let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

// There might not be a next character after the match, if
// we had a full match and then hit EOF.
let next_after = if name_len == self.name_buf().len() {
None
} else {
Some(self.name_buf()[name_len..].chars().next().unwrap())
};

// If the character reference was consumed as part of an attribute, and the last
// character matched is not a U+003B SEMICOLON character (;), and the next input
// character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
// then, for historical reasons, flush code points consumed as a character
// reference and switch to the return state.

let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
{
(_, ';', _) => false,
(true, _, Some('=')) => true,
(true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
_ => {
// 1. If the last character matched is not a U+003B SEMICOLON character
// (;), then this is a missing-semicolon-after-character-reference parse
// error.
tokenizer.emit_error(Borrowed(
"Character reference does not end with semicolon",
));
false
},
};

if unconsume_all {
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
} else {
input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
tokenizer.ignore_lf.set(false);
Status::Done(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
num_chars: if c2 == 0 { 1 } else { 2 },
})
}
},
}
}

fn do_bogus_name<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
) -> Status {
// peek + discard skips over newline normalization, therefore making it easier to
// un-consume
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);
self.name_buf_mut().push_char(c);
match c {
_ if c.is_ascii_alphanumeric() => return Status::Progress,
';' => self.emit_name_error(tokenizer),
_ => (),
}
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
}

pub(super) fn end_of_file<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
) -> CharRef {
loop {
let status = match self.state {
let status = match &mut self.state {
State::Begin => Status::Done(CharRef::EMPTY),
State::Numeric(_) if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
State::Numeric(_) | State::NumericSemicolon => {
tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
self.finish_numeric(tokenizer)
},
State::Named => self.finish_named(tokenizer, input, None),
State::BogusName => {
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
State::Named(state) => {
return state
.notify_end_of_file(input, |error| tokenizer.emit_error(error))
.unwrap_or(CharRef::EMPTY)
},
State::Octothorpe => {
input.push_front(StrTendril::from_slice("#"));
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
Status::Done(CharRef::EMPTY)
},
State::BogusName(bogus_name) => {
input.push_front(bogus_name.clone());
if bogus_name.ends_with(';') {
tokenizer.emit_error(Cow::from(format_name_error(&bogus_name)));
}
Status::Done(CharRef::EMPTY)
},
};

match status {
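The diff above replaces the old `do_named`/`finish_named` matching against `data::NAMED_ENTITIES` with a `NamedReferenceTokenizerState` that is fed one character at a time and reports `Continue`, `Success`, or `Failed`. Below is a stand-alone sketch of that pull-based contract; the `Matcher` type, its single-entity dictionary, and `FeedResult` are simplified stand-ins, not the real `html_named_entities` API (which, for one, yields a `CharRef` of up to two characters rather than a single `char`).

```rust
// Sketch of the feed-one-character-at-a-time contract the new
// State::Named arm relies on. All types here are hypothetical stand-ins.

enum FeedResult {
    Continue,       // still a prefix of some entity name: feed another character
    Success(char),  // resolved a reference (real API returns up to two chars)
    Failed(String), // dead end: consumed characters handed back for un-consuming
}

struct Matcher {
    consumed: String,
}

impl Matcher {
    fn new() -> Self {
        Matcher { consumed: String::new() }
    }

    // Toy matcher that only knows the entity "amp;" => '&'.
    fn feed_character(&mut self, c: char) -> FeedResult {
        self.consumed.push(c);
        if self.consumed == "amp;" {
            FeedResult::Success('&')
        } else if "amp;".starts_with(self.consumed.as_str()) {
            FeedResult::Continue
        } else {
            FeedResult::Failed(std::mem::take(&mut self.consumed))
        }
    }
}

fn main() {
    let mut matcher = Matcher::new();
    let mut resolved = None;
    for c in "amp;".chars() {
        match matcher.feed_character(c) {
            FeedResult::Continue => {}
            FeedResult::Success(ch) => {
                resolved = Some(ch);
                break;
            }
            FeedResult::Failed(chars) => {
                // The caller would push `chars` back onto the input queue.
                let _ = chars;
                break;
            }
        }
    }
    assert_eq!(resolved, Some('&'));
}
```

Handing the consumed characters back on failure is what lets the caller push them onto the front of the `BufferQueue` and fall into the bogus-name state, exactly as the new `State::BogusName(characters.into())` arm does above.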
3 changes: 2 additions & 1 deletion html5ever/src/tokenizer/mod.rs
@@ -19,7 +19,8 @@ use self::states::{DoubleEscaped, Escaped};
use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};

use self::char_ref::{CharRef, CharRefTokenizer};
use self::char_ref::CharRefTokenizer;
use html_named_entities::CharRef;

use crate::util::str::lower_ascii_letter;
