Tokenize named character references using a DAFSA #645
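For context, a DAFSA (deterministic acyclic finite state automaton) stores the entity-name dictionary as a compact node graph and supports the longest-prefix matching that named character references require: as the removed comments below note, `&notit` cannot continue past the valid prefix `&noti` and must fall back to the shorter full match `&not`. The sketch below shows that lookup over a toy two-entity dictionary; the node layout and helper are hypothetical illustrations, not the encoding this PR actually ships.

```rust
// Minimal sketch of longest-match lookup over a DAFSA-like node table.
// The layout (flat sibling lists, child indices) and the toy dictionary
// {"not", "notin"} are hypothetical, not this PR's real encoding.

#[derive(Clone, Copy)]
struct Node {
    ch: u8,            // character on the edge into this node
    is_match: bool,    // does the path to this node spell a full entity name?
    end_of_list: bool, // last sibling in this child list?
    first_child: u16,  // index of first child; 0 means no children
}

const NODES: &[Node] = &[
    Node { ch: 0,    is_match: false, end_of_list: true, first_child: 1 }, // root
    Node { ch: b'n', is_match: false, end_of_list: true, first_child: 2 },
    Node { ch: b'o', is_match: false, end_of_list: true, first_child: 3 },
    Node { ch: b't', is_match: true,  end_of_list: true, first_child: 4 }, // "not"
    Node { ch: b'i', is_match: false, end_of_list: true, first_child: 5 },
    Node { ch: b'n', is_match: true,  end_of_list: true, first_child: 0 }, // "notin"
];

/// Walk the automaton one byte at a time, remembering the length of the
/// longest full match seen so far.
fn longest_match(input: &str) -> Option<usize> {
    let mut node = 0usize;
    let mut longest = None;
    for (i, b) in input.bytes().enumerate() {
        let mut child = NODES[node].first_child as usize;
        if child == 0 {
            break; // no outgoing edges: the match cannot be extended
        }
        loop {
            if NODES[child].ch == b {
                node = child;
                if NODES[child].is_match {
                    longest = Some(i + 1);
                }
                break;
            }
            if NODES[child].end_of_list {
                return longest; // no edge for this byte
            }
            child += 1;
        }
    }
    longest
}

fn main() {
    assert_eq!(longest_match("notit"), Some(3)); // falls back to "not"
    assert_eq!(longest_match("notin"), Some(5));
    assert_eq!(longest_match("nope"), None);
}
```

Because all states are shared and acyclic, a DAFSA is far smaller than a per-name hash table like the `phf`-based `NAMED_ENTITIES` map this PR removes, while still answering "full match, prefix, or dead end" in one step per character.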

Open · wants to merge 12 commits into main
7 changes: 3 additions & 4 deletions Cargo.toml
@@ -7,7 +7,7 @@ members = [
"rcdom",
"xml5ever",
"match_token"
]
, "html_named_entities"]

[workspace.package]
version = "0.35.0"
@@ -24,6 +24,7 @@ match_token = { version = "0.35.0", path = "match_token" }
markup5ever = { version = "0.35.0", path = "markup5ever" }
xml5ever = { version = "0.35.0", path = "xml5ever" }
html5ever = { version = "0.35.0", path = "html5ever" }
html_named_entities = { version = "0.1", path = "html_named_entities"}

# External dependencies
syn = { version = "2", features = ["full"] }
@@ -40,9 +41,7 @@ phf_codegen = "0.11"
# Dev dependencies
criterion = "0.6"
libtest-mimic = "0.8.1"
serde = { version = "1", features = ["derive"] }
serde_json = "1.0"
env_logger = "0.10"
typed-arena = "2.0.2"



7 changes: 7 additions & 0 deletions RELEASING.MD
@@ -21,6 +21,13 @@ The **web_atoms** crate is on a separate cycle as it needs frequent releases but
- Publish the new version of **web_atoms**
- Optionally: publish a new version of the other crates to match

## Making a release of **html_named_entities**:

- Bump the version in `html_named_entities/Cargo.toml`
- Update the version of **html_named_entities** in the workspace `Cargo.toml`'s `[workspace.dependencies]` section to match
- Publish the new version of **html_named_entities**
- Optionally: publish a new version of the other crates to match

## Making a release of all other crates

In the workspace `Cargo.toml`:
1 change: 1 addition & 0 deletions html5ever/Cargo.toml
@@ -18,6 +18,7 @@ trace_tokenizer = []
[dependencies]
markup5ever = { workspace = true }
match_token = { workspace = true }
html_named_entities = { workspace = true }
log = { workspace = true }

[dev-dependencies]
1 change: 1 addition & 0 deletions html5ever/benches/html5ever.rs
@@ -80,6 +80,7 @@ fn html5ever_benchmark(c: &mut Criterion) {
run_bench(c, "small-fragment.html");
run_bench(c, "tiny-fragment.html");
run_bench(c, "strong.html");
run_bench(c, "entities.html");
}

criterion_group!(benches, html5ever_benchmark);
1 change: 1 addition & 0 deletions html5ever/data/bench/entities.html

Large diffs are not rendered by default.

250 changes: 57 additions & 193 deletions html5ever/src/tokenizer/char_ref/mod.rs
@@ -1,4 +1,4 @@
// Copyright 2014-2017 The html5ever Project Developers. See the
// Copyright 2014-2025 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
@@ -12,18 +12,14 @@ use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use html_named_entities::{
format_name_error, CharRef, NamedReferenceTokenizationResult, NamedReferenceTokenizerState,
};

use log::debug;
use std::borrow::Cow::{self, Borrowed};
use std::char::from_u32;

//§ tokenizing-character-references
pub(super) struct CharRef {
/// The resulting character(s)
pub(super) chars: [char; 2],

/// How many slots in `chars` are valid?
pub(super) num_chars: u8,
}
use std::mem;

pub(super) enum Status {
Stuck,
@@ -37,8 +33,8 @@ enum State {
Octothorpe,
Numeric(u32), // base
NumericSemicolon,
Named,
BogusName,
Named(NamedReferenceTokenizerState),
BogusName(StrTendril),
}

pub(super) struct CharRefTokenizer {
@@ -49,17 +45,6 @@ pub(super) struct CharRefTokenizer {
num_too_big: bool,
seen_digit: bool,
hex_marker: Option<char>,

name_buf_opt: Option<StrTendril>,
name_match: Option<(u32, u32)>,
name_len: usize,
}

impl CharRef {
const EMPTY: CharRef = CharRef {
chars: ['\0', '\0'],
num_chars: 0,
};
}

impl CharRefTokenizer {
@@ -71,24 +56,9 @@ impl CharRefTokenizer {
num_too_big: false,
seen_digit: false,
hex_marker: None,
name_buf_opt: None,
name_match: None,
name_len: 0,
}
}

fn name_buf(&self) -> &StrTendril {
self.name_buf_opt
.as_ref()
.expect("name_buf missing in named character reference")
}

fn name_buf_mut(&mut self) -> &mut StrTendril {
self.name_buf_opt
.as_mut()
.expect("name_buf missing in named character reference")
}

fn finish_one(&mut self, c: char) -> Status {
Status::Done(CharRef {
chars: [c, '\0'],
@@ -109,8 +79,40 @@ impl CharRefTokenizer {
State::Octothorpe => self.do_octothorpe(tokenizer, input),
State::Numeric(base) => self.do_numeric(tokenizer, input, base),
State::NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
State::Named => self.do_named(tokenizer, input),
State::BogusName => self.do_bogus_name(tokenizer, input),
State::Named(ref mut named_tokenizer) => loop {
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);

match named_tokenizer.feed_character(c, input, |error| tokenizer.emit_error(error))
{
NamedReferenceTokenizationResult::Success(reference) => {
return Status::Done(reference);
},
NamedReferenceTokenizationResult::Failed(characters) => {
self.state = State::BogusName(characters.into());
return Status::Progress;
},
NamedReferenceTokenizationResult::Continue => {},
}
},
State::BogusName(ref mut invalid_name) => {
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);
invalid_name.push_char(c);
match c {
_ if c.is_ascii_alphanumeric() => return Status::Progress,
';' => {
tokenizer.emit_error(Cow::from(format_name_error(&invalid_name)));
},
_ => (),
}
input.push_front(mem::take(invalid_name));
Status::Done(CharRef::EMPTY)
},
}
}

@@ -121,8 +123,9 @@
) -> Status {
match tokenizer.peek(input) {
Some('a'..='z' | 'A'..='Z' | '0'..='9') => {
self.state = State::Named;
self.name_buf_opt = Some(StrTendril::new());
self.state = State::Named(NamedReferenceTokenizerState::new(
self.is_consumed_in_attribute,
));
Status::Progress
},
Some('#') => {
@@ -253,175 +256,36 @@ impl CharRefTokenizer {
self.finish_one(c)
}

fn do_named<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
) -> Status {
// peek + discard skips over newline normalization, therefore making it easier to
// un-consume
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);
self.name_buf_mut().push_char(c);
match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
// We have either a full match or a prefix of one.
Some(&m) => {
if m.0 != 0 {
// We have a full match, but there might be a longer one to come.
self.name_match = Some(m);
self.name_len = self.name_buf().len();
}
// Otherwise we just have a prefix match.
Status::Progress
},

// Can't continue the match.
None => self.finish_named(tokenizer, input, Some(c)),
}
}

fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
let msg = if tokenizer.opts.exact_errors {
Cow::from(format!("Invalid character reference &{}", self.name_buf()))
} else {
Cow::from("Invalid character reference")
};
tokenizer.emit_error(msg);
}

fn unconsume_name(&mut self, input: &BufferQueue) {
input.push_front(self.name_buf_opt.take().unwrap());
}

fn finish_named<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
end_char: Option<char>,
) -> Status {
match self.name_match {
None => {
match end_char {
Some(c) if c.is_ascii_alphanumeric() => {
// Keep looking for a semicolon, to determine whether
// we emit a parse error.
self.state = State::BogusName;
return Status::Progress;
},

// Check length because &; is not a parse error.
Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

_ => (),
}
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
},

Some((c1, c2)) => {
// We have a complete match, but we may have consumed
// additional characters into self.name_buf. Usually
// at least one, but several in cases like
//
// &not => match for U+00AC
// &noti => valid prefix for &notin
// &notit => can't continue match

let name_len = self.name_len;
assert!(name_len > 0);
let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

// There might not be a next character after the match, if
// we had a full match and then hit EOF.
let next_after = if name_len == self.name_buf().len() {
None
} else {
Some(self.name_buf()[name_len..].chars().next().unwrap())
};

// If the character reference was consumed as part of an attribute, and the last
// character matched is not a U+003B SEMICOLON character (;), and the next input
// character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
// then, for historical reasons, flush code points consumed as a character
// reference and switch to the return state.

let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
{
(_, ';', _) => false,
(true, _, Some('=')) => true,
(true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
_ => {
// 1. If the last character matched is not a U+003B SEMICOLON character
// (;), then this is a missing-semicolon-after-character-reference parse
// error.
tokenizer.emit_error(Borrowed(
"Character reference does not end with semicolon",
));
false
},
};

if unconsume_all {
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
} else {
input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
tokenizer.ignore_lf.set(false);
Status::Done(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
num_chars: if c2 == 0 { 1 } else { 2 },
})
}
},
}
}

fn do_bogus_name<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
) -> Status {
// peek + discard skips over newline normalization, therefore making it easier to
// un-consume
let Some(c) = tokenizer.peek(input) else {
return Status::Stuck;
};
tokenizer.discard_char(input);
self.name_buf_mut().push_char(c);
match c {
_ if c.is_ascii_alphanumeric() => return Status::Progress,
';' => self.emit_name_error(tokenizer),
_ => (),
}
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
}

pub(super) fn end_of_file<Sink: TokenSink>(
&mut self,
tokenizer: &Tokenizer<Sink>,
input: &BufferQueue,
) -> CharRef {
loop {
let status = match self.state {
let status = match &mut self.state {
State::Begin => Status::Done(CharRef::EMPTY),
State::Numeric(_) if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
State::Numeric(_) | State::NumericSemicolon => {
tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
self.finish_numeric(tokenizer)
},
State::Named => self.finish_named(tokenizer, input, None),
State::BogusName => {
self.unconsume_name(input);
Status::Done(CharRef::EMPTY)
State::Named(state) => {
return state
.notify_end_of_file(input, |error| tokenizer.emit_error(error))
.unwrap_or(CharRef::EMPTY)
},
State::Octothorpe => {
input.push_front(StrTendril::from_slice("#"));
tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
Status::Done(CharRef::EMPTY)
},
State::BogusName(bogus_name) => {
input.push_front(bogus_name.clone());
if bogus_name.ends_with(';') {
tokenizer.emit_error(Cow::from(format_name_error(&bogus_name)));
}
Status::Done(CharRef::EMPTY)
},
};

match status {
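The diff above replaces the old `do_named`/`finish_named` matching against `data::NAMED_ENTITIES` with a `NamedReferenceTokenizerState` that is fed one character at a time and reports `Continue`, `Success`, or `Failed`. Below is a stand-alone sketch of that pull-based contract; the `Matcher` type, its single-entity dictionary, and `FeedResult` are simplified stand-ins, not the real `html_named_entities` API (which, for one, yields a `CharRef` of up to two characters rather than a single `char`).

```rust
// Sketch of the feed-one-character-at-a-time contract the new
// State::Named arm relies on. All types here are hypothetical stand-ins.

enum FeedResult {
    Continue,       // still a prefix of some entity name: feed another character
    Success(char),  // resolved a reference (real API returns up to two chars)
    Failed(String), // dead end: consumed characters handed back for un-consuming
}

struct Matcher {
    consumed: String,
}

impl Matcher {
    fn new() -> Self {
        Matcher { consumed: String::new() }
    }

    // Toy matcher that only knows the entity "amp;" => '&'.
    fn feed_character(&mut self, c: char) -> FeedResult {
        self.consumed.push(c);
        if self.consumed == "amp;" {
            FeedResult::Success('&')
        } else if "amp;".starts_with(self.consumed.as_str()) {
            FeedResult::Continue
        } else {
            FeedResult::Failed(std::mem::take(&mut self.consumed))
        }
    }
}

fn main() {
    let mut matcher = Matcher::new();
    let mut resolved = None;
    for c in "amp;".chars() {
        match matcher.feed_character(c) {
            FeedResult::Continue => {}
            FeedResult::Success(ch) => {
                resolved = Some(ch);
                break;
            }
            FeedResult::Failed(chars) => {
                // The caller would push `chars` back onto the input queue.
                let _ = chars;
                break;
            }
        }
    }
    assert_eq!(resolved, Some('&'));
}
```

Handing the consumed characters back on failure is what lets the caller push them onto the front of the `BufferQueue` and fall into the bogus-name state, exactly as the new `State::BogusName(characters.into())` arm does above.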
3 changes: 2 additions & 1 deletion html5ever/src/tokenizer/mod.rs
@@ -19,7 +19,8 @@ use self::states::{DoubleEscaped, Escaped};
use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};

use self::char_ref::{CharRef, CharRefTokenizer};
use self::char_ref::CharRefTokenizer;
use html_named_entities::CharRef;

use crate::util::str::lower_ascii_letter;
