Use Earley parser in 2015 day 19 part 2

ictrobot · ictrobot · commit c697f69b7f22 · 2025-05-17T13:04:17.000+01:00
This ensures the molecule can actually be created from the provided rules and is still fast (<1ms locally).
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,7 +10,7 @@ edition = "2024"
 license = "MIT"
 publish = false
 repository = "https://github.com/ictrobot/aoc-rs"
-rust-version = "1.85.0"
+rust-version = "1.86.0"
 
 [workspace.lints.clippy]
 pedantic = { level = "warn", priority = -1 }
diff --git a/crates/utils/src/lib.rs b/crates/utils/src/lib.rs
@@ -27,5 +27,5 @@ pub use wasm::multithreading;
 pub mod prelude {
     pub use crate::examples;
     pub use crate::input::{InputError, InputType, MapWithInputExt as _};
-    pub use crate::parser::{self, Parser as _};
+    pub use crate::parser::{self, Parseable as _, Parser as _};
 }
diff --git a/crates/utils/src/parser/macros.rs b/crates/utils/src/parser/macros.rs
@@ -9,10 +9,12 @@
 /// Using this makes [2017 day 11](../../year2017/struct.Day11.html), which parses a sequence of
 /// literals separated by commas, over 2x faster.
 ///
+/// See also [`parser::parsable_enum!`](crate::parser::parsable_enum), which provides a macro to
+/// define an enum and literal parser together.
+///
 /// # Examples
 /// ```
 /// # use utils::parser::{Parser, self};
-///
 /// #[derive(Debug, PartialEq)]
 /// enum Example {
 ///     A,
@@ -36,7 +38,7 @@ macro_rules! parser_literal_map {
     (
         $($($l:literal)|+ => $e:expr),+$(,)?
     ) => {{
-        fn coerce_to_parser<F: Fn(&[u8]) -> $crate::parser::ParseResult<'_, O>, O>(f: F) -> F { f }
+        const fn coerce_to_parser<F: Fn(&[u8]) -> $crate::parser::ParseResult<'_, O>, O>(f: F) -> F { f }
 
         coerce_to_parser(|input| {
             $($(
@@ -56,6 +58,58 @@ macro_rules! parser_literal_map {
     };
 }
 
+/// Macro to define an enum that implements [`Parseable`](crate::parser::Parseable).
+///
+/// The parser is implemented using [`parser::literal_map!`](crate::parser::literal_map).
+///
+/// # Examples
+/// ```
+/// # use utils::parser::{Parser, Parseable, self};
+/// parser::parsable_enum! {
+///     #[derive(Debug, PartialEq, Default)]
+///     enum Direction {
+///         #[default]
+///         "north" | "n" => North,
+///         "south" | "s" => South,
+///         "east" | "e" => East,
+///         "west" | "w" => West,
+///     }
+/// }
+///
+/// assert_eq!(Direction::PARSER.parse(b"north"), Ok((Direction::North, &b""[..])));
+/// assert_eq!(Direction::PARSER.parse(b"s"), Ok((Direction::South, &b""[..])));
+/// assert!(Direction::PARSER.parse(b"a").is_err());
+/// ```
+#[macro_export]
+macro_rules! parser_parsable_enum {
+    (
+        $(#[$enum_meta:meta])*
+        enum $name:ident {$(
+            $(#[$meta:meta])*
+            $($l:literal)|+ => $variant:ident $(= $value:expr)?,
+        )+}
+    ) => {
+        $(#[$enum_meta])*
+        pub enum $name {$(
+            $(#[$meta])*
+            $variant $(= $value)?,
+        )+}
+
+        impl $name {
+            const ALL: &'static [$name] = &[$(
+                Self::$variant,
+            )+];
+        }
+
+        impl $crate::parser::Parseable for $name {
+            type Parser = for<'a> fn(&'a [u8]) -> $crate::parser::ParseResult<'a, Self>;
+            const PARSER: Self::Parser = $crate::parser_literal_map!($(
+                $($l)|+ => Self::$variant,
+            )+);
+        }
+    };
+}
+
 /// Macro to define a custom parser using a `match` inspired parse tree syntax.
 ///
 /// Each rule is made up of a list of chained parsers enclosed in brackets on the left-hand side.
@@ -203,7 +257,7 @@ macro_rules! parser_parse_tree {
 
     // Ensures this branch only matches inputs starting with (, giving each rule set a unique prefix
     (($($first:tt)+) $($tail:tt)+) => {{
-        fn coerce_to_parser<F: Fn(&[u8]) -> $crate::parser::ParseResult<'_, O>, O>(f: F) -> F { f }
+        const fn coerce_to_parser<F: Fn(&[u8]) -> $crate::parser::ParseResult<'_, O>, O>(f: F) -> F { f }
 
         coerce_to_parser(|input| {
             let mut furthest_err = $crate::parser::ParseError::Custom("unreachable");
diff --git a/crates/utils/src/parser/mod.rs b/crates/utils/src/parser/mod.rs
@@ -18,4 +18,5 @@ pub use one_of::one_of;
 pub use simple::{byte, byte_range, constant, eof, eol, noop, take_while, take_while1};
 
 pub use crate::parser_literal_map as literal_map;
+pub use crate::parser_parsable_enum as parsable_enum;
 pub use crate::parser_parse_tree as parse_tree;
diff --git a/crates/year2015/src/day19.rs b/crates/year2015/src/day19.rs
@@ -1,77 +1,222 @@
 use std::collections::HashSet;
+use utils::array::ArrayVec;
 use utils::prelude::*;
 
 /// Molecule string replacements.
 ///
-/// Part 2 assumes there is only one possible number of steps, and that the replacements are always
-/// the same length or longer.
+/// Part 2 assumes there is only one possible number of steps but does not assume the `Rn` `Y` `Ar`
+/// bracket structure or use the formula. Instead, it uses an optimized
+/// [Earley parser](https://en.wikipedia.org/wiki/Earley_parser), which ensures the molecule can be
+/// created from the provided rules.
 #[derive(Clone, Debug)]
-pub struct Day19<'a> {
-    replacements: Vec<(&'a [u8], &'a [u8])>,
-    molecule: &'a [u8],
+pub struct Day19 {
+    rules: Vec<(Option<Atom>, ArrayVec<Atom, 8>)>,
+    molecule: Vec<Atom>,
 }
 
-impl<'a> Day19<'a> {
-    pub fn new(input: &'a str, _: InputType) -> Result<Self, InputError> {
-        let Some((replacements, molecule)) = input.rsplit_once("\n\n") else {
+parser::parsable_enum! {
+    #[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
+    #[repr(u8)]
+    enum Atom {
+        #[default]
+        "Al" => Al,
+        "Ar" => Ar,
+        "B" => B,
+        "Ca" => Ca,
+        "C" => C,
+        "F" => F,
+        "H" => H,
+        "Mg" => Mg,
+        "N" => N,
+        "O" => O,
+        "P" => P,
+        "Rn" => Rn,
+        "Si" => Si,
+        "Th" => Th,
+        "Ti" => Ti,
+        "Y" => Y,
+    }
+}
+
+const _: () = {
+    assert!(Atom::ALL.len() <= 16);
+};
+
+impl Day19 {
+    pub fn new(input: &str, _: InputType) -> Result<Self, InputError> {
+        let Some((rules_str, molecule)) = input.rsplit_once("\n\n") else {
             return Err(InputError::new(
                 input,
                 0,
-                "expected replacements then a blank line then the molecule",
+                "expected rules then a blank line then the molecule",
             ));
         };
 
+        let rules = Atom::PARSER
+            .map(Some)
+            .or(b'e'.map(|_| None))
+            .with_suffix(" => ")
+            .then(Atom::PARSER.repeat_arrayvec(parser::noop(), 1))
+            .parse_lines(rules_str)?;
+
+        if rules.len() > 64 {
+            return Err(InputError::new(input, rules_str.len(), "too many rules"));
+        }
+
         Ok(Self {
-            replacements: parser::take_while1(u8::is_ascii_alphabetic)
-                .then(parser::take_while1(u8::is_ascii_alphabetic).with_prefix(" => "))
-                .parse_lines(replacements)?,
-            molecule: molecule.trim_ascii_end().as_bytes(),
+            rules,
+            molecule: Atom::PARSER.parse_all(molecule)?,
         })
     }
 
     #[must_use]
     pub fn part1(&self) -> usize {
         let mut set = HashSet::new();
-        for &(from, to) in &self.replacements {
-            let new_length = self.molecule.len() - from.len() + to.len();
+        for (from, to) in &self.rules {
+            let Some(from) = *from else { continue };
+            let new_length = self.molecule.len() + to.len() - 1;
             for i in 0..self.molecule.len() {
-                if self.molecule[i..].starts_with(from) {
+                if self.molecule[i] == from {
                     let mut molecule = Vec::with_capacity(new_length);
                     molecule.extend_from_slice(&self.molecule[..i]);
                     molecule.extend_from_slice(to);
-                    molecule.extend_from_slice(&self.molecule[i + from.len()..]);
-                    set.insert(molecule);
+                    molecule.extend_from_slice(&self.molecule[i + 1..]);
+
+                    // `.into_iter().map(|x| x as u8).collect::<Vec<_>>()` makes this function 2-3x
+                    // faster as the std::hash::Hash implementation for u8 implements hash_slice
+                    // efficiently using a single call to write, and the into_iter-map-collect chain
+                    // is a no-op. It isn't possible to implement Hash::hash_slice for Atom so
+                    // efficiently without unsafe code / transmute.
+                    set.insert(molecule.into_iter().map(|x| x as u8).collect::<Vec<_>>());
                 }
             }
         }
         set.len()
     }
 
     #[must_use]
-    pub fn part2(&self) -> u64 {
-        let mut molecule = self.molecule.to_vec();
-        let mut steps = 0;
-        while molecule.iter().any(|&x| x != b'e') {
-            for &(from, to) in &self.replacements {
-                let mut i = 0;
-                while i < molecule.len() {
-                    if molecule[i..].starts_with(to) {
-                        // Replace to with from, presuming from.len() <= to.len()
-                        molecule[i..i + from.len()].copy_from_slice(from);
-                        molecule.drain(i + from.len()..i + to.len());
-
-                        steps += 1;
-                    } else {
-                        i += 1;
+    pub fn part2(&self) -> u32 {
+        #[derive(Copy, Clone, Debug)]
+        struct State {
+            rule: usize,
+            dot: usize,
+            origin: usize,
+        }
+
+        // Store the chart as a list of state lists at each position, plus a bitset for the current
+        // and next positions. This works well as only the current and next position sets are ever
+        // updated, and the bitset makes duplicate checking fast. Previous sets are only ever
+        // iterated over. The current list is also reused as a queue of states to process.
+        let mut chart = vec![Vec::new(); self.molecule.len() + 1];
+
+        // Indexed by bitset[origin][dot] & (1 << rule):
+        // - 9 possible dot values (0-8 inclusive, enforced by ArrayVec N),
+        // - 64 possible rules (checked in new).
+        let mut current_bitset = vec![[0u64; 9]; self.molecule.len() + 1];
+        let mut next_bitset = vec![[0u64; 9]; self.molecule.len() + 1];
+
+        // Preprocess the rules into separate lists by the LHS, populating e rules into the initial
+        // set.
+        let mut rules_by_lhs = vec![Vec::new(); 16];
+        for (i, (lhs, _)) in self.rules.iter().enumerate() {
+            if let Some(lhs) = *lhs {
+                rules_by_lhs[lhs as usize].push(i);
+            } else {
+                let state = State {
+                    rule: i,
+                    dot: 0,
+                    origin: 0,
+                };
+                current_bitset[state.origin][state.dot] |= 1 << state.rule;
+                chart[0].push((state, 1));
+            }
+        }
+
+        // Optimization: Only do predictions once per atom per position.
+        let mut predictions_done = 0u16;
+        // Optimization: Only do completions once per (origin, atom) per position.
+        let mut completions_done = vec![0u16; self.molecule.len() + 1];
+
+        for pos in 0..chart.len() {
+            let mut set_idx = 0;
+            while let Some(&(state, steps)) = chart[pos].get(set_idx) {
+                let (lhs, rhs) = &self.rules[state.rule];
+
+                if state.dot < rhs.len() {
+                    // Prediction
+                    if predictions_done & (1 << rhs[state.dot] as usize) == 0 {
+                        predictions_done |= 1 << rhs[state.dot] as usize;
+
+                        for &i in &rules_by_lhs[rhs[state.dot] as usize] {
+                            let new = State {
+                                rule: i,
+                                dot: 0,
+                                origin: pos,
+                            };
+                            if current_bitset[new.origin][new.dot] & (1 << new.rule) == 0 {
+                                current_bitset[new.origin][new.dot] |= 1 << new.rule;
+                                chart[pos].push((new, 1));
+                            }
+                        }
                     }
+
+                    // Scanning
+                    if self.molecule.get(pos) == Some(&rhs[state.dot]) {
+                        let new = State {
+                            rule: state.rule,
+                            dot: state.dot + 1,
+                            origin: state.origin,
+                        };
+                        if next_bitset[new.origin][new.dot] & (1 << new.rule) == 0 {
+                            next_bitset[new.origin][new.dot] |= 1 << new.rule;
+                            chart[pos + 1].push((new, steps));
+                        }
+                    }
+                } else if let Some(lhs) = *lhs {
+                    // Completion
+                    if completions_done[state.origin] & (1 << lhs as usize) == 0 {
+                        completions_done[state.origin] |= 1 << lhs as usize;
+
+                        let [current_chart, origin_chart] = chart
+                            .get_disjoint_mut([pos, state.origin])
+                            .expect("origin must be less than pos");
+
+                        for (prev_state, prev_steps) in origin_chart.iter() {
+                            let (_, prev_rhs) = &self.rules[prev_state.rule];
+                            if prev_state.dot < prev_rhs.len() && prev_rhs[prev_state.dot] == lhs {
+                                let new = State {
+                                    rule: prev_state.rule,
+                                    dot: prev_state.dot + 1,
+                                    origin: prev_state.origin,
+                                };
+                                if current_bitset[new.origin][new.dot] & (1 << new.rule) == 0 {
+                                    current_bitset[new.origin][new.dot] |= 1 << new.rule;
+                                    current_chart.push((new, steps + prev_steps));
+                                }
+                            }
+                        }
+                    }
+                } else if pos == self.molecule.len() {
+                    // Completion of a start rule consuming the entire molecule
+                    return steps;
                 }
+
+                set_idx += 1;
             }
+
+            (current_bitset, next_bitset) = (next_bitset, current_bitset);
+            next_bitset[..=pos].fill([0; 9]);
+
+            // Reset optimization caches for the next position
+            predictions_done = 0u16;
+            completions_done[..pos].fill(0);
         }
-        steps
+
+        panic!("no solution found");
     }
 }
 
-examples!(Day19<'_> -> (usize, u64) [
+examples!(Day19 -> (usize, u32) [
     {input: "H => HO\nH => OH\nO => HH\n\nHOH", part1: 4},
     {input: "e => H\ne => O\nH => HO\nH => OH\nO => HH\n\nHOH", part2: 3},
     {input: "e => H\ne => O\nH => HO\nH => OH\nO => HH\n\nHOHOHO", part2: 6},
diff --git a/crates/year2015/src/lib.rs b/crates/year2015/src/lib.rs
@@ -20,7 +20,7 @@ utils::year!(2015 => year2015, ${
     16 => day16::Day16<'_>,
     17 => day17::Day17,
     18 => day18::Day18,
-    19 => day19::Day19<'_>,
+    19 => day19::Day19,
     20 => day20::Day20,
     21 => day21::Day21,
     22 => day22::Day22,

Original file line number	Diff line number	Diff line change
`@@ -27,5 +27,5 @@ pub use wasm::multithreading;`
`27`	`27`	`pub mod prelude {`
`28`	`28`	`pub use crate::examples;`
`29`	`29`	`pub use crate::input::{InputError, InputType, MapWithInputExt as _};`
`30`		`- pub use crate::parser::{self, Parser as _};`
	`30`	`+ pub use crate::parser::{self, Parseable as _, Parser as _};`
`31`	`31`	`}`