@@ -780,57 +780,162 @@ for term in transducer.query_filtered("var", 1, |scope_id| *scope_id <= 1) {
780780
781781### Phonetic Rewrite Rules (Formally Verified)
782782
783- The ` phonetic-rules ` feature provides ** mathematically verified** phonetic transformation rules for fuzzy matching. Use ` PhoneticGrep ` for on-the-fly matching with automatic Levenshtein composition:
783+ The ` phonetic-rules ` feature provides ** mathematically verified** phonetic transformation rules for fuzzy matching. Compose phonetic NFAs with Levenshtein automata for approximate string matching without normalization.
784+
785+ #### Compile-Time Macros
786+
787+ Use the ` llre! ` and ` llev! ` macros (or ` llre_file! ` / ` llev_file! ` to load from files) to compile patterns and rules at build time (NFA embedded in binary):
784788
785789``` rust
786- use liblevenshtein :: phonetic :: grep :: PhoneticGrep ;
790+ use liblevenshtein :: {llre, llev, llre_file, llev_file};
791+
792+ // Compile regex pattern at build time - NFA embedded in binary
793+ let pattern = llre! (r " (ph|f)one" );
794+ assert! (pattern . matches (" phone" ));
795+ assert! (pattern . matches (" fone" ));
796+
797+ // Compile LLev rules at build time
798+ let rules = llev! (r # "
799+ ph -> f; // phone → fone
800+ gh -> / [:vowel:]_; // night → nit (silent gh after vowel)
801+ c -> s / _[:front_vowel:]; // city → sity
802+ " # );
803+
804+ // Load from files (imports resolved at compile time)
805+ let phonetic = llre_file! (" patterns/phonetic.llre" );
806+ let english = llev_file! (" rules/english.llev" );
807+ ```
787808
788- // PhoneticGrep composes NFA patterns with Levenshtein automata
789- // No preprocessing or normalization required!
790- let grep = PhoneticGrep :: from_pattern (" phone" , 1 )? ;
809+ #### Composing NFAs with Levenshtein Automata
810+
811+ Build fuzzy regex matchers by composing phonetic NFAs with Levenshtein automata:
812+
813+ ``` rust
814+ use liblevenshtein :: phonetic :: nfa :: {compile, ProductAutomatonChar };
815+ use liblevenshtein :: phonetic :: regex :: parse;
816+
817+ // Step 1: Parse phonetic regex pattern
818+ let regex = parse (" (ph|f)one" )? ;
791819
792- // Matches within edit distance 1
793- assert! (grep . matches (" phone" ). is_some ()); // exact match (distance 0)
794- assert! (grep . matches (" fone" ). is_some ()); // distance 1 (p→f)
795- assert! (grep . matches (" phon" ). is_some ()); // distance 1 (missing e)
796- assert! (grep . matches (" phones" ). is_some ()); // distance 1 (extra s)
820+ // Step 2: Compile regex to NFA
821+ let nfa = compile (& regex )? ;
822+
823+ // Step 3: Compose NFA with Levenshtein automaton (max distance 2)
824+ let product = ProductAutomatonChar :: new (nfa , 2 );
825+
826+ // Step 4: Fuzzy match without preprocessing
827+ assert! (product . accepts (" phone" )); // exact match (distance 0)
828+ assert! (product . accepts (" fone" )); // exact match (distance 0)
829+ assert! (product . accepts (" phones" )); // distance 1 (insertion)
830+ assert! (product . accepts (" phon" )); // distance 1 (deletion)
831+ assert! (product . accepts (" phome" )); // distance 1 (substitution)
832+
833+ // Get minimum edit distance
834+ assert_eq! (product . min_distance (" phone" ), Some (0 ));
835+ assert_eq! (product . min_distance (" fon" ), Some (1 ));
836+ assert_eq! (product . min_distance (" xyz" ), None ); // outside budget
797837```
798838
799- ** Named Character Classes** (phonetic-aware):
839+ #### Spelling Correction with Embedded English Rules
840+
841+ Combine pre-compiled English phonetic rules and search a dictionary for correction candidates:
842+
800843``` rust
801- // [:vowel:] includes ASCII vowels + IPA vowels (ə, ɪ, ʊ, ɛ, etc.)
802- // [:consonant:] includes ASCII consonants + IPA symbols
803- // [:fricative:] matches f,v,s,z,h + digraphs (sh,th,zh)
804- // [:nasal:] matches m,n + ng digraph + IPA ŋ
805- // [:stop:] / [:plosive:] matches p,b,t,d,k,g
806- // [:voiced:] / [:voiceless:] by voicing
844+ use liblevenshtein :: phonetic :: rules :: english;
845+ use liblevenshtein :: phonetic :: llev :: RuleSetChar ;
846+ use liblevenshtein :: phonetic :: verified :: rules_to_nfa_char;
847+ use liblevenshtein :: phonetic :: nfa :: ProductAutomatonChar ;
848+ use liblevenshtein :: dictionary :: DynamicDawgChar ;
849+
850+ // Load pre-compiled English phonetic rules
851+ let zompist = english :: zompist (); // 62 rules: ph→f, gh→∅, tion→ʃən
852+ let homophones = english :: homophones (); // too/two→to, their/there→ther
853+ let text_speak = english :: text_speak (); // u→you, 2→to, thx→thanks
854+
855+ // Combine all rules into a new RuleSetChar
856+ let mut combined = RuleSetChar :: new ();
857+ combined . merge (zompist . clone ());
858+ combined . merge (homophones . clone ());
859+ combined . merge (text_speak . clone ());
860+
861+ // Convert combined rules to NFA for fuzzy matching (no normalization)
862+ let rules_nfa = rules_to_nfa_char (& combined . rules);
863+ let product = ProductAutomatonChar :: new (rules_nfa , 1 );
864+
865+ // Build a dictionary of terms to search
866+ let mut dictionary = DynamicDawgChar :: new ();
867+ dictionary . insert (" phone" );
868+ dictionary . insert (" fone" );
869+ dictionary . insert (" today" );
870+ dictionary . insert (" knight" );
871+
872+ // Query for correction candidates by filtering dictionary terms
873+ let query = " fone" ;
874+ let mut candidates : Vec <(& str , u8 )> = dictionary
875+ . iter ()
876+ . filter_map (| term | {
877+ product . min_distance (term ). map (| dist | (term , dist ))
878+ })
879+ . collect ();
880+
881+ // Sort by distance (best matches first)
882+ candidates . sort_by_key (| (_ , dist )| * dist );
883+ // candidates = [("fone", 0), ("phone", 0), ...]
884+
885+ // Optional: Apply rules to normalize text
886+ let normalized = combined . apply (" phone" ); // "fone" (ph→f)
887+ let normalized = combined . apply (" knight" ); // "nit" (silent k, gh→∅)
888+ ```
889+
890+ #### Algorithm Variants
891+
892+ ``` rust
893+ use liblevenshtein :: transducer :: Algorithm ;
894+
895+ // Standard Levenshtein (`nfa` and `ProductAutomatonChar` as in the composition example above)
896+ let standard = ProductAutomatonChar :: new (nfa . clone (), 1 );
807897
808- // Match words with nasal-vowel-stop pattern within distance 1
809- let grep = PhoneticGrep :: from_pattern (" [:nasal:][:vowel:][:stop:]" , 1 )? ;
810- // Matches: "nap", "map", "nod", "mob", "nip", "mat" ...
898+ // Transposition-aware (character swaps count as 1 edit)
899+ let transposition = ProductAutomatonChar :: with_algorithm (
900+ nfa . clone (), 1 , Algorithm :: Transposition
901+ );
811902
812- // Front vowels for palatalization contexts
813- let grep = PhoneticGrep :: from_pattern (" c[:front_vowel:]" , 1 )? ;
814- // Matches: "ce", "ci" (soft c contexts)
903+ // Merge-and-split (for OCR: "cl"→"d", "ä"→"ae")
904+ let merge_split = ProductAutomatonChar :: with_algorithm (
905+ nfa , 1 , Algorithm :: MergeAndSplit
906+ );
815907```
816908
817- ** Phonetic Flags** (compile-time transformations):
909+ #### PhoneticGrep (Convenience API)
910+
911+ For quick on-the-fly matching without building a dictionary:
912+
818913``` rust
819- // Case-insensitive matching
820- let grep = PhoneticGrep :: from_pattern (" (?i:hello)" , 1 )? ;
821- assert! (grep . matches (" HELLO" ). is_some ());
914+ use liblevenshtein :: phonetic :: grep :: PhoneticGrep ;
822915
823- // Accent-insensitive matching
824- let grep = PhoneticGrep :: from_pattern (" (?a:cafe)" , 0 )? ;
825- assert! (grep . matches (" café" ). is_some ()); // é → e
916+ // Quick on-the-fly matching
917+ let grep = PhoneticGrep :: from_pattern (" phone" , 1 )? ;
918+ assert! (grep . matches (" phone" ). is_some ());
919+ assert! (grep . matches (" fone" ). is_some ());
826920
827- // Combined case + accent insensitive
921+ // With phonetic flags ( case + accent insensitive)
828922let grep = PhoneticGrep :: from_pattern (" (?ia:cafe)" , 1 )? ;
829923assert! (grep . matches (" CAFÉ" ). is_some ());
830924
831- // Local distance override: (?;N:pattern)
832- let grep = PhoneticGrep :: from_pattern (" (?;2:difficult)" , 0 )? ;
833- // Pattern-level distance of 2, ignoring the constructor's 0
925+ // With rules from file
926+ let grep = PhoneticGrep :: with_rules (" fone" , std :: path :: Path :: new (" english.llev" ), 1 )? ;
927+ assert! (grep . matches (" phone" ). is_some ());
928+ ```
929+
930+ ** Named Character Classes** (phonetic-aware):
931+ ``` rust
932+ // [:vowel:] includes ASCII vowels + IPA vowels (ə, ɪ, ʊ, ɛ, etc.)
933+ // [:consonant:] includes ASCII consonants + IPA symbols
934+ // [:fricative:] matches f,v,s,z,h + digraphs (sh,th,zh)
935+ // [:nasal:] matches m,n + ng digraph + IPA ŋ
936+ // [:stop:] / [:plosive:] matches p,b,t,d,k,g
937+ // [:front_vowel:] / [:back_vowel:] by tongue position
938+ // [:voiced:] / [:voiceless:] by voicing
834939```
835940
836941** Formal Verification** : All algorithms are proven correct in Coq/Rocq:
@@ -847,43 +952,11 @@ let grep = PhoneticGrep::from_pattern("(?;2:difficult)", 0)?;
847952- Position skipping optimization: 50/50 proofs complete (100% verified)
848953- Modular proof decomposition with 0 Admitted lemmas
849954
850- ** Rule Sets** :
851- - ` orthography_rules() ` - 8 orthographic transformations (ch→ç, ph→f, silent letters)
852- - ` phonetic_rules() ` - 3 phonetic approximations for fuzzy matching (th→t, qu↔kw)
853- - ` zompist_rules() ` - Complete 13-rule set from Zompist English spelling system
854-
855- ** Dual u8/char Support** (following existing codebase patterns):
856- ``` rust
857- // Byte-level (ASCII, ~5% faster, 4× less memory)
858- let result = apply_rules_seq (& orthography_rules (), & phones_u8 , fuel );
859-
860- // Character-level (Unicode, correct for accented chars, CJK, emoji)
861- let result = apply_rules_seq_char (& orthography_rules_char (), & phones_char , fuel );
862- ```
863-
864- ** With Phonetic Rules** (for spelling normalization):
865- ``` rust
866- use liblevenshtein :: phonetic :: grep :: PhoneticGrep ;
867- use std :: path :: Path ;
868-
869- // Load rules from .llev file and compose with Levenshtein automaton
870- let grep = PhoneticGrep :: with_rules (
871- " fone" , // Query (phonetic spelling)
872- Path :: new (" english.llev" ), // Rules: ph→f, gh→silent, etc.
873- 1 // Max Levenshtein distance
874- )? ;
875-
876- // "phone" matches "fone" after rule application + distance tolerance
877- assert! (grep . matches (" phone" ). is_some ());
878- assert! (grep . matches (" fone" ). is_some ()); // exact after normalization
879- assert! (grep . matches (" phones" ). is_some ()); // distance 1
880- ```
881-
882955** Verification Artifacts** :
883956- Coq proofs: [ ` docs/verification/phonetic/ ` ] ( docs/verification/phonetic/ )
884957- Rust implementation: [ ` src/phonetic/ ` ] ( src/phonetic/ )
885- - Property tests: [ ` src/phonetic/properties.rs ` ] ( src/phonetic/properties.rs ) (14 tests, 3,584 test cases)
886- - Benchmarks: [ ` benches/phonetic_rules.rs ` ] ( benches/phonetic_rules.rs ) (7 benchmark groups)
958+ - Property tests: [ ` src/phonetic/properties.rs ` ] ( src/phonetic/properties.rs )
959+ - Benchmarks: [ ` benches/phonetic_rules.rs ` ] ( benches/phonetic_rules.rs )
887960- Example: [ ` examples/phonetic_rewrite.rs ` ] ( examples/phonetic_rewrite.rs )
888961
889962Enable with:
@@ -892,11 +965,6 @@ Enable with:
892965liblevenshtein = { git = " https://github.com/universal-automata/liblevenshtein-rust" , features = [" phonetic-rules" ] }
893966```
894967
895- Run the example:
896- ``` bash
897- cargo run --example phonetic_rewrite --features phonetic-rules
898- ```
899-
900968** Source** : Based on [ Zompist's English spelling rules] ( https://zompist.com/spell.html ) with complete formal verification in Coq/Rocq (630+ lines of proofs, 100% proven, zero Admitted).
901969
902970### LLev/LLRE - Phonetic Pattern Languages
@@ -907,36 +975,24 @@ For complex phonetic matching, use `.llev` files for rewrite rules or `.llre` fi
907975use liblevenshtein :: phonetic :: llev :: parse_str;
908976use liblevenshtein :: phonetic :: llre;
909977
910- // LLev: Rewrite rules with phonetic context
978+ // LLev: Rewrite rules with phonetic context and named classes
911979let rules = parse_str (r # "
912980 @name "English Phonetic Rules"
913981
914- [id: 1, name: "ph to f"]
915- ph -> f; // phone → fone
916-
917- [id: 2, name: "soft c"]
918- c -> s / _[:front_vowel:]; // city → sity (before e,i)
919-
920- [id: 3, name: "silent gh"]
921- gh -> / [:vowel:]_; // night → nit (after vowel)
982+ ph -> f; // phone → fone
983+ c -> s / _[:front_vowel:]; // city → sity (before e,i)
984+ gh -> / [:vowel:]_; // night → nit (silent after vowel)
922985" # )? ;
923986
924- // LLRE: Regex patterns with named character classes
987+ // LLRE: Compile regex pattern to NFA
925988let pattern = llre :: compile_pattern (" [:fricative:]one" )? ;
926- assert! (pattern . matches (" fone" )); // f ∈ [:fricative:]
927- assert! (pattern . matches (" shone" )); // sh ∈ [:fricative:]
928- assert! (pattern . matches (" zone" )); // z ∈ [:fricative:]
929-
930- // LLRE with flags
931- let pattern = llre :: compile_pattern (" (?ia:[:nasal:][:vowel:][:stop:])" )? ;
932- // Case+accent insensitive nasal-vowel-stop pattern
989+ assert! (pattern . matches (" fone" )); // f ∈ [:fricative:]
990+ assert! (pattern . matches (" shone" )); // sh ∈ [:fricative:]
933991
934- // AOT compilation for instant loading
935- #[cfg(feature = " serialization" )]
936- {
937- llre :: save (& pattern , " pattern.llre.bin" )? ;
938- let loaded = llre :: load (" pattern.llre.bin" )? ;
939- }
992+ // Compose LLRE pattern with Levenshtein for fuzzy matching
993+ use liblevenshtein :: phonetic :: nfa :: ProductAutomatonChar ;
994+ let product = ProductAutomatonChar :: new (pattern . nfa. clone (), 1 );
995+ assert! (product . accepts (" phone" )); // distance 1 from pattern
940996```
941997
942998See [ ` examples/phonetic_spellcheck ` ] ( examples/phonetic_spellcheck/ ) for a complete demo, and the [ LLev Grammar] ( docs/grammar/llev.ebnf ) / [ LLRE Grammar] ( docs/grammar/llre.ebnf ) for full syntax reference.
0 commit comments