Skip to content

Commit ba571f0

Browse files
committed
Improves phonetic README docs
1 parent 22f398c commit ba571f0

File tree

1 file changed

+152
-96
lines changed

1 file changed

+152
-96
lines changed

README.md

Lines changed: 152 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -780,57 +780,162 @@ for term in transducer.query_filtered("var", 1, |scope_id| *scope_id <= 1) {
780780

781781
### Phonetic Rewrite Rules (Formally Verified)
782782

783-
The `phonetic-rules` feature provides **mathematically verified** phonetic transformation rules for fuzzy matching. Use `PhoneticGrep` for on-the-fly matching with automatic Levenshtein composition:
783+
The `phonetic-rules` feature provides **mathematically verified** phonetic transformation rules for fuzzy matching. Compose phonetic NFAs with Levenshtein automata for approximate string matching without normalization.
784+
785+
#### Compile-Time Macros
786+
787+
Use the `llre!` and `llev!` macros to compile patterns and rules at build time (the resulting NFA is embedded in the binary):
784788

785789
```rust
786-
use liblevenshtein::phonetic::grep::PhoneticGrep;
790+
use liblevenshtein::{llre, llev, llre_file, llev_file};
791+
792+
// Compile regex pattern at build time - NFA embedded in binary
793+
let pattern = llre!(r"(ph|f)one");
794+
assert!(pattern.matches("phone"));
795+
assert!(pattern.matches("fone"));
796+
797+
// Compile LLev rules at build time
798+
let rules = llev!(r#"
799+
ph -> f; // phone → fone
800+
gh -> / [:vowel:]_; // night → nit (silent gh after vowel)
801+
c -> s / _[:front_vowel:]; // city → sity
802+
"#);
803+
804+
// Load from files (imports resolved at compile time)
805+
let phonetic = llre_file!("patterns/phonetic.llre");
806+
let english = llev_file!("rules/english.llev");
807+
```
787808

788-
// PhoneticGrep composes NFA patterns with Levenshtein automata
789-
// No preprocessing or normalization required!
790-
let grep = PhoneticGrep::from_pattern("phone", 1)?;
809+
#### Composing NFAs with Levenshtein Automata
810+
811+
Build fuzzy regex matchers by composing phonetic NFAs with Levenshtein automata:
812+
813+
```rust
814+
use liblevenshtein::phonetic::nfa::{compile, ProductAutomatonChar};
815+
use liblevenshtein::phonetic::regex::parse;
816+
817+
// Step 1: Parse phonetic regex pattern
818+
let regex = parse("(ph|f)one")?;
791819

792-
// Matches within edit distance 1
793-
assert!(grep.matches("phone").is_some()); // exact match (distance 0)
794-
assert!(grep.matches("fone").is_some()); // distance 1 (p→f)
795-
assert!(grep.matches("phon").is_some()); // distance 1 (missing e)
796-
assert!(grep.matches("phones").is_some()); // distance 1 (extra s)
820+
// Step 2: Compile regex to NFA
821+
let nfa = compile(&regex)?;
822+
823+
// Step 3: Compose NFA with Levenshtein automaton (max distance 2)
824+
let product = ProductAutomatonChar::new(nfa, 2);
825+
826+
// Step 4: Fuzzy match without preprocessing
827+
assert!(product.accepts("phone")); // exact match (distance 0)
828+
assert!(product.accepts("fone")); // exact match (distance 0)
829+
assert!(product.accepts("phones")); // distance 1 (insertion)
830+
assert!(product.accepts("phon")); // distance 1 (deletion)
831+
assert!(product.accepts("phome")); // distance 1 (substitution)
832+
833+
// Get minimum edit distance
834+
assert_eq!(product.min_distance("phone"), Some(0));
835+
assert_eq!(product.min_distance("fon"), Some(1));
836+
assert_eq!(product.min_distance("xyz"), None); // outside budget
797837
```
798838

799-
**Named Character Classes** (phonetic-aware):
839+
#### Spelling Correction with Embedded English Rules
840+
841+
Combine pre-compiled English phonetic rules and search a dictionary for correction candidates:
842+
800843
```rust
801-
// [:vowel:] includes ASCII vowels + IPA vowels (ə, ɪ, ʊ, ɛ, etc.)
802-
// [:consonant:] includes ASCII consonants + IPA symbols
803-
// [:fricative:] matches f,v,s,z,h + digraphs (sh,th,zh)
804-
// [:nasal:] matches m,n + ng digraph + IPA ŋ
805-
// [:stop:] / [:plosive:] matches p,b,t,d,k,g
806-
// [:voiced:] / [:voiceless:] by voicing
844+
use liblevenshtein::phonetic::rules::english;
845+
use liblevenshtein::phonetic::llev::RuleSetChar;
846+
use liblevenshtein::phonetic::verified::rules_to_nfa_char;
847+
use liblevenshtein::phonetic::nfa::ProductAutomatonChar;
848+
use liblevenshtein::dictionary::DynamicDawgChar;
849+
850+
// Load pre-compiled English phonetic rules
851+
let zompist = english::zompist(); // 62 rules: ph→f, gh→∅, tion→ʃən
852+
let homophones = english::homophones(); // too/two→to, their/there→ther
853+
let text_speak = english::text_speak(); // u→you, 2→to, thx→thanks
854+
855+
// Combine all rules into a new RuleSetChar
856+
let mut combined = RuleSetChar::new();
857+
combined.merge(zompist.clone());
858+
combined.merge(homophones.clone());
859+
combined.merge(text_speak.clone());
860+
861+
// Convert combined rules to NFA for fuzzy matching (no normalization)
862+
let rules_nfa = rules_to_nfa_char(&combined.rules);
863+
let product = ProductAutomatonChar::new(rules_nfa, 1);
864+
865+
// Build a dictionary of terms to search
866+
let mut dictionary = DynamicDawgChar::new();
867+
dictionary.insert("phone");
868+
dictionary.insert("fone");
869+
dictionary.insert("today");
870+
dictionary.insert("knight");
871+
872+
// Query for correction candidates by filtering dictionary terms
873+
let query = "fone";
874+
let mut candidates: Vec<(&str, u8)> = dictionary
875+
.iter()
876+
.filter_map(|term| {
877+
product.min_distance(term).map(|dist| (term, dist))
878+
})
879+
.collect();
880+
881+
// Sort by distance (best matches first)
882+
candidates.sort_by_key(|(_, dist)| *dist);
883+
// candidates = [("fone", 0), ("phone", 0), ...]
884+
885+
// Optional: Apply rules to normalize text
886+
let normalized = combined.apply("phone"); // "fone" (ph→f)
887+
let normalized = combined.apply("knight"); // "nit" (silent k, gh→∅)
888+
```
889+
890+
#### Algorithm Variants
891+
892+
```rust
893+
use liblevenshtein::transducer::Algorithm;
894+
895+
// Standard Levenshtein
896+
let standard = ProductAutomatonChar::new(nfa.clone(), 1);
807897

808-
// Match words with nasal-vowel-stop pattern within distance 1
809-
let grep = PhoneticGrep::from_pattern("[:nasal:][:vowel:][:stop:]", 1)?;
810-
// Matches: "nap", "map", "nod", "mob", "nip", "mat" ...
898+
// Transposition-aware (character swaps count as 1 edit)
899+
let transposition = ProductAutomatonChar::with_algorithm(
900+
nfa.clone(), 1, Algorithm::Transposition
901+
);
811902

812-
// Front vowels for palatalization contexts
813-
let grep = PhoneticGrep::from_pattern("c[:front_vowel:]", 1)?;
814-
// Matches: "ce", "ci" (soft c contexts)
903+
// Merge-and-split (for OCR: "cl"→"d", "ä"→"ae")
904+
let merge_split = ProductAutomatonChar::with_algorithm(
905+
nfa, 1, Algorithm::MergeAndSplit
906+
);
815907
```
816908

817-
**Phonetic Flags** (compile-time transformations):
909+
#### PhoneticGrep (Convenience API)
910+
911+
For quick on-the-fly matching without building a dictionary:
912+
818913
```rust
819-
// Case-insensitive matching
820-
let grep = PhoneticGrep::from_pattern("(?i:hello)", 1)?;
821-
assert!(grep.matches("HELLO").is_some());
914+
use liblevenshtein::phonetic::grep::PhoneticGrep;
822915

823-
// Accent-insensitive matching
824-
let grep = PhoneticGrep::from_pattern("(?a:cafe)", 0)?;
825-
assert!(grep.matches("café").is_some()); // é → e
916+
// Quick on-the-fly matching
917+
let grep = PhoneticGrep::from_pattern("phone", 1)?;
918+
assert!(grep.matches("phone").is_some());
919+
assert!(grep.matches("fone").is_some());
826920

827-
// Combined case + accent insensitive
921+
// With phonetic flags (case + accent insensitive)
828922
let grep = PhoneticGrep::from_pattern("(?ia:cafe)", 1)?;
829923
assert!(grep.matches("CAFÉ").is_some());
830924

831-
// Local distance override: (?;N:pattern)
832-
let grep = PhoneticGrep::from_pattern("(?;2:difficult)", 0)?;
833-
// Pattern-level distance of 2, ignoring the constructor's 0
925+
// With rules from file
926+
let grep = PhoneticGrep::with_rules("fone", std::path::Path::new("english.llev"), 1)?;
927+
assert!(grep.matches("phone").is_some());
928+
```
929+
930+
**Named Character Classes** (phonetic-aware):
931+
```rust
932+
// [:vowel:] includes ASCII vowels + IPA vowels (ə, ɪ, ʊ, ɛ, etc.)
933+
// [:consonant:] includes ASCII consonants + IPA symbols
934+
// [:fricative:] matches f,v,s,z,h + digraphs (sh,th,zh)
935+
// [:nasal:] matches m,n + ng digraph + IPA ŋ
936+
// [:stop:] / [:plosive:] matches p,b,t,d,k,g
937+
// [:front_vowel:] / [:back_vowel:] by tongue position
938+
// [:voiced:] / [:voiceless:] by voicing
834939
```
835940

836941
**Formal Verification**: All algorithms are proven correct in Coq/Rocq:
@@ -847,43 +952,11 @@ let grep = PhoneticGrep::from_pattern("(?;2:difficult)", 0)?;
847952
- Position skipping optimization: 50/50 proofs complete (100% verified)
848953
- Modular proof decomposition with 0 Admitted lemmas
849954

850-
**Rule Sets**:
851-
- `orthography_rules()` - 8 orthographic transformations (ch→ç, ph→f, silent letters)
852-
- `phonetic_rules()` - 3 phonetic approximations for fuzzy matching (th→t, qu↔kw)
853-
- `zompist_rules()` - Complete 13-rule set from Zompist English spelling system
854-
855-
**Dual u8/char Support** (following existing codebase patterns):
856-
```rust
857-
// Byte-level (ASCII, ~5% faster, 4× less memory)
858-
let result = apply_rules_seq(&orthography_rules(), &phones_u8, fuel);
859-
860-
// Character-level (Unicode, correct for accented chars, CJK, emoji)
861-
let result = apply_rules_seq_char(&orthography_rules_char(), &phones_char, fuel);
862-
```
863-
864-
**With Phonetic Rules** (for spelling normalization):
865-
```rust
866-
use liblevenshtein::phonetic::grep::PhoneticGrep;
867-
use std::path::Path;
868-
869-
// Load rules from .llev file and compose with Levenshtein automaton
870-
let grep = PhoneticGrep::with_rules(
871-
"fone", // Query (phonetic spelling)
872-
Path::new("english.llev"), // Rules: ph→f, gh→silent, etc.
873-
1 // Max Levenshtein distance
874-
)?;
875-
876-
// "phone" matches "fone" after rule application + distance tolerance
877-
assert!(grep.matches("phone").is_some());
878-
assert!(grep.matches("fone").is_some()); // exact after normalization
879-
assert!(grep.matches("phones").is_some()); // distance 1
880-
```
881-
882955
**Verification Artifacts**:
883956
- Coq proofs: [`docs/verification/phonetic/`](docs/verification/phonetic/)
884957
- Rust implementation: [`src/phonetic/`](src/phonetic/)
885-
- Property tests: [`src/phonetic/properties.rs`](src/phonetic/properties.rs) (14 tests, 3,584 test cases)
886-
- Benchmarks: [`benches/phonetic_rules.rs`](benches/phonetic_rules.rs) (7 benchmark groups)
958+
- Property tests: [`src/phonetic/properties.rs`](src/phonetic/properties.rs)
959+
- Benchmarks: [`benches/phonetic_rules.rs`](benches/phonetic_rules.rs)
887960
- Example: [`examples/phonetic_rewrite.rs`](examples/phonetic_rewrite.rs)
888961

889962
Enable with:
@@ -892,11 +965,6 @@ Enable with:
892965
liblevenshtein = { git = "https://github.com/universal-automata/liblevenshtein-rust", features = ["phonetic-rules"] }
893966
```
894967

895-
Run the example:
896-
```bash
897-
cargo run --example phonetic_rewrite --features phonetic-rules
898-
```
899-
900968
**Source**: Based on [Zompist's English spelling rules](https://zompist.com/spell.html) with complete formal verification in Coq/Rocq (630+ lines of proofs, 100% proven, zero Admitted).
901969

902970
### LLev/LLRE - Phonetic Pattern Languages
@@ -907,36 +975,24 @@ For complex phonetic matching, use `.llev` files for rewrite rules or `.llre` fi
907975
use liblevenshtein::phonetic::llev::parse_str;
908976
use liblevenshtein::phonetic::llre;
909977

910-
// LLev: Rewrite rules with phonetic context
978+
// LLev: Rewrite rules with phonetic context and named classes
911979
let rules = parse_str(r#"
912980
@name "English Phonetic Rules"
913981
914-
[id: 1, name: "ph to f"]
915-
ph -> f; // phone → fone
916-
917-
[id: 2, name: "soft c"]
918-
c -> s / _[:front_vowel:]; // city → sity (before e,i)
919-
920-
[id: 3, name: "silent gh"]
921-
gh -> / [:vowel:]_; // night → nit (after vowel)
982+
ph -> f; // phone → fone
983+
c -> s / _[:front_vowel:]; // city → sity (before e,i)
984+
gh -> / [:vowel:]_; // night → nit (silent after vowel)
922985
"#)?;
923986

924-
// LLRE: Regex patterns with named character classes
987+
// LLRE: Compile regex pattern to NFA
925988
let pattern = llre::compile_pattern("[:fricative:]one")?;
926-
assert!(pattern.matches("fone")); // f ∈ [:fricative:]
927-
assert!(pattern.matches("shone")); // sh ∈ [:fricative:]
928-
assert!(pattern.matches("zone")); // z ∈ [:fricative:]
929-
930-
// LLRE with flags
931-
let pattern = llre::compile_pattern("(?ia:[:nasal:][:vowel:][:stop:])")?;
932-
// Case+accent insensitive nasal-vowel-stop pattern
989+
assert!(pattern.matches("fone")); // f ∈ [:fricative:]
990+
assert!(pattern.matches("shone")); // sh ∈ [:fricative:]
933991

934-
// AOT compilation for instant loading
935-
#[cfg(feature = "serialization")]
936-
{
937-
llre::save(&pattern, "pattern.llre.bin")?;
938-
let loaded = llre::load("pattern.llre.bin")?;
939-
}
992+
// Compose LLRE pattern with Levenshtein for fuzzy matching
993+
use liblevenshtein::phonetic::nfa::ProductAutomatonChar;
994+
let product = ProductAutomatonChar::new(pattern.nfa.clone(), 1);
995+
assert!(product.accepts("phone")); // distance 1 from pattern
940996
```
941997

942998
See [`examples/phonetic_spellcheck`](examples/phonetic_spellcheck/) for a complete demo, and the [LLev Grammar](docs/grammar/llev.ebnf) / [LLRE Grammar](docs/grammar/llre.ebnf) for full syntax reference.

0 commit comments

Comments
 (0)