Skip to content

Commit 87195a4

Browse files
committed
Add soundex, refined_soundex & phonogram moonblade functions
1 parent 9e94113 commit 87195a4

File tree

6 files changed

+52
-13
lines changed

6 files changed

+52
-13
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
* Adding `xan bisect`.
88
* Adding `xan flatten -N/--non-empty`.
9+
* Adding the `soundex`, `refined_soundex` & `phonogram` moonblade functions for phonetic encoding.
910

1011
*Fixes*
1112

Cargo.lock

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ npyz = "0.8.3"
100100
opener = "0.7.2"
101101
ordered-float = "5.0.0"
102102
pad = "0.1.6"
103-
paltoquet = "0.11.0"
103+
paltoquet = "0.12.0"
104104
pariter = "0.5.1"
105105
pest = "2.7.15"
106106
pest_derive = "2.7.15"

docs/moonblade/functions.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,9 @@ add(trim(name) | len, 2) - Can be used anywhere
252252
## Fuzzy matching & information retrieval
253253

254254
- **fingerprint**(*string*) -> `string`: Fingerprint a string by normalizing characters, re-ordering and deduplicating its word tokens before re-joining them by spaces.
255+
- **soundex**(*name*) -> `string`: Compute the SOUNDEX code (a phonetic encoding) of the given name.
256+
- **refined_soundex**(*name*) -> `string`: Compute the refined SOUNDEX code (a phonetic encoding) of the given name.
257+
- **phonogram**(*name*) -> `string`: Compute the "phonogram" code (yomguithereal's own phonetic encoding) of the given name.
255258
- **carry_stemmer**(*string*) -> `string`: Apply the "Carry" stemmer targeting the French language.
256259
- **s_stemmer**(*string*) -> `string`: Apply a very simple stemmer removing common plural inflexions in some languages.
257260
- **unidecode**(*string*) -> `string`: Convert string to ascii as well as possible.

src/moonblade/doc/functions.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,24 @@
641641
"returns": "string",
642642
"help": "Fingerprint a string by normalizing characters, re-ordering and deduplicating its word tokens before re-joining them by spaces."
643643
},
644+
{
645+
"name": "soundex",
646+
"arguments": ["name"],
647+
"returns": "string",
648+
"help": "Compute the SOUNDEX code (a phonetic encoding) of the given name."
649+
},
650+
{
651+
"name": "refined_soundex",
652+
"arguments": ["name"],
653+
"returns": "string",
654+
"help": "Compute the refined SOUNDEX code (a phonetic encoding) of the given name."
655+
},
656+
{
657+
"name": "phonogram",
658+
"arguments": ["name"],
659+
"returns": "string",
660+
"help": "Compute the \"phonogram\" code (yomguithereal's own phonetic encoding) of the given name."
661+
},
644662
{
645663
"name": "carry_stemmer",
646664
"arguments": ["string"],

src/moonblade/functions.rs

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use lazy_static::lazy_static;
1717
use mime2ext::mime2ext;
1818
use namedlock::{AutoCleanup, LockSpace};
1919
use paltoquet::{
20+
phonetics::{phonogram, refined_soundex, soundex},
2021
stemmers::{fr::carry_stemmer, s_stemmer},
2122
tokenizers::FingerprintTokenizer,
2223
};
@@ -81,7 +82,10 @@ pub fn get_function(name: &str) -> Option<(Function, FunctionArguments)> {
8182
),
8283
"basename" => (basename, FunctionArguments::with_range(1..=2)),
8384
"bytesize" => (bytesize, FunctionArguments::unary()),
84-
"carry_stemmer" => (carry_stemmer_fn, FunctionArguments::unary()),
85+
"carry_stemmer" => (
86+
|args| abstract_unary_string_fn(args, |string| Cow::Owned(carry_stemmer(string))),
87+
FunctionArguments::unary(),
88+
),
8589
"ceil" => (
8690
|args| round_like_op(args, DynamicNumber::ceil),
8791
FunctionArguments::with_range(1..=2),
@@ -227,6 +231,10 @@ pub fn get_function(name: &str) -> Option<(Function, FunctionArguments)> {
227231
"parse_dataurl" => (parse_dataurl, FunctionArguments::unary()),
228232
"parse_json" => (parse_json, FunctionArguments::unary()),
229233
"parse_py_literal" => (parse_py_literal, FunctionArguments::unary()),
234+
"phonogram" => (
235+
|args| abstract_unary_string_fn(args, |string| Cow::Owned(phonogram(string))),
236+
FunctionArguments::unary(),
237+
),
230238
"pjoin" | "pathjoin" => (pathjoin, FunctionArguments::variadic(2)),
231239
"pow" => (
232240
|args| binary_arithmetic_op(args, DynamicNumber::pow),
@@ -244,6 +252,10 @@ pub fn get_function(name: &str) -> Option<(Function, FunctionArguments)> {
244252
),
245253
"read_csv" => (read_csv, FunctionArguments::unary()),
246254
"read_json" => (read_json, FunctionArguments::unary()),
255+
"refined_soundex" => (
256+
|args| abstract_unary_string_fn(args, |string| Cow::Owned(refined_soundex(string))),
257+
FunctionArguments::unary(),
258+
),
247259
"regex" => (parse_regex, FunctionArguments::unary()),
248260
"replace" => (replace, FunctionArguments::nary(3)),
249261
"round" => (
@@ -253,6 +265,10 @@ pub fn get_function(name: &str) -> Option<(Function, FunctionArguments)> {
253265
"shell" => (shell, FunctionArguments::unary()),
254266
"shlex_split" => (shlex_split, FunctionArguments::unary()),
255267
"slice" => (slice, FunctionArguments::with_range(2..=3)),
268+
"soundex" => (
269+
|args| abstract_unary_string_fn(args, |string| Cow::Owned(soundex(string))),
270+
FunctionArguments::unary(),
271+
),
256272
"split" => (split, FunctionArguments::with_range(2..=3)),
257273
"sqrt" => (
258274
|args| unary_arithmetic_op(args, DynamicNumber::sqrt),
@@ -272,7 +288,10 @@ pub fn get_function(name: &str) -> Option<(Function, FunctionArguments)> {
272288
FunctionArguments::variadic(2),
273289
),
274290
"sum" => (sum, FunctionArguments::unary()),
275-
"s_stemmer" => (s_stemmer_fn, FunctionArguments::unary()),
291+
"s_stemmer" => (
292+
|args| abstract_unary_string_fn(args, s_stemmer),
293+
FunctionArguments::unary(),
294+
),
276295
"eq" => (
277296
|args| sequence_compare(args, Ordering::is_eq),
278297
FunctionArguments::binary(),
@@ -1702,16 +1721,13 @@ fn fingerprint(args: BoundArguments) -> FunctionResult {
17021721
))
17031722
}
17041723

1705-
fn s_stemmer_fn(args: BoundArguments) -> FunctionResult {
1706-
let string = args.get1().try_as_str()?;
1707-
1708-
Ok(DynamicValue::from(s_stemmer(&string)))
1709-
}
1710-
1711-
fn carry_stemmer_fn(args: BoundArguments) -> FunctionResult {
1724+
fn abstract_unary_string_fn<F>(args: BoundArguments, function: F) -> FunctionResult
1725+
where
1726+
F: FnOnce(&str) -> Cow<str>,
1727+
{
17121728
let string = args.get1().try_as_str()?;
17131729

1714-
Ok(DynamicValue::from(carry_stemmer(&string)))
1730+
Ok(DynamicValue::from(function(&string)))
17151731
}
17161732

17171733
// Utils

0 commit comments

Comments
 (0)