Skip to content

Commit 702cccc

Browse files
authored
Merge pull request #33 from Quad9DNS/feature/substring-levenshtein-variants
Add substring variants of levenshtein and damerau-levenshtein rules
2 parents b15aa22 + 6cd3a6f commit 702cccc

File tree

10 files changed

+350
-10
lines changed

10 files changed

+350
-10
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,12 @@ Stringsimile supports different inputs, outputs and rules to use when comparing
3636
### Supported rules
3737

3838
- Levenshtein
39+
- Levenshtein substring variant
3940
- Jaro
4041
- Jaro-Winkler
4142
- IDN Confusables
4243
- Damerau-Levenshtein
44+
- Damerau-Levenshtein substring variant
4345
- Hamming
4446
- Soundex
4547
- Metaphone

bin/stringsimile-service/tests/basic_file_test.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ const INPUT_DATA: &[u8] =
2323
"#;
2424

2525
const RULES_DATA: &[u8] = br#"
26-
{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "split_target": true, "ignore_tld": true, "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip" }, { "rule_type": "regex", "values": { "pattern": "test" } }, { "rule_type": "cidr", "values": { "address": "192.168.0.0/24" } } ] } ] }
26+
{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "levenshtein_substring", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "damerau_levenshtein_substring", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "split_target": true, "ignore_tld": true, "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip" }, { "rule_type": "regex", "values": { "pattern": "test" } }, { "rule_type": "cidr", "values": { "address": "192.168.0.0/24" } } ] } ] }
2727
"#;
2828

2929
#[test]

crates/stringsimile-config/src/rules.rs

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ use stringsimile_matcher::{
99
bitflip::BitflipRule,
1010
cidr::CidrRule,
1111
confusables::ConfusablesRule,
12-
damerau_levenshtein::DamerauLevenshteinRule,
12+
damerau_levenshtein::{DamerauLevenshteinRule, DamerauLevenshteinSubstringRule},
1313
hamming::HammingRule,
1414
jaro::JaroRule,
1515
jaro_winkler::JaroWinklerRule,
16-
levenshtein::LevenshteinRule,
16+
levenshtein::{LevenshteinRule, LevenshteinSubstringRule},
1717
match_rating::MatchRatingRule,
1818
metaphone::{MetaphoneRule, MetaphoneRuleType},
1919
nysiis::NysiisRule,
@@ -52,12 +52,16 @@ impl From<&CommonRuleConfig> for stringsimile_matcher::ruleset::CommonRuleConfig
5252
pub enum RuleTypeConfig {
5353
/// Configuration for Levenshtein rule
5454
Levenshtein(LevenshteinConfig),
55+
/// Configuration for Levenshtein substring rule
56+
LevenshteinSubstring(LevenshteinSubstringConfig),
5557
/// Configuration for Hamming rule
5658
Hamming(HammingConfig),
5759
/// Configuration for Confusables rule
5860
Confusables,
5961
/// Configuration for Damerau Levenshtein rule
6062
DamerauLevenshtein(DamerauLevenshteinConfig),
63+
/// Configuration for Damerau Levenshtein substring rule
64+
DamerauLevenshteinSubstring(DamerauLevenshteinSubstringConfig),
6165
/// Configuration for Jaro rule
6266
Jaro(JaroConfig),
6367
/// Configuration for Jaro-Winkler rule
@@ -169,6 +173,9 @@ impl RuleConfig {
169173
.build(ignore_mismatch_metadata)?
170174
.into_generic_matcher(),
171175
),
176+
RuleTypeConfig::LevenshteinSubstring(levenshtein_substring_config) => {
177+
Box::new(levenshtein_substring_config.build()?.into_generic_matcher())
178+
}
172179
RuleTypeConfig::Hamming(hamming_config) => {
173180
Box::new(hamming_config.build()?.into_generic_matcher())
174181
}
@@ -184,6 +191,9 @@ impl RuleConfig {
184191
RuleTypeConfig::DamerauLevenshtein(damerau_levenshtein_config) => {
185192
Box::new(damerau_levenshtein_config.build(ignore_mismatch_metadata)?)
186193
}
194+
RuleTypeConfig::DamerauLevenshteinSubstring(
195+
damerau_levenshtein_substring_config,
196+
) => Box::new(damerau_levenshtein_substring_config.build()?),
187197
RuleTypeConfig::Soundex(soundex_config) => {
188198
Box::new(soundex_config.build(target_str)?.into_generic_matcher())
189199
}
@@ -230,6 +240,21 @@ impl LevenshteinConfig {
230240
}
231241
}
232242

243+
/// Configuration for Levenshtein substring rule
244+
#[derive(Debug, Clone, Serialize, Deserialize)]
245+
pub struct LevenshteinSubstringConfig {
246+
/// Maximum distance
247+
pub maximum_distance: u32,
248+
}
249+
250+
impl LevenshteinSubstringConfig {
251+
fn build(&self) -> Result<LevenshteinSubstringRule, Error> {
252+
Ok(LevenshteinSubstringRule {
253+
maximum_distance: self.maximum_distance,
254+
})
255+
}
256+
}
257+
233258
/// Configuration for Hamming rule
234259
#[derive(Debug, Clone, Serialize, Deserialize)]
235260
pub struct HammingConfig {
@@ -271,6 +296,21 @@ impl DamerauLevenshteinConfig {
271296
}
272297
}
273298

299+
/// Configuration for Damerau Levenshtein substring rule
300+
#[derive(Debug, Clone, Serialize, Deserialize)]
301+
pub struct DamerauLevenshteinSubstringConfig {
302+
/// Maximum distance
303+
pub maximum_distance: u32,
304+
}
305+
306+
impl DamerauLevenshteinSubstringConfig {
307+
fn build(&self) -> Result<DamerauLevenshteinSubstringRule, Error> {
308+
Ok(DamerauLevenshteinSubstringRule {
309+
maximum_distance: self.maximum_distance,
310+
})
311+
}
312+
}
313+
274314
/// Configuration for Jaro rule
275315
#[derive(Debug, Clone, Serialize, Deserialize)]
276316
pub struct JaroConfig {
@@ -536,6 +576,24 @@ mod tests {
536576
assert_eq!(3, config.maximum_distance);
537577
}
538578

579+
#[test]
580+
fn test_parse_levenshtein_substring() {
581+
let json = r#"
582+
{
583+
"rule_type": "levenshtein_substring",
584+
"values": {
585+
"maximum_distance": 3
586+
}
587+
}
588+
"#;
589+
590+
let RuleTypeConfig::LevenshteinSubstring(config) = serde_json::from_str(json).unwrap()
591+
else {
592+
panic!("Expected Levenshtein substring config");
593+
};
594+
assert_eq!(3, config.maximum_distance);
595+
}
596+
539597
#[test]
540598
fn test_parse_jaro() {
541599
let json = r#"
@@ -583,6 +641,25 @@ mod tests {
583641
assert_eq!(3, config.maximum_distance);
584642
}
585643

644+
#[test]
645+
fn test_parse_damerau_levenshtein_substring() {
646+
let json = r#"
647+
{
648+
"rule_type": "damerau_levenshtein_substring",
649+
"values": {
650+
"maximum_distance": 3
651+
}
652+
}
653+
"#;
654+
655+
let RuleTypeConfig::DamerauLevenshteinSubstring(config) =
656+
serde_json::from_str(json).unwrap()
657+
else {
658+
panic!("Expected Damerau Levenshtein substring config");
659+
};
660+
assert_eq!(3, config.maximum_distance);
661+
}
662+
586663
#[test]
587664
fn test_parse_jaro_winkler() {
588665
let json = r#"

crates/stringsimile-matcher/benches/rules.rs

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ use stringsimile_matcher::{
66
bitflip::BitflipRule,
77
cidr::CidrRule,
88
confusables::ConfusablesRule,
9-
damerau_levenshtein::DamerauLevenshteinRule,
9+
damerau_levenshtein::{DamerauLevenshteinRule, DamerauLevenshteinSubstringRule},
1010
hamming::HammingRule,
1111
jaro::JaroRule,
1212
jaro_winkler::JaroWinklerRule,
13-
levenshtein::LevenshteinRule,
13+
levenshtein::{LevenshteinRule, LevenshteinSubstringRule},
1414
match_rating::MatchRatingRule,
1515
metaphone::{MetaphoneRule, MetaphoneRuleType},
1616
nysiis::NysiisRule,
@@ -189,6 +189,15 @@ bench_rule! {
189189
}
190190
}
191191

192+
bench_rule! {
193+
name = levenshtein_substring;
194+
single_match = "some_string_including_ranodm_string_to_find_inside";
195+
single_mismatch = "some string including some different string inside";
196+
builder {
197+
LevenshteinSubstringRule { maximum_distance: 5 }
198+
}
199+
}
200+
192201
bench_rule! {
193202
name = damerau_levenshtein;
194203
single_match = "ranodm_string_to_find";
@@ -207,6 +216,15 @@ bench_rule! {
207216
}
208217
}
209218

219+
bench_rule! {
220+
name = damerau_levenshtein_substring;
221+
single_match = "some_string_including_ranodm_string_to_find_inside";
222+
single_mismatch = "some string including some different string inside";
223+
builder {
224+
DamerauLevenshteinSubstringRule { maximum_distance: 5 }
225+
}
226+
}
227+
210228
bench_rule! {
211229
name = hamming;
212230
single_match = "ranodm_string_to_find";
@@ -365,8 +383,10 @@ criterion_group!(
365383
confusables,
366384
levenshtein,
367385
levenshtein_optimized_mismatch,
386+
levenshtein_substring,
368387
damerau_levenshtein,
369388
damerau_levenshtein_optimized_mismatch,
389+
damerau_levenshtein_substring,
370390
hamming,
371391
jaro,
372392
jaro_winkler,

crates/stringsimile-matcher/benches/rulesets.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ use stringsimile_matcher::{
77
bitflip::BitflipRule,
88
cidr::CidrRule,
99
confusables::ConfusablesRule,
10-
damerau_levenshtein::DamerauLevenshteinRule,
10+
damerau_levenshtein::{DamerauLevenshteinRule, DamerauLevenshteinSubstringRule},
1111
hamming::HammingRule,
1212
jaro::JaroRule,
1313
jaro_winkler::JaroWinklerRule,
14-
levenshtein::LevenshteinRule,
14+
levenshtein::{LevenshteinRule, LevenshteinSubstringRule},
1515
match_rating::MatchRatingRule,
1616
metaphone::{MetaphoneRule, MetaphoneRuleType},
1717
nysiis::NysiisRule,
@@ -192,7 +192,9 @@ bench_ruleset! {
192192
rules: [
193193
ConfusablesRule.into_generic_matcher().clone_dyn(),
194194
Box::new(LevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()),
195+
Box::new(LevenshteinSubstringRule { maximum_distance: 5 }.into_generic_matcher()),
195196
Box::new(DamerauLevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()),
197+
Box::new(DamerauLevenshteinSubstringRule { maximum_distance: 5 }.into_generic_matcher()),
196198
Box::new(HammingRule { maximum_distance: 5 }.into_generic_matcher()),
197199
Box::new(JaroRule { match_percent: 0.5 }.into_generic_matcher()),
198200
Box::new(JaroWinklerRule { match_percent: 0.5 }.into_generic_matcher()),
@@ -226,7 +228,9 @@ bench_ruleset! {
226228
rules: [
227229
ConfusablesRule.into_generic_matcher().clone_dyn(),
228230
Box::new(LevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()),
231+
Box::new(LevenshteinSubstringRule { maximum_distance: 5 }.into_generic_matcher()),
229232
Box::new(DamerauLevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()),
233+
Box::new(DamerauLevenshteinSubstringRule { maximum_distance: 5 }.into_generic_matcher()),
230234
Box::new(HammingRule { maximum_distance: 5 }.into_generic_matcher()),
231235
Box::new(JaroRule { match_percent: 0.5 }.into_generic_matcher()),
232236
Box::new(JaroWinklerRule { match_percent: 0.5 }.into_generic_matcher()),
@@ -260,7 +264,9 @@ bench_ruleset! {
260264
rules: [
261265
ConfusablesRule.into_generic_matcher().clone_dyn(),
262266
Box::new(LevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()),
267+
Box::new(LevenshteinSubstringRule { maximum_distance: 5 }.into_generic_matcher()),
263268
Box::new(DamerauLevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()),
269+
Box::new(DamerauLevenshteinSubstringRule { maximum_distance: 5 }.into_generic_matcher()),
264270
Box::new(HammingRule { maximum_distance: 5 }.into_generic_matcher()),
265271
Box::new(JaroRule { match_percent: 0.5 }.into_generic_matcher()),
266272
Box::new(JaroWinklerRule { match_percent: 0.5 }.into_generic_matcher()),

0 commit comments

Comments
 (0)