Skip to content

Commit 19e9f10

Browse files
authored
Merge pull request #31 from Quad9DNS/feature/regex-rule
Add regex rule
2 parents a1d319a + 469bfbf commit 19e9f10

File tree

14 files changed

+211
-16
lines changed

14 files changed

+211
-16
lines changed

Cargo.lock

Lines changed: 6 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ lazy_static = { version = "1.5.0" }
2828
metrics = { version = "0.24.2" }
2929
metrics-exporter-prometheus = { version = "0.17.0" }
3030
metrics-util = { version = "0.19.1" }
31+
regex = { version = "1.12.3" }
3132
rdkafka = { version = "0.37.0" }
3233
rphonetic = { version = "3.0.3" }
3334
serde = { version = "1.0.219" }

README.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ Stringsimile supports different inputs, outputs and rules to use when comparing
4646
- NYSIIS
4747
- Match Rating
4848
- Bitflip
49+
- Regex
4950

5051
Check out the [example rules file](./distribution/rules/example.json) to see how they can be defined. You can also check out the included man pages (`man 5 stringsimile-rule-config`).
5152

bin/stringsimile-service/tests/basic_file_test.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ const INPUT_DATA: &[u8] =
2323
"#;
2424

2525
const RULES_DATA: &[u8] = br#"
26-
{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "split_target": true, "ignore_tld": true, "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip" } ] } ] }
26+
{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "split_target": true, "ignore_tld": true, "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip" }, { "rule_type": "regex", "values": { "pattern": "test" } } ] } ] }
2727
"#;
2828

2929
#[test]

crates/stringsimile-config/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ version.workspace = true
99
stringsimile-matcher.workspace = true
1010

1111
hashbrown.workspace = true
12+
regex.workspace = true
1213
serde = { workspace = true, features = ["derive"] }
1314
serde_json.workspace = true
1415
snafu.workspace = true

crates/stringsimile-config/src/rules.rs

Lines changed: 70 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
//! Configuration for rules
2+
use regex::Regex;
23
use serde::{Deserialize, Serialize};
3-
use snafu::Snafu;
4+
use snafu::{ResultExt, Snafu};
45
use stringsimile_matcher::{
56
Error,
67
rule::{GenericMatcherRule, IntoGenericMatcherRule},
@@ -15,6 +16,7 @@ use stringsimile_matcher::{
1516
match_rating::MatchRatingRule,
1617
metaphone::{MetaphoneRule, MetaphoneRuleType},
1718
nysiis::NysiisRule,
19+
regex::RegexRule,
1820
soundex::{SoundexRule, SoundexRuleType},
1921
},
2022
};
@@ -45,6 +47,8 @@ pub enum RuleConfig {
4547
MatchRating,
4648
/// Configuration for Bitflip rule
4749
Bitflip(Option<BitflipConfig>),
50+
/// Configuration for Regex rule
51+
Regex(RegexConfig),
4852
}
4953

5054
/// Errors for rule configuration
@@ -98,6 +102,13 @@ pub enum RuleConfigError {
98102
/// Position at which non-ASCII char was found.
99103
index: usize,
100104
},
105+
106+
/// Regex rule configuration error
107+
#[snafu(display("Invalid pattern for Regex rule: {}", source))]
108+
RegexInvalidPattern {
109+
/// Regex error.
110+
source: regex::Error,
111+
},
101112
}
102113

103114
impl RuleConfig {
@@ -146,6 +157,9 @@ impl RuleConfig {
146157
.build(target_str)?
147158
.into_generic_matcher(),
148159
),
160+
RuleConfig::Regex(regex_config) => {
161+
Box::new(regex_config.build()?.into_generic_matcher())
162+
}
149163
})
150164
}
151165
}
@@ -421,6 +435,21 @@ impl Default for BitflipConfig {
421435
}
422436
}
423437

438+
/// Configuration for Regex rule
439+
#[derive(Debug, Clone, Serialize, Deserialize)]
440+
pub struct RegexConfig {
441+
/// Regex pattern to match against.
442+
pub pattern: String,
443+
}
444+
445+
impl RegexConfig {
446+
fn build(&self) -> Result<RegexRule, Error> {
447+
Ok(RegexRule::new(
448+
Regex::new(&self.pattern).context(RegexInvalidPatternSnafu)?,
449+
))
450+
}
451+
}
452+
424453
#[cfg(test)]
425454
mod tests {
426455
use super::*;
@@ -802,4 +831,44 @@ mod tests {
802831
assert_eq!(input_str, "abcčćddžđ");
803832
assert_eq!(*index, 3);
804833
}
834+
835+
#[test]
836+
fn test_parse_regex() {
837+
let json = r#"
838+
{
839+
"rule_type": "regex",
840+
"values": {
841+
"pattern": "test"
842+
}
843+
}
844+
"#;
845+
846+
let RuleConfig::Regex(config) = serde_json::from_str(json).unwrap() else {
847+
panic!("Expected Regex config");
848+
};
849+
assert_eq!(config.pattern, "test");
850+
851+
let res = config.build();
852+
assert!(res.is_ok());
853+
}
854+
855+
#[test]
856+
fn test_parse_regex_invalid_pattern() {
857+
let json = r#"
858+
{
859+
"rule_type": "regex",
860+
"values": {
861+
"pattern": "["
862+
}
863+
}
864+
"#;
865+
866+
let RuleConfig::Regex(config) = serde_json::from_str(json).unwrap() else {
867+
panic!("Expected Regex config");
868+
};
869+
assert_eq!(config.pattern, "[");
870+
871+
let res = config.build();
872+
assert!(res.is_err());
873+
}
805874
}

crates/stringsimile-matcher/Cargo.toml

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,24 +7,26 @@ version.workspace = true
77

88
[features]
99
default = ["all"]
10-
all = ["rules-levenshtein", "rules-hamming", "rules-jaro", "rules-jaro-winkler", "rules-confusables", "rules-damerau-levenshtein", "rules-soundex", "rules-metaphone", "rules-nysiis", "rules-match-rating", "rules-bitflip"]
11-
rules-levenshtein = ["dep:triple_accel"]
10+
all = ["rules-levenshtein", "rules-hamming", "rules-jaro", "rules-jaro-winkler", "rules-confusables", "rules-damerau-levenshtein", "rules-soundex", "rules-metaphone", "rules-nysiis", "rules-match-rating", "rules-bitflip", "rules-regex"]
11+
rules-bitflip = ["dep:lazy_static"]
12+
rules-confusables = ["dep:confusables"]
13+
rules-damerau-levenshtein = ["dep:triple_accel"]
1214
rules-hamming = ["dep:triple_accel"]
1315
rules-jaro = ["dep:strsim"]
1416
rules-jaro-winkler = ["dep:strsim"]
15-
rules-confusables = ["dep:confusables"]
16-
rules-damerau-levenshtein = ["dep:triple_accel"]
17-
rules-soundex = ["dep:rphonetic"]
17+
rules-levenshtein = ["dep:triple_accel"]
18+
rules-match-rating = ["dep:rphonetic"]
1819
rules-metaphone = ["dep:rphonetic"]
1920
rules-nysiis = ["dep:rphonetic"]
20-
rules-match-rating = ["dep:rphonetic"]
21-
rules-bitflip = ["dep:lazy_static"]
21+
rules-regex = ["dep:regex"]
22+
rules-soundex = ["dep:rphonetic"]
2223

2324
[dependencies]
2425
confusables = { workspace = true, optional = true }
26+
hashbrown.workspace = true
2527
lazy_static = { workspace = true, optional = true }
2628
metrics.workspace = true
27-
hashbrown.workspace = true
29+
regex = { workspace = true, optional = true }
2830
rphonetic = { workspace = true, optional = true }
2931
serde = { workspace = true, features = ["derive"] }
3032
serde_json.workspace = true

crates/stringsimile-matcher/benches/rules.rs

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
use criterion::{Criterion, criterion_group, criterion_main};
2+
use regex::Regex;
23
use stringsimile_matcher::{
34
rule::MatcherRule,
45
rules::{
@@ -12,6 +13,7 @@ use stringsimile_matcher::{
1213
match_rating::MatchRatingRule,
1314
metaphone::{MetaphoneRule, MetaphoneRuleType},
1415
nysiis::NysiisRule,
16+
regex::RegexRule,
1517
soundex::{SoundexRule, SoundexRuleType},
1618
},
1719
};
@@ -330,6 +332,24 @@ bench_rule! {
330332
}
331333
}
332334

335+
bench_rule! {
336+
name = regex_exact_match;
337+
single_match = "randomWstring_to_find";
338+
single_mismatch = "some different string";
339+
builder {
340+
RegexRule::new(Regex::new("randomWstring_to_find").unwrap())
341+
}
342+
}
343+
344+
bench_rule! {
345+
name = regex_complex_pattern;
346+
single_match = "randomWstring_to_find";
347+
single_mismatch = "some different string";
348+
builder {
349+
RegexRule::new(Regex::new(r#"\w*_.*"#).unwrap())
350+
}
351+
}
352+
333353
criterion_group!(
334354
benches,
335355
confusables,
@@ -351,5 +371,7 @@ criterion_group!(
351371
bitflip_case_insensitive,
352372
bitflip_ascii_printable,
353373
bitflip_ascii_printable_case_insensitive,
374+
regex_exact_match,
375+
regex_complex_pattern,
354376
);
355377
criterion_main!(benches);

crates/stringsimile-matcher/benches/rulesets.rs

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
use criterion::{Criterion, criterion_group, criterion_main};
2+
use regex::Regex;
23
use stringsimile_matcher::{
34
preprocessors::{Preprocessor, SplitTargetConfig},
45
rule::IntoGenericMatcherRule,
@@ -13,6 +14,7 @@ use stringsimile_matcher::{
1314
match_rating::MatchRatingRule,
1415
metaphone::{MetaphoneRule, MetaphoneRuleType},
1516
nysiis::NysiisRule,
17+
regex::RegexRule,
1618
soundex::{SoundexRule, SoundexRuleType},
1719
},
1820
ruleset::{RuleSet, StringGroup, StringGroupContext},
@@ -201,6 +203,7 @@ bench_ruleset! {
201203
Box::new(SoundexRule::new(SoundexRuleType::Normal, 5, target_str).into_generic_matcher()),
202204
Box::new(SoundexRule::new(SoundexRuleType::Refined, 5, target_str).into_generic_matcher()),
203205
Box::new(BitflipRule::new_dns(target_str, true)),
206+
Box::new(RegexRule::new(Regex::new(target_str).unwrap())),
204207
]
205208
}])
206209
}
@@ -233,6 +236,7 @@ bench_ruleset! {
233236
Box::new(SoundexRule::new(SoundexRuleType::Normal, 5, target_str).into_generic_matcher()),
234237
Box::new(SoundexRule::new(SoundexRuleType::Refined, 5, target_str).into_generic_matcher()),
235238
Box::new(BitflipRule::new_dns(target_str, true)),
239+
Box::new(RegexRule::new(Regex::new(target_str).unwrap())),
236240
]
237241
}])
238242
}
@@ -265,6 +269,7 @@ bench_ruleset! {
265269
Box::new(SoundexRule::new(SoundexRuleType::Normal, 5, target_str).into_generic_matcher()),
266270
Box::new(SoundexRule::new(SoundexRuleType::Refined, 5, target_str).into_generic_matcher()),
267271
Box::new(BitflipRule::new_dns(target_str, true)),
272+
Box::new(RegexRule::new(Regex::new(target_str).unwrap())),
268273
]
269274
}])
270275
}

crates/stringsimile-matcher/src/rules/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,5 +20,7 @@ pub mod match_rating;
2020
pub mod metaphone;
2121
#[cfg(feature = "rules-nysiis")]
2222
pub mod nysiis;
23+
#[cfg(feature = "rules-regex")]
24+
pub mod regex;
2325
#[cfg(feature = "rules-soundex")]
2426
pub mod soundex;

0 commit comments

Comments
 (0)