Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ lazy_static = { version = "1.5.0" }
metrics = { version = "0.24.2" }
metrics-exporter-prometheus = { version = "0.17.0" }
metrics-util = { version = "0.19.1" }
regex = { version = "1.12.3" }
rdkafka = { version = "0.37.0" }
rphonetic = { version = "3.0.3" }
serde = { version = "1.0.219" }
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Stringsimile supports different inputs, outputs and rules to use when comparing
- NYSIIS
- Match Rating
- Bitflip
- Regex

Check out the [example rules file](./distribution/rules/example.json) to see how they can be defined. You can also check out the included man pages (`man 5 stringsimile-rule-config`).

Expand Down
2 changes: 1 addition & 1 deletion bin/stringsimile-service/tests/basic_file_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ const INPUT_DATA: &[u8] =
"#;

const RULES_DATA: &[u8] = br#"
{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "split_target": true, "ignore_tld": true, "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip" } ] } ] }
{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "split_target": true, "ignore_tld": true, "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip" }, { "rule_type": "regex", "values": { "pattern": "test" } } ] } ] }
"#;

#[test]
Expand Down
1 change: 1 addition & 0 deletions crates/stringsimile-config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ version.workspace = true
stringsimile-matcher.workspace = true

hashbrown.workspace = true
regex.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
snafu.workspace = true
Expand Down
71 changes: 70 additions & 1 deletion crates/stringsimile-config/src/rules.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Configuration for rules
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::Snafu;
use snafu::{ResultExt, Snafu};
use stringsimile_matcher::{
Error,
rule::{GenericMatcherRule, IntoGenericMatcherRule},
Expand All @@ -15,6 +16,7 @@ use stringsimile_matcher::{
match_rating::MatchRatingRule,
metaphone::{MetaphoneRule, MetaphoneRuleType},
nysiis::NysiisRule,
regex::RegexRule,
soundex::{SoundexRule, SoundexRuleType},
},
};
Expand Down Expand Up @@ -45,6 +47,8 @@ pub enum RuleConfig {
MatchRating,
/// Configuration for Bitflip rule
Bitflip(Option<BitflipConfig>),
/// Configuration for Regex rule
Regex(RegexConfig),
}

/// Errors for rule configuration
Expand Down Expand Up @@ -98,6 +102,13 @@ pub enum RuleConfigError {
/// Position at which non-ASCII char was found.
index: usize,
},

/// Regex rule configuration error
#[snafu(display("Invalid pattern for Regex rule: {}", source))]
RegexInvalidPattern {
/// Regex error.
source: regex::Error,
},
}

impl RuleConfig {
Expand Down Expand Up @@ -146,6 +157,9 @@ impl RuleConfig {
.build(target_str)?
.into_generic_matcher(),
),
RuleConfig::Regex(regex_config) => {
Box::new(regex_config.build()?.into_generic_matcher())
}
})
}
}
Expand Down Expand Up @@ -421,6 +435,21 @@ impl Default for BitflipConfig {
}
}

/// Configuration for Regex rule
///
/// Stores the pattern as plain text so that the configuration can be
/// (de)serialized. The pattern is only compiled when the rule is built, which
/// means a syntactically invalid pattern still deserializes successfully and
/// is rejected at build time instead.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegexConfig {
/// Source text of the regular expression; compiled with `Regex::new` when
/// the rule is built.
pub pattern: String,
}

impl RegexConfig {
    /// Compiles the configured pattern and wraps it in a [`RegexRule`].
    ///
    /// # Errors
    ///
    /// Fails when `pattern` is not a valid regular expression. The underlying
    /// `regex::Error` is attached as the source of the `RegexInvalidPattern`
    /// configuration error before it is propagated with `?`.
    fn build(&self) -> Result<RegexRule, Error> {
        let compiled = Regex::new(&self.pattern).context(RegexInvalidPatternSnafu)?;
        Ok(RegexRule::new(compiled))
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -802,4 +831,44 @@ mod tests {
assert_eq!(input_str, "abcčćddžđ");
assert_eq!(*index, 3);
}

// Happy path: a config with a valid pattern deserializes into the `Regex`
// variant, keeps the pattern text unchanged, and builds without error.
#[test]
fn test_parse_regex() {
let json = r#"
{
"rule_type": "regex",
"values": {
"pattern": "test"
}
}
"#;

// Any other variant means the `rule_type` tag was not recognised as regex.
let RuleConfig::Regex(config) = serde_json::from_str(json).unwrap() else {
panic!("Expected Regex config");
};
assert_eq!(config.pattern, "test");

// Building compiles the pattern; "test" is a valid regular expression.
let res = config.build();
assert!(res.is_ok());
}

// Error path: an invalid pattern is still accepted by deserialization (it is
// just a string at that point) and must only be rejected when the rule is built.
#[test]
fn test_parse_regex_invalid_pattern() {
let json = r#"
{
"rule_type": "regex",
"values": {
"pattern": "["
}
}
"#;

// Deserialization succeeds because the pattern is not compiled here.
let RuleConfig::Regex(config) = serde_json::from_str(json).unwrap() else {
panic!("Expected Regex config");
};
assert_eq!(config.pattern, "[");

// "[" opens a character class that is never closed, so compilation fails.
let res = config.build();
assert!(res.is_err());
}
}
18 changes: 10 additions & 8 deletions crates/stringsimile-matcher/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,26 @@ version.workspace = true

[features]
default = ["all"]
all = ["rules-levenshtein", "rules-hamming", "rules-jaro", "rules-jaro-winkler", "rules-confusables", "rules-damerau-levenshtein", "rules-soundex", "rules-metaphone", "rules-nysiis", "rules-match-rating", "rules-bitflip"]
rules-levenshtein = ["dep:triple_accel"]
all = ["rules-levenshtein", "rules-hamming", "rules-jaro", "rules-jaro-winkler", "rules-confusables", "rules-damerau-levenshtein", "rules-soundex", "rules-metaphone", "rules-nysiis", "rules-match-rating", "rules-bitflip", "rules-regex"]
rules-bitflip = ["dep:lazy_static"]
rules-confusables = ["dep:confusables"]
rules-damerau-levenshtein = ["dep:triple_accel"]
rules-hamming = ["dep:triple_accel"]
rules-jaro = ["dep:strsim"]
rules-jaro-winkler = ["dep:strsim"]
rules-confusables = ["dep:confusables"]
rules-damerau-levenshtein = ["dep:triple_accel"]
rules-soundex = ["dep:rphonetic"]
rules-levenshtein = ["dep:triple_accel"]
rules-match-rating = ["dep:rphonetic"]
rules-metaphone = ["dep:rphonetic"]
rules-nysiis = ["dep:rphonetic"]
rules-match-rating = ["dep:rphonetic"]
rules-bitflip = ["dep:lazy_static"]
rules-regex = ["dep:regex"]
rules-soundex = ["dep:rphonetic"]

[dependencies]
confusables = { workspace = true, optional = true }
hashbrown.workspace = true
lazy_static = { workspace = true, optional = true }
metrics.workspace = true
hashbrown.workspace = true
regex = { workspace = true, optional = true }
rphonetic = { workspace = true, optional = true }
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
Expand Down
22 changes: 22 additions & 0 deletions crates/stringsimile-matcher/benches/rules.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use criterion::{Criterion, criterion_group, criterion_main};
use regex::Regex;
use stringsimile_matcher::{
rule::MatcherRule,
rules::{
Expand All @@ -12,6 +13,7 @@ use stringsimile_matcher::{
match_rating::MatchRatingRule,
metaphone::{MetaphoneRule, MetaphoneRuleType},
nysiis::NysiisRule,
regex::RegexRule,
soundex::{SoundexRule, SoundexRuleType},
},
};
Expand Down Expand Up @@ -330,6 +332,24 @@ bench_rule! {
}
}

// Benchmarks `RegexRule` with a pattern made only of literal characters. The
// pattern text equals `single_match`, and `single_mismatch` does not contain it.
// NOTE(review): how `bench_rule!` uses `single_match` / `single_mismatch` is
// defined by the macro elsewhere in this file - confirm there.
bench_rule! {
name = regex_exact_match;
single_match = "randomWstring_to_find";
single_mismatch = "some different string";
builder {
RegexRule::new(Regex::new("randomWstring_to_find").unwrap())
}
}

// Benchmarks `RegexRule` with a pattern that uses repetition (`\w*`, `.*`)
// around a literal underscore. `single_match` contains an underscore;
// `single_mismatch` contains none, so the pattern cannot match it.
// NOTE(review): how `bench_rule!` uses `single_match` / `single_mismatch` is
// defined by the macro elsewhere in this file - confirm there.
bench_rule! {
name = regex_complex_pattern;
single_match = "randomWstring_to_find";
single_mismatch = "some different string";
builder {
RegexRule::new(Regex::new(r#"\w*_.*"#).unwrap())
}
}

criterion_group!(
benches,
confusables,
Expand All @@ -351,5 +371,7 @@ criterion_group!(
bitflip_case_insensitive,
bitflip_ascii_printable,
bitflip_ascii_printable_case_insensitive,
regex_exact_match,
regex_complex_pattern,
);
criterion_main!(benches);
5 changes: 5 additions & 0 deletions crates/stringsimile-matcher/benches/rulesets.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use criterion::{Criterion, criterion_group, criterion_main};
use regex::Regex;
use stringsimile_matcher::{
preprocessors::{Preprocessor, SplitTargetConfig},
rule::IntoGenericMatcherRule,
Expand All @@ -13,6 +14,7 @@ use stringsimile_matcher::{
match_rating::MatchRatingRule,
metaphone::{MetaphoneRule, MetaphoneRuleType},
nysiis::NysiisRule,
regex::RegexRule,
soundex::{SoundexRule, SoundexRuleType},
},
ruleset::{RuleSet, StringGroup, StringGroupContext},
Expand Down Expand Up @@ -201,6 +203,7 @@ bench_ruleset! {
Box::new(SoundexRule::new(SoundexRuleType::Normal, 5, target_str).into_generic_matcher()),
Box::new(SoundexRule::new(SoundexRuleType::Refined, 5, target_str).into_generic_matcher()),
Box::new(BitflipRule::new_dns(target_str, true)),
Box::new(RegexRule::new(Regex::new(target_str).unwrap())),
]
}])
}
Expand Down Expand Up @@ -233,6 +236,7 @@ bench_ruleset! {
Box::new(SoundexRule::new(SoundexRuleType::Normal, 5, target_str).into_generic_matcher()),
Box::new(SoundexRule::new(SoundexRuleType::Refined, 5, target_str).into_generic_matcher()),
Box::new(BitflipRule::new_dns(target_str, true)),
Box::new(RegexRule::new(Regex::new(target_str).unwrap())),
]
}])
}
Expand Down Expand Up @@ -265,6 +269,7 @@ bench_ruleset! {
Box::new(SoundexRule::new(SoundexRuleType::Normal, 5, target_str).into_generic_matcher()),
Box::new(SoundexRule::new(SoundexRuleType::Refined, 5, target_str).into_generic_matcher()),
Box::new(BitflipRule::new_dns(target_str, true)),
Box::new(RegexRule::new(Regex::new(target_str).unwrap())),
]
}])
}
Expand Down
2 changes: 2 additions & 0 deletions crates/stringsimile-matcher/src/rules/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,7 @@ pub mod match_rating;
pub mod metaphone;
#[cfg(feature = "rules-nysiis")]
pub mod nysiis;
#[cfg(feature = "rules-regex")]
pub mod regex;
#[cfg(feature = "rules-soundex")]
pub mod soundex;
80 changes: 80 additions & 0 deletions crates/stringsimile-matcher/src/rules/regex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
//! Regex rule implementation

use std::{fmt::Debug, io::Error};

use regex::Regex;
use serde::{Deserialize, Serialize};

use crate::{
MatcherResult,
rule::{MatcherResultRuleMetadataExt, MatcherRule, RuleMetadata},
};

/// Matcher rule that tests input strings against a pre-compiled regular
/// expression.
#[derive(Debug, Clone)]
pub struct RegexRule {
// Compiled by the caller before construction and reused for every
// `match_rule` call.
pattern: Regex,
}

impl RegexRule {
/// Creates a new instance of [`RegexRule`], with compiled pattern.
pub fn new(pattern: Regex) -> Self {
Self { pattern }
}
}

/// Metadata attached to the results produced by [`RegexRule`].
///
/// This is a unit struct: it carries no fields of its own.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegexMetadata;

impl MatcherRule for RegexRule {
    type OutputMetadata = RegexMetadata;
    // `Error` is `std::io::Error`, as imported at the top of this file; no
    // code path in `match_rule` constructs one.
    type Error = Error;

    /// Reports a match when the compiled pattern matches `input_str`.
    ///
    /// The check uses `Regex::is_match`, which searches the whole input, so
    /// the pattern is not implicitly anchored: use `^` / `$` in the pattern
    /// to require a match at the start or end.
    ///
    /// `_target_str` is ignored; only the compiled pattern decides the
    /// outcome.
    fn match_rule(
        &self,
        input_str: &str,
        _target_str: &str,
    ) -> MatcherResult<Self::OutputMetadata, Self::Error> {
        if !self.pattern.is_match(input_str) {
            return MatcherResult::new_no_match(RegexMetadata);
        }

        MatcherResult::new_match(RegexMetadata)
    }
}

// Associates the rule name "regex" with `RegexMetadata`.
impl RuleMetadata for RegexMetadata {
const RULE_NAME: &str = "regex";
}

#[cfg(test)]
mod tests {
    use crate::rule::MatcherResultExt;

    use super::*;

    /// Target string handed to every call; `RegexRule` does not read it.
    const TARGET: &str = "netflix.com.";

    /// Checks the rule's verdict for each `(input, expected_match)` pair.
    fn assert_verdicts(rule: &RegexRule, cases: &[(&str, bool)]) {
        for &(input, expected) in cases {
            let result = rule.match_rule(input, TARGET);
            assert_eq!(result.is_match(), expected, "input: {input:?}");
        }
    }

    /// A pattern anchored at the end only matches inputs with the exact suffix.
    #[test]
    fn simple_example() {
        let rule = RegexRule::new(Regex::new(r#"netflix\.com\.$"#).unwrap());

        assert_verdicts(
            &rule,
            &[
                ("netflix.com.", true),
                // Missing trailing dot.
                ("netflix.com", false),
                // Transposed letters.
                ("neftlix.com.", false),
            ],
        );
    }

    /// A wildcard pattern matches whenever `n`, `t`, `f` appear in that order.
    #[test]
    fn complex_pattern_example() {
        let rule = RegexRule::new(Regex::new(r#".*n.*t.*f.*"#).unwrap());

        assert_verdicts(
            &rule,
            &[
                ("netflix.com.", true),
                ("netflix.com", true),
                // Here `f` comes before `t`, and no `f` follows the `t`.
                ("neftlix.com.", false),
            ],
        );
    }
}
Loading
Loading