diff --git a/crates/stringsimile-config/src/rules.rs b/crates/stringsimile-config/src/rules.rs index b73a64e..561e27a 100644 --- a/crates/stringsimile-config/src/rules.rs +++ b/crates/stringsimile-config/src/rules.rs @@ -22,10 +22,34 @@ use stringsimile_matcher::{ }, }; +/// Configuration for rules +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RuleConfig { + #[serde(flatten, default)] + pub(crate) common: CommonRuleConfig, + #[serde(flatten)] + pub(crate) rule_type: RuleTypeConfig, +} + +/// Common configuration for rules +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CommonRuleConfig { + #[serde(default)] + pub(crate) exit_on_match: bool, +} + +impl From<&CommonRuleConfig> for stringsimile_matcher::ruleset::CommonRuleConfig { + fn from(value: &CommonRuleConfig) -> Self { + Self { + exit_on_match: value.exit_on_match, + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "rule_type", rename_all = "snake_case", content = "values")] -/// Configuration for rules -pub enum RuleConfig { +/// Configuration for specific rule types +pub enum RuleTypeConfig { /// Configuration for Levenshtein rule Levenshtein(LevenshteinConfig), /// Configuration for Hamming rule @@ -130,48 +154,63 @@ impl RuleConfig { &self, target_str: &str, ignore_mismatch_metadata: bool, - ) -> Result, Error> { - Ok(match self { - RuleConfig::Levenshtein(levenshtein_config) => Box::new( - levenshtein_config - .build(ignore_mismatch_metadata)? 
- .into_generic_matcher(), - ), - RuleConfig::Hamming(hamming_config) => { - Box::new(hamming_config.build()?.into_generic_matcher()) - } - RuleConfig::Confusables => Box::new(ConfusablesConfig.build()?.into_generic_matcher()), - RuleConfig::Jaro(jaro_config) => Box::new(jaro_config.build()?.into_generic_matcher()), - RuleConfig::JaroWinkler(jaro_winkler_config) => { - Box::new(jaro_winkler_config.build()?.into_generic_matcher()) - } - RuleConfig::DamerauLevenshtein(damerau_levenshtein_config) => { - Box::new(damerau_levenshtein_config.build(ignore_mismatch_metadata)?) - } - RuleConfig::Soundex(soundex_config) => { - Box::new(soundex_config.build(target_str)?.into_generic_matcher()) - } - RuleConfig::Metaphone(metaphone_config) => { - Box::new(metaphone_config.build(target_str)?.into_generic_matcher()) - } - RuleConfig::Nysiis(nysiis_config) => { - Box::new(nysiis_config.build(target_str)?.into_generic_matcher()) - } - RuleConfig::MatchRating => { - Box::new(MatchRatingConfig.build(target_str)?.into_generic_matcher()) - } - RuleConfig::Bitflip(bitflip_config) => Box::new( - bitflip_config - .clone() - .unwrap_or_default() - .build(target_str)? - .into_generic_matcher(), - ), - RuleConfig::Regex(regex_config) => { - Box::new(regex_config.build()?.into_generic_matcher()) - } - RuleConfig::Cidr(cidr_config) => Box::new(cidr_config.build()?.into_generic_matcher()), - }) + ) -> Result< + ( + stringsimile_matcher::ruleset::CommonRuleConfig, + Box, + ), + Error, + > { + Ok(( + (&self.common).into(), + match &self.rule_type { + RuleTypeConfig::Levenshtein(levenshtein_config) => Box::new( + levenshtein_config + .build(ignore_mismatch_metadata)? 
+ .into_generic_matcher(), + ), + RuleTypeConfig::Hamming(hamming_config) => { + Box::new(hamming_config.build()?.into_generic_matcher()) + } + RuleTypeConfig::Confusables => { + Box::new(ConfusablesConfig.build()?.into_generic_matcher()) + } + RuleTypeConfig::Jaro(jaro_config) => { + Box::new(jaro_config.build()?.into_generic_matcher()) + } + RuleTypeConfig::JaroWinkler(jaro_winkler_config) => { + Box::new(jaro_winkler_config.build()?.into_generic_matcher()) + } + RuleTypeConfig::DamerauLevenshtein(damerau_levenshtein_config) => { + Box::new(damerau_levenshtein_config.build(ignore_mismatch_metadata)?) + } + RuleTypeConfig::Soundex(soundex_config) => { + Box::new(soundex_config.build(target_str)?.into_generic_matcher()) + } + RuleTypeConfig::Metaphone(metaphone_config) => { + Box::new(metaphone_config.build(target_str)?.into_generic_matcher()) + } + RuleTypeConfig::Nysiis(nysiis_config) => { + Box::new(nysiis_config.build(target_str)?.into_generic_matcher()) + } + RuleTypeConfig::MatchRating => { + Box::new(MatchRatingConfig.build(target_str)?.into_generic_matcher()) + } + RuleTypeConfig::Bitflip(bitflip_config) => Box::new( + bitflip_config + .clone() + .unwrap_or_default() + .build(target_str)? 
+ .into_generic_matcher(), + ), + RuleTypeConfig::Regex(regex_config) => { + Box::new(regex_config.build()?.into_generic_matcher()) + } + RuleTypeConfig::Cidr(cidr_config) => { + Box::new(cidr_config.build()?.into_generic_matcher()) + } + }, + )) } } @@ -491,7 +530,7 @@ mod tests { } "#; - let RuleConfig::Levenshtein(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Levenshtein(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Levenshtein config"); }; assert_eq!(3, config.maximum_distance); @@ -508,7 +547,7 @@ mod tests { } "#; - let RuleConfig::Jaro(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Jaro(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Jaro config"); }; assert_eq!(0.4, config.match_percent_threshold); @@ -522,7 +561,7 @@ mod tests { } "#; - let RuleConfig::Confusables = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Confusables = serde_json::from_str(json).unwrap() else { panic!("Expected Confusables config"); }; } @@ -538,7 +577,7 @@ mod tests { } "#; - let RuleConfig::DamerauLevenshtein(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::DamerauLevenshtein(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Damera Levenshtein config"); }; assert_eq!(3, config.maximum_distance); @@ -555,7 +594,7 @@ mod tests { } "#; - let RuleConfig::JaroWinkler(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::JaroWinkler(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Jaro-Winkler config"); }; assert_eq!(0.4, config.match_percent_threshold); @@ -572,7 +611,7 @@ mod tests { } "#; - let RuleConfig::Hamming(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Hamming(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Hamming config"); }; assert_eq!(3, config.maximum_distance); @@ -590,7 +629,7 @@ mod tests { } "#; - let 
RuleConfig::Soundex(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Soundex(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Soundex config"); }; assert_eq!(3, config.minimum_similarity); @@ -608,7 +647,7 @@ mod tests { } "#; - let RuleConfig::Soundex(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Soundex(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Soundex config"); }; assert_eq!(3, config.minimum_similarity); @@ -627,7 +666,7 @@ mod tests { } "#; - let RuleConfig::Soundex(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Soundex(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Soundex config"); }; assert_eq!(3, config.minimum_similarity); @@ -646,7 +685,7 @@ mod tests { } "#; - let RuleConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Metaphone config"); }; assert_eq!(Some(3), config.max_code_length); @@ -662,7 +701,7 @@ mod tests { } "#; - let RuleConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Metaphone config"); }; assert_eq!(Some(4), config.max_code_length); @@ -680,7 +719,7 @@ mod tests { } "#; - let RuleConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Metaphone config"); }; assert_eq!(None, config.max_code_length); @@ -698,7 +737,7 @@ mod tests { } "#; - let RuleConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Metaphone config"); }; assert_eq!(default_metaphone_max_code_length(), config.max_code_length); @@ -717,7 +756,7 @@ 
mod tests { } "#; - let RuleConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Metaphone(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Metaphone config"); }; assert_eq!(Some(3), config.max_code_length); @@ -735,7 +774,7 @@ mod tests { } "#; - let RuleConfig::Nysiis(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Nysiis(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Nysiis config"); }; assert!(!config.strict); @@ -750,7 +789,7 @@ mod tests { } "#; - let RuleConfig::Nysiis(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Nysiis(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Nysiis config"); }; assert!(config.strict); @@ -764,7 +803,7 @@ mod tests { } "#; - let RuleConfig::MatchRating = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::MatchRating = serde_json::from_str(json).unwrap() else { panic!("Expected Match Rating config"); }; } @@ -777,7 +816,7 @@ mod tests { } "#; - let RuleConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Biflip config"); }; let config = config.unwrap_or_default(); @@ -798,7 +837,7 @@ mod tests { } "#; - let RuleConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Biflip config"); }; let config = config.unwrap_or_default(); @@ -820,7 +859,7 @@ mod tests { } "#; - let RuleConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Biflip config"); }; let config = config.unwrap_or_default(); @@ -842,7 +881,7 @@ mod tests { } "#; - let RuleConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { + let 
RuleTypeConfig::Bitflip(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Biflip config"); }; let config = config.unwrap_or_default(); @@ -869,7 +908,7 @@ mod tests { } "#; - let RuleConfig::Regex(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Regex(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Regex config"); }; assert_eq!(config.pattern, "test"); @@ -889,7 +928,7 @@ mod tests { } "#; - let RuleConfig::Regex(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Regex(config) = serde_json::from_str(json).unwrap() else { panic!("Expected Regex config"); }; assert_eq!(config.pattern, "["); @@ -909,7 +948,7 @@ mod tests { } "#; - let RuleConfig::Cidr(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Cidr(config) = serde_json::from_str(json).unwrap() else { panic!("Expected CIDR config"); }; assert_eq!(config.address, "192.168.0.0/24"); @@ -929,7 +968,7 @@ mod tests { } "#; - let RuleConfig::Cidr(config) = serde_json::from_str(json).unwrap() else { + let RuleTypeConfig::Cidr(config) = serde_json::from_str(json).unwrap() else { panic!("Expected CIDR config"); }; assert_eq!(config.address, "test"); diff --git a/crates/stringsimile-config/src/rulesets.rs b/crates/stringsimile-config/src/rulesets.rs index 161be26..382d362 100644 --- a/crates/stringsimile-config/src/rulesets.rs +++ b/crates/stringsimile-config/src/rulesets.rs @@ -91,6 +91,8 @@ impl PreprocessorConfig { #[cfg(test)] mod tests { + use crate::rules::RuleTypeConfig; + use super::*; #[test] @@ -105,6 +107,7 @@ mod tests { "match_rules": [ { "rule_type": "levenshtein", + "exit_on_match": true, "values": { "maximum_distance": 3 } @@ -149,12 +152,21 @@ mod tests { assert_eq!("wikipedia", &set_1.string_match); assert_eq!(2, set_1.match_rules.len()); - let RuleConfig::Levenshtein(set_1_rule_1) = &set_1.match_rules[0] else { + let RuleConfig { + common, + rule_type: 
RuleTypeConfig::Levenshtein(set_1_rule_1), + } = &set_1.match_rules[0] + else { panic!("Expected levenshtein rule"); }; + assert!(common.exit_on_match); assert_eq!(3, set_1_rule_1.maximum_distance); - let RuleConfig::Jaro(set_1_rule_2) = &set_1.match_rules[1] else { + let RuleConfig { + common: _, + rule_type: RuleTypeConfig::Jaro(set_1_rule_2), + } = &set_1.match_rules[1] + else { panic!("Expected jaro rule"); }; assert_eq!(85.0, set_1_rule_2.match_percent_threshold); @@ -164,11 +176,19 @@ mod tests { assert_eq!("wikilearning", &set_2.string_match); assert_eq!(2, set_2.match_rules.len()); - let RuleConfig::Hamming(set_2_rule_1) = &set_2.match_rules[0] else { + let RuleConfig { + common: _, + rule_type: RuleTypeConfig::Hamming(set_2_rule_1), + } = &set_2.match_rules[0] + else { panic!("Expected hamming rule"); }; assert_eq!(3, set_2_rule_1.maximum_distance); - let RuleConfig::JaroWinkler(set_2_rule_2) = &set_2.match_rules[1] else { + let RuleConfig { + common: _, + rule_type: RuleTypeConfig::JaroWinkler(set_2_rule_2), + } = &set_2.match_rules[1] + else { panic!("Expected jaro winkler rule"); }; assert_eq!(85.0, set_2_rule_2.match_percent_threshold); diff --git a/crates/stringsimile-matcher/benches/rulesets.rs b/crates/stringsimile-matcher/benches/rulesets.rs index 0859e0f..1cf3483 100644 --- a/crates/stringsimile-matcher/benches/rulesets.rs +++ b/crates/stringsimile-matcher/benches/rulesets.rs @@ -2,7 +2,7 @@ use criterion::{Criterion, criterion_group, criterion_main}; use regex::Regex; use stringsimile_matcher::{ preprocessors::{Preprocessor, SplitTargetConfig}, - rule::IntoGenericMatcherRule, + rule::{GenericMatcherRule, IntoGenericMatcherRule}, rules::{ bitflip::BitflipRule, cidr::CidrRule, @@ -18,7 +18,7 @@ use stringsimile_matcher::{ regex::RegexRule, soundex::{SoundexRule, SoundexRuleType}, }, - ruleset::{RuleSet, StringGroup, StringGroupContext}, + ruleset::{CommonRuleConfig, RuleSet, StringGroup, StringGroupContext}, }; const INPUT_DATA: [&str; 100] = [ @@ 
-189,8 +189,8 @@ bench_ruleset! { preprocessors: vec![Preprocessor::SplitTarget(SplitTargetConfig { ignore_tld: false })], - rules: vec![ - Box::new(ConfusablesRule.into_generic_matcher()), + rules: [ + ConfusablesRule.into_generic_matcher().clone_dyn(), Box::new(LevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()), Box::new(DamerauLevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()), Box::new(HammingRule { maximum_distance: 5 }.into_generic_matcher()), @@ -206,7 +206,7 @@ bench_ruleset! { Box::new(BitflipRule::new_dns(target_str, true)), Box::new(RegexRule::new(Regex::new(target_str).unwrap())), Box::new(CidrRule::new("192.168.0.0/24".parse().unwrap())), - ] + ].into_iter().map(|r| (CommonRuleConfig::default(), r)).collect() }]) } } @@ -223,8 +223,8 @@ bench_ruleset! { preprocessors: vec![Preprocessor::SplitTarget(SplitTargetConfig { ignore_tld: true })], - rules: vec![ - Box::new(ConfusablesRule.into_generic_matcher()), + rules: [ + ConfusablesRule.into_generic_matcher().clone_dyn(), Box::new(LevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()), Box::new(DamerauLevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()), Box::new(HammingRule { maximum_distance: 5 }.into_generic_matcher()), @@ -240,7 +240,7 @@ bench_ruleset! { Box::new(BitflipRule::new_dns(target_str, true)), Box::new(RegexRule::new(Regex::new(target_str).unwrap())), Box::new(CidrRule::new("192.168.0.0/24".parse().unwrap())), - ] + ].into_iter().map(|r| (CommonRuleConfig::default(), r)).collect() }]) } } @@ -257,8 +257,8 @@ bench_ruleset! 
{ preprocessors: vec![Preprocessor::SplitTarget(SplitTargetConfig { ignore_tld: true })], - rules: vec![ - Box::new(ConfusablesRule.into_generic_matcher()), + rules: [ + ConfusablesRule.into_generic_matcher().clone_dyn(), Box::new(LevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()), Box::new(DamerauLevenshteinRule { maximum_distance: 5, ignore_mismatch_metadata: true }.into_generic_matcher()), Box::new(HammingRule { maximum_distance: 5 }.into_generic_matcher()), @@ -274,7 +274,7 @@ bench_ruleset! { Box::new(BitflipRule::new_dns(target_str, true)), Box::new(RegexRule::new(Regex::new(target_str).unwrap())), Box::new(CidrRule::new("192.168.0.0/24".parse().unwrap())), - ] + ].into_iter().map(|r| (CommonRuleConfig::default(), r)).collect() }]) } } diff --git a/crates/stringsimile-matcher/src/preprocessors.rs b/crates/stringsimile-matcher/src/preprocessors.rs index 0a915be..903ab10 100644 --- a/crates/stringsimile-matcher/src/preprocessors.rs +++ b/crates/stringsimile-matcher/src/preprocessors.rs @@ -50,6 +50,7 @@ impl Iterator for IgnoreLastIterator { } impl Preprocessor { + // TODO: Change to use `Cow` so preprocessors can generate their own stuff /// Processes iterator of input data producing another iterator with modified data pub fn process<'a>( &self, diff --git a/crates/stringsimile-matcher/src/ruleset.rs b/crates/stringsimile-matcher/src/ruleset.rs index b0d8552..e11d850 100644 --- a/crates/stringsimile-matcher/src/ruleset.rs +++ b/crates/stringsimile-matcher/src/ruleset.rs @@ -18,7 +18,7 @@ pub struct RuleSet { /// Preprocessors to apply to input strings before passing them to rules pub preprocessors: Vec, /// Rules to apply to this match - pub rules: Vec>, + pub rules: Vec<(CommonRuleConfig, Box)>, } impl Clone for RuleSet { @@ -27,11 +27,22 @@ impl Clone for RuleSet { name: self.name.clone(), string_match: self.string_match.clone(), preprocessors: self.preprocessors.clone(), - rules: self.rules.iter().map(|r| 
r.clone_dyn()).collect(), + rules: self + .rules + .iter() + .map(|(c, r)| (c.clone(), r.clone_dyn())) + .collect(), } } } +/// Common configuration for rules +#[derive(Clone, Default)] +pub struct CommonRuleConfig { + /// Whether match on this rule should result in early exit from ruleset + pub exit_on_match: bool, +} + /// String group #[derive(Clone)] pub struct StringGroup { @@ -94,7 +105,7 @@ impl StringGroupContext { metrics: rs .rules .iter() - .map(|rule| { + .map(|(_, rule)| { ( rule.name().to_string(), RuleMetrics::new(name, &rs.name, rule.name()), @@ -133,19 +144,28 @@ impl RuleSet { .fold(input, |acc, p| p.process(acc)); for it in input.enumerate() { - for rule in &self.rules { + for (config, rule) in &self.rules { let rule_metrics = context .metrics .get(rule.name()) .expect("Missing metrics for rule"); - self.generate_match( + let matched = self.generate_match( &mut matches, rule.deref(), it, rule_metrics, full_metadata_for_all, ); + + if matched && config.exit_on_match { + matches + .last_mut() + .expect("Last match not found after generating it") + .metadata + .insert("early_match_exit".to_string(), true.into()); + break; + } } } @@ -159,7 +179,7 @@ impl RuleSet { (index, part): (usize, &str), rule_metrics: &RuleMetrics, full_metadata_for_all: bool, - ) { + ) -> bool { match rule.match_rule_generic(part, &self.string_match, full_metadata_for_all) { Ok(mut result) => { if result.matched { @@ -167,6 +187,7 @@ impl RuleSet { } else { rule_metrics.misses.increment(1); } + let matched = result.matched; if result.matched || full_metadata_for_all { self.preprocessors .iter() @@ -175,10 +196,12 @@ impl RuleSet { } else { matches.push(result); } + matched } Err(err) => { rule_metrics.errors.increment(1); warn!(message = "Matcher failed", error = ?err); + false } } } @@ -275,13 +298,19 @@ mod tests { string_match: "www.test.com".to_string(), preprocessors: Vec::default(), rules: vec![ - Box::new(BitflipRule::new_dns("www.test.com", 
true).into_generic_matcher()), - Box::new( - LevenshteinRule { - maximum_distance: 3, - ignore_mismatch_metadata: false, - } - .into_generic_matcher(), + ( + Default::default(), + Box::new(BitflipRule::new_dns("www.test.com", true).into_generic_matcher()), + ), + ( + Default::default(), + Box::new( + LevenshteinRule { + maximum_distance: 3, + ignore_mismatch_metadata: false, + } + .into_generic_matcher(), + ), ), ], }], @@ -309,13 +338,19 @@ mod tests { ignore_tld: true, })], rules: vec![ - Box::new(BitflipRule::new_dns("test", true).into_generic_matcher()), - Box::new( - LevenshteinRule { - maximum_distance: 3, - ignore_mismatch_metadata: false, - } - .into_generic_matcher(), + ( + Default::default(), + Box::new(BitflipRule::new_dns("test", true).into_generic_matcher()), + ), + ( + Default::default(), + Box::new( + LevenshteinRule { + maximum_distance: 3, + ignore_mismatch_metadata: false, + } + .into_generic_matcher(), + ), ), ], }], diff --git a/distribution/rules/example.json b/distribution/rules/example.json index b2a2c6c..b2c58d3 100644 --- a/distribution/rules/example.json +++ b/distribution/rules/example.json @@ -20,6 +20,7 @@ }, { "rule_type": "hamming", + "exit_on_match": true, "values": { "maximum_distance": 3 } diff --git a/distribution/rules/example.jsonl b/distribution/rules/example.jsonl index c273820..33f0717 100644 --- a/distribution/rules/example.jsonl +++ b/distribution/rules/example.jsonl @@ -1,2 +1,2 @@ -{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", 
"values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip", "values": { "char_subset": "dns", "case_sensitive": true } }, { "rule_type": "regex", "values": { "pattern": "test" } }, { "rule_type": "cidr", "values": { "address": "192.168.0.0/24" } } ] } ] } +{ "name": "Example string group", "rule_sets": [ { "name": "Test rule set", "string_match": "test", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "hamming", "exit_on_match": true, "values": { "maximum_distance": 3 } }, { "rule_type": "soundex", "values": { "minimum_similarity": 3 } }, { "rule_type": "metaphone", "values": { "max_code_length": 3 } }, { "rule_type": "nysiis", "values": { "strict": true } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "jaro_winkler", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "confusables" }, { "rule_type": "match_rating" }, { "rule_type": "damerau_levenshtein", "values": { "maximum_distance": 3 } } ] }, { "name": "Example rule set", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "string_match": "example", "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 3 } }, { "rule_type": "jaro", "values": { "match_percent_threshold": 0.85 } }, { "rule_type": "bitflip", "values": { "char_subset": "dns", 
"case_sensitive": true } }, { "rule_type": "regex", "values": { "pattern": "test" } }, { "rule_type": "cidr", "values": { "address": "192.168.0.0/24" } } ] } ] } { "name": "Second string group", "rule_sets": [ { "name": "Second groups rule set", "string_match": "second", "preprocessors": [ { "preprocessor_type": "split_target", "ignore_tld": true } ], "match_rules": [ { "rule_type": "levenshtein", "values": { "maximum_distance": 10 } } ] } ] } diff --git a/doc/stringsimile-rule-config.5.scd b/doc/stringsimile-rule-config.5.scd index d615c88..322b006 100644 --- a/doc/stringsimile-rule-config.5.scd +++ b/doc/stringsimile-rule-config.5.scd @@ -87,6 +87,9 @@ Each rule set has the following keys: Each rule has the following keys: - rule_type One of "levenshtein", "jaro", "jaro_winkler", "confusables", "damerau_levenshtein", "hamming", "soundex", "metaphone", "nysiis", "match_rating", "bitflip", "regex", "cidr" +- exit_on_match + If set to true and this rule matches the input string, this rule set will stop + processing and no rules after this one will be checked. - values Object dependent on the rule_type used. Some rules don't have additional values and this field can be skipped for them.