diff --git a/sds/benches/bench.rs b/sds/benches/bench.rs index 8061a846..dafe7f40 100644 --- a/sds/benches/bench.rs +++ b/sds/benches/bench.rs @@ -59,6 +59,14 @@ pub fn scoped_ruleset(c: &mut Criterion) { }); false } + + fn find_true_positive_rules_from_current_path( + &self, + sanitized_path: &str, + current_true_positive_rule_idx: &mut Vec, + ) -> usize { + 0 + } } fast_rule_set.visit_string_rule_combinations( diff --git a/sds/src/path.rs b/sds/src/path.rs index 61a963e6..79c11e55 100644 --- a/sds/src/path.rs +++ b/sds/src/path.rs @@ -110,10 +110,10 @@ impl<'a> PathSegment<'a> { } } - pub fn sanitize(&self) -> Option> { + pub fn sanitize(&'a self) -> Option> { if let PathSegment::Field(field) = self { match should_bypass_standardize_path(field) { - BypassStandardizePathResult::BypassAndAllLowercase => Some(field.clone()), + BypassStandardizePathResult::BypassAndAllLowercase => Some(Cow::Borrowed(field)), BypassStandardizePathResult::BypassAndAllUppercase => { Some(Cow::Owned(field.to_ascii_lowercase())) } diff --git a/sds/src/scanner/mod.rs b/sds/src/scanner/mod.rs index d32f2135..6755a0fc 100644 --- a/sds/src/scanner/mod.rs +++ b/sds/src/scanner/mod.rs @@ -17,6 +17,7 @@ use std::any::{Any, TypeId}; use std::sync::Arc; use self::metrics::ScannerMetrics; +use crate::proximity_keywords::{contains_keyword_in_path, CompiledIncludedProximityKeywords}; use crate::scanner::config::RuleConfig; use crate::scanner::regex_rule::compiled::RegexCompiledRule; use crate::scanner::regex_rule::{access_regex_caches, RegexCaches}; @@ -59,18 +60,18 @@ where pub trait CompiledRuleDyn: Send + Sync { fn get_match_action(&self) -> &MatchAction; fn get_scope(&self) -> &Scope; + fn get_included_keywords(&self) -> Option<&CompiledIncludedProximityKeywords>; #[allow(clippy::too_many_arguments)] fn get_string_matches( &self, content: &str, - path: &Path, regex_caches: &mut RegexCaches, group_data: &mut AHashMap>, exclusion_check: &ExclusionCheck<'_>, excluded_matches: &mut AHashSet, match_emitter: &mut dyn MatchEmitter, - should_keywords_match_event_paths: bool, + true_positive_rule_idx: &Vec, scanner_labels: &Labels, ); @@ -102,16 +103,19 @@ impl CompiledRuleDyn for T { self.get_scope() } + fn get_included_keywords(&self) -> Option<&CompiledIncludedProximityKeywords> { + self.get_included_keywords() + } + fn get_string_matches( &self, content: &str, - path: &Path, regex_caches: &mut RegexCaches, group_data: &mut AHashMap>, exclusion_check: &ExclusionCheck<'_>, excluded_matches: &mut AHashSet, match_emitter: &mut dyn MatchEmitter, - should_keywords_match_event_paths: bool, + true_positive_rule_idx: &Vec, scanner_labels: &Labels, ) { let group_data_any = group_data @@ -120,13 +124,12 @@ impl CompiledRuleDyn for T { let group_data: &mut T::GroupData = group_data_any.downcast_mut().unwrap(); self.get_string_matches( content, - path, regex_caches, group_data, exclusion_check, excluded_matches, match_emitter, - should_keywords_match_event_paths, + true_positive_rule_idx, ) } @@ -158,18 +161,18 @@ pub trait CompiledRule: Send + Sync { fn get_match_action(&self) -> &MatchAction; fn get_scope(&self) -> &Scope; + fn get_included_keywords(&self) -> Option<&CompiledIncludedProximityKeywords>; #[allow(clippy::too_many_arguments)] fn get_string_matches( &self, content: &str, - path: &Path, regex_caches: &mut RegexCaches, group_data: &mut Self::GroupData, exclusion_check: &ExclusionCheck<'_>, excluded_matches: &mut AHashSet, match_emitter: &mut dyn MatchEmitter, - should_keywords_match_event_paths: bool, + true_positive_rule_idx: &Vec, ); // Whether a match from this rule should be excluded (marked as a false-positive) @@ -615,7 +618,10 @@ impl ScannerBuilder<'_> { .map(|rule| rule.get_scope().clone()) .collect::>(), ) - .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards); + .with_implicit_index_wildcards(self.scanner_features.add_implicit_index_wildcards) + .with_keywords_should_match_event_paths( + self.scanner_features.should_keywords_match_event_paths, + ); { let stats = &*GLOBAL_STATS; @@ -652,6 +658,7 @@ impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> { content: &str, mut rule_visitor: crate::scoped_ruleset::RuleIndexVisitor, exclusion_check: ExclusionCheck<'b>, + true_positive_rule_idx: &Vec, ) -> bool { // matches for a single path let mut path_rules_matches = vec![]; @@ -678,15 +685,12 @@ impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> { rule.get_string_matches( content, - path, - self.regex_caches, + &mut self.regex_caches, &mut group_data, &exclusion_check, self.excluded_matches, &mut emitter, - self.scanner - .scanner_features - .should_keywords_match_event_paths, + true_positive_rule_idx, &self.scanner.labels, ); } @@ -718,6 +722,26 @@ impl<'a, E: Encoding> ContentVisitor<'a> for ScannerContentVisitor<'a, E> { has_match } + + fn find_true_positive_rules_from_current_path( + &self, + sanitized_path: &str, + current_true_positive_rule_idx: &mut Vec, + ) -> usize { + let mut times_pushed = 0; + for (idx, rule) in self.scanner.rules.iter().enumerate() { + if !current_true_positive_rule_idx.contains(&idx) { + if let Some(keywords) = rule.get_included_keywords() { + if contains_keyword_in_path(&sanitized_path, &keywords.keywords_pattern) { + // The rule is found has a true positive for this path, push it + current_true_positive_rule_idx.push(idx); + times_pushed += 1 + } + } + } + } + times_pushed + } } // Calculates the next starting position for a regex match if a the previous match is a false positive @@ -801,17 +825,22 @@ mod test { fn get_scope(&self) -> &Scope { &self.scope } + fn create_group_data(_: &Labels) {} + + fn get_included_keywords(&self) -> Option<&CompiledIncludedProximityKeywords> { + None + } + fn get_string_matches( &self, _content: &str, - _path: &Path, _regex_caches: &mut RegexCaches, _group_data: &mut Self::GroupData, _exclusion_check: &ExclusionCheck<'_>, _excluded_matches: &mut AHashSet, match_emitter: &mut dyn MatchEmitter, - _should_keywords_match_event_paths: bool, + _true_positive_rule_idx: &Vec, ) { match_emitter.emit(StringMatch { start: 10, end: 16 }); } diff --git a/sds/src/scanner/regex_rule/compiled.rs b/sds/src/scanner/regex_rule/compiled.rs index 276ad40a..1a7ab808 100644 --- a/sds/src/scanner/regex_rule/compiled.rs +++ b/sds/src/scanner/regex_rule/compiled.rs @@ -1,7 +1,7 @@ use crate::match_validation::config::{InternalMatchValidationType, MatchValidationType}; use crate::proximity_keywords::{ - contains_keyword_in_path, get_prefix_start, is_index_within_prefix, - CompiledExcludedProximityKeywords, CompiledIncludedProximityKeywords, + get_prefix_start, is_index_within_prefix, CompiledExcludedProximityKeywords, + CompiledIncludedProximityKeywords, }; use crate::scanner::metrics::RuleMetrics; use crate::scanner::regex_rule::regex_store::SharedRegex; @@ -9,7 +9,7 @@ use crate::scanner::regex_rule::RegexCaches; use crate::scanner::scope::Scope; use crate::scanner::{get_next_regex_start, is_false_positive_match}; use crate::secondary_validation::Validator; -use crate::{CompiledRule, ExclusionCheck, Labels, MatchAction, MatchEmitter, Path, StringMatch}; +use crate::{CompiledRule, ExclusionCheck, Labels, MatchAction, MatchEmitter, StringMatch}; use ahash::AHashSet; use regex_automata::meta::Cache; use regex_automata::Input; @@ -40,27 +40,29 @@ impl CompiledRule for RegexCompiledRule { &self.scope } fn create_group_data(_: &Labels) {} + fn get_included_keywords(&self) -> Option<&CompiledIncludedProximityKeywords> { + self.included_keywords.as_ref() + } + fn get_string_matches( &self, content: &str, - path: &Path, regex_caches: &mut RegexCaches, _group_data: &mut (), exclusion_check: &ExclusionCheck<'_>, excluded_matches: &mut AHashSet, match_emitter: &mut dyn MatchEmitter, - should_keywords_match_event_paths: bool, + true_positive_rule_idx: &Vec, ) { match self.included_keywords { Some(ref included_keywords) => { self.get_string_matches_with_included_keywords( content, - path, regex_caches, exclusion_check, excluded_matches, match_emitter, - should_keywords_match_event_paths, + true_positive_rule_idx, included_keywords, ); } @@ -108,31 +110,27 @@ impl RegexCompiledRule { fn get_string_matches_with_included_keywords( &self, content: &str, - path: &Path, regex_caches: &mut RegexCaches, exclusion_check: &ExclusionCheck<'_>, excluded_matches: &mut AHashSet, match_emitter: &mut dyn MatchEmitter, - should_keywords_match_event_paths: bool, + true_positive_rule_idx: &Vec, included_keywords: &CompiledIncludedProximityKeywords, ) { - if should_keywords_match_event_paths { - let sanitized_path = path.sanitize(); - if contains_keyword_in_path(&sanitized_path, &included_keywords.keywords_pattern) { - // since the path contains a match, we can skip future included keyword checks - let true_positive_search = self.true_positive_matches( - content, - 0, - regex_caches.get(&self.regex), - false, - exclusion_check, - excluded_matches, - ); - for string_match in true_positive_search { - match_emitter.emit(string_match); - } - return; + if !true_positive_rule_idx.is_empty() && true_positive_rule_idx.contains(&self.rule_index) { + // since the path contains a match, we can skip future included keyword checks + let true_positive_search = self.true_positive_matches( + content, + 0, + regex_caches.get(&self.regex), + false, + exclusion_check, + excluded_matches, + ); + for string_match in true_positive_search { + match_emitter.emit(string_match); } + return; } let mut included_keyword_matches = included_keywords.keyword_matches(content); diff --git a/sds/src/scoped_ruleset/mod.rs b/sds/src/scoped_ruleset/mod.rs index 8ee3d0e4..f6d926ac 100644 --- a/sds/src/scoped_ruleset/mod.rs +++ b/sds/src/scoped_ruleset/mod.rs @@ -1,6 +1,7 @@ mod bool_set; use crate::event::{EventVisitor, VisitStringResult}; +use crate::proximity_keywords::UNIFIED_LINK_STR; use crate::scanner::scope::Scope; use crate::scoped_ruleset::bool_set::BoolSet; use crate::{Event, Path, PathSegment}; @@ -15,6 +16,7 @@ pub struct ScopedRuleSet { // The number of rules stored in this set num_rules: usize, add_implicit_index_wildcards: bool, + should_keywords_match_event_paths: bool, } impl ScopedRuleSet { @@ -45,6 +47,7 @@ impl ScopedRuleSet { tree, num_rules: rules_scopes.len(), add_implicit_index_wildcards: false, + should_keywords_match_event_paths: false, } } @@ -53,6 +56,11 @@ impl ScopedRuleSet { self } + pub fn with_keywords_should_match_event_paths(mut self, value: bool) -> Self { + self.should_keywords_match_event_paths = value; + self + } + pub fn visit_string_rule_combinations<'path, 'c: 'path>( &'c self, event: &'path mut impl Event, @@ -78,6 +86,7 @@ impl ScopedRuleSet { path: Path::root(), bool_set, add_implicit_index_wildcards: self.add_implicit_index_wildcards, + should_keywords_match_event_paths: self.should_keywords_match_event_paths, }; event.visit_event(&mut visitor) @@ -113,7 +122,14 @@ pub trait ContentVisitor<'path> { content: &str, rules: RuleIndexVisitor, is_excluded: ExclusionCheck<'content_visitor>, + true_positive_rule_idx: &Vec, ) -> bool; + + fn find_true_positive_rules_from_current_path( + &self, + sanitized_path: &str, + current_true_positive_rule_idx: &mut Vec, + ) -> usize; } // This is just a reference to a RuleTree with some additional information @@ -162,6 +178,7 @@ struct ScopedRuledSetEventVisitor<'a, C> { bool_set: Option, add_implicit_index_wildcards: bool, + should_keywords_match_event_paths: bool, } impl<'path, C> EventVisitor<'path> for ScopedRuledSetEventVisitor<'path, C> @@ -199,21 +216,46 @@ where // Sanitize the segment and push it self.sanitized_segments_until_node.push(segment.sanitize()); + let true_positive_rules_count = if self.should_keywords_match_event_paths { + let current_sanitized_path = self + .sanitized_segments_until_node + .iter() + .filter_map(|sanitized_segment| { + sanitized_segment + .as_ref() + .map_or(None::>, |x| Some(x.clone())) + }) + .collect::>() + .join(UNIFIED_LINK_STR); + self.content_visitor + .find_true_positive_rules_from_current_path( + current_sanitized_path.as_str(), + &mut self.true_positive_rule_idx, + ) + } else { + 0 + }; + // The new number of active trees is the number of new trees pushed self.active_node_counter.push(NodeCounter { active_tree_count: self.tree_nodes.len() - tree_nodes_len, - true_positive_rules_count: 0, + true_positive_rules_count, }); self.path.segments.push(segment); } fn pop_segment(&mut self) { - let num_active_trees = self.active_node_counter.pop().unwrap().active_tree_count; - for _ in 0..num_active_trees { + let node_counter = self.active_node_counter.pop().unwrap(); + for _ in 0..node_counter.active_tree_count { // The rules from the last node are no longer active, so remove them. let _popped = self.tree_nodes.pop(); } + for _ in 0..node_counter.true_positive_rules_count { + // The true positive rule indices from the last node are no longer active, remove them. + let _popped = self.true_positive_rule_idx.pop(); + } + // Pop the sanitized segment self.sanitized_segments_until_node.pop(); self.path.segments.pop(); } @@ -229,6 +271,7 @@ where ExclusionCheck { tree_nodes: &self.tree_nodes, }, + &self.true_positive_rule_idx, ); if let Some(bool_set) = &mut self.bool_set { bool_set.reset(); @@ -367,6 +410,7 @@ mod test { content: &str, mut rule_iter: RuleIndexVisitor, exclusion_check: ExclusionCheck<'content_visitor>, + true_positive_rule_idx: &Vec, ) -> bool { let mut rules = vec![]; rule_iter.visit_rule_indices(|rule_index| { @@ -383,6 +427,14 @@ mod test { }); true } + + fn find_true_positive_rules_from_current_path( + &self, + sanitized_segments: &str, + current_true_positive_rule_idx: &mut Vec, + ) -> usize { + 0 + } } ruleset.visit_string_rule_combinations(