Skip to content

Commit 45fb70e

Browse files
committed
fix: fix possible duplicated matches
If a single variable produces multiple literals from which the same atoms are extracted, multiple matches can be wrongly generated for this variable. This is fixed by a uniqueness check before building the Aho-Corasick automaton.
1 parent cf2b7a4 commit 45fb70e

File tree

2 files changed

+44
-1
lines changed

2 files changed

+44
-1
lines changed

boreal/src/scanner/ac_scan.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//! Provides the [`AcScan`] object, used to scan for all variables in a single AC pass.
22
use std::collections::hash_map::Entry;
3-
use std::collections::HashMap;
3+
use std::collections::{HashMap, HashSet};
44

55
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, AhoCorasickKind};
66

@@ -63,6 +63,8 @@ impl AcScan {
6363
if var.matcher.literals.is_empty() {
6464
non_handled_var_indexes.push(variable_index);
6565
} else {
66+
let mut known_literals_of_var = HashSet::new();
67+
6668
for (literal_index, lit) in var.matcher.literals.iter().enumerate() {
6769
let (start, end) = pick_atom_in_literal(lit);
6870
let mut atom = lit[start..(lit.len() - end)].to_vec();
@@ -72,6 +74,20 @@ impl AcScan {
7274
slice_offset: (start, end),
7375
};
7476

77+
// Sometimes, two literals of the same variable can provide the same atom.
78+
// This can happen if the two literals are identical (for example, someone
79+
// like me writing a test on `/(abc|abc)/`), or if the literals are
80+
// different but contain the same atom (for example,
81+
// `{ ( 00 AB CD | AB CD 00 ) }`).
82+
//
83+
// In those cases, we must *not* use the same atom twice in the Aho-Corasick,
84+
// as this would result in two identical matches for the same variable.
85+
// To prevent this, a set is used here. Both the atom itself and its position
86+
// in the literal are important.
87+
if !known_literals_of_var.insert((atom.clone(), start)) {
88+
continue;
89+
}
90+
7591
// Ensure the literals provided to the aho corasick are not
7692
// duplicated. If multiple variables use the same atoms,
7793
// we will iterate on every variable in this module, instead

boreal/tests/it/regex.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,3 +252,30 @@ fn test_regex_size() {
252252
// Full greedy validator
253253
check("/a+ abcd a{0,2977952116}/");
254254
}
255+
256+
#[test]
257+
fn test_regex_same_alternative() {
258+
let mut checker = Checker::new(
259+
r#"
260+
rule a {
261+
strings:
262+
$a = /a(bcd|bcd)e/
263+
condition:
264+
#a == 1
265+
}"#,
266+
);
267+
checker.check(b"0abcdef", true);
268+
269+
let mut checker = Checker::new(
270+
r#"
271+
rule a {
272+
strings:
273+
$a = /(\x00abcd|abcd\x00)/
274+
condition:
275+
#a == 1
276+
}"#,
277+
);
278+
checker.check(b"<\x00abcd>", true);
279+
checker.check(b"<abcd\x00>", true);
280+
checker.check(b"<\x00abcd\x00>", false);
281+
}

0 commit comments

Comments
 (0)