feat: use u16 instead of usize for the offset representation in Atom, limiting a pattern to u16::MAX bytes and u16::MAX atoms per branch

WolverinDEV · WolverinDEV · commit 160a924682bc · 2024-12-05T15:26:55.000+09:00
diff --git a/bmatcher-core/src/atom.rs b/bmatcher-core/src/atom.rs
@@ -16,12 +16,12 @@ pub enum ReadWidth {
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub enum Atom {
     /// Match a sequence of bytes from the sequence array.
-    ByteSequence { seq_start: usize, seq_end: usize },
+    ByteSequence { seq_start: u16, seq_end: u16 },
 
     /// Skip a fixed number of bytes.
-    WildcardFixed(usize),
+    WildcardFixed(u16),
     /// Skip a variable number of bytes.
-    WildcardRange { min: usize, max: usize },
+    WildcardRange { min: u16, max: u16 },
 
     /// Jump to the relative / absolute based on the binary data the current cursor location.
     Jump(JumpType),
@@ -34,16 +34,16 @@ pub enum Atom {
     /// and then continue where we left of.
     Branch {
         /// Length of the left subpattern
-        left_len: usize,
+        left_len: u16,
 
         /// Length of the right subpattern
-        right_len: usize,
+        right_len: u16,
     },
 
     /// Push the cursor location to the cursor stack
     CursorPush,
     /// Pop the cursor location from the cursor stack and advance by X bytes
-    CursorPop { advance: usize },
+    CursorPop { advance: u16 },
 
     /// Save the current cursor position to the save stack
     SaveCursor,
diff --git a/bmatcher-core/src/compiler/optimizer.rs b/bmatcher-core/src/compiler/optimizer.rs
@@ -41,8 +41,8 @@ impl Optimizer {
 
             branches.push(Branch {
                 atom_index: index,
-                right_index: index + 1 + *left_len,
-                end_index: index + 1 + *left_len + *right_len,
+                right_index: index + 1 + *left_len as usize,
+                end_index: index + 1 + *left_len as usize + *right_len as usize,
             });
         }
 
@@ -83,8 +83,8 @@ impl Optimizer {
     fn fixup_branches(&mut self) {
         for branch in self.branches.iter() {
             self.atoms[branch.atom_index] = Atom::Branch {
-                left_len: branch.right_index - branch.atom_index - 1,
-                right_len: branch.end_index - branch.right_index,
+                left_len: (branch.right_index - branch.atom_index - 1) as u16,
+                right_len: (branch.end_index - branch.right_index) as u16,
             };
         }
     }
diff --git a/bmatcher-core/src/compiler/parser.rs b/bmatcher-core/src/compiler/parser.rs
@@ -26,6 +26,8 @@ pub enum ParseError {
 
     RangeBoundInvalid(ParseIntError),
     RangeEndMustBeGraterThenStart,
+
+    SequenceTooLarge,
 }
 
 pub struct PatternParser<'a> {
@@ -120,10 +122,10 @@ impl<'a> PatternParser<'a> {
             ));
         };
 
+        let token_range = self.lexer.token_range();
         let bytes_start = self.byte_sequence.len();
         let mut values = value.char_indices();
         while let Some((upper_index, upper)) = values.next() {
-            let token_range = self.lexer.token_range();
             let Some((lower_index, lower)) = values.next() else {
                 /* byte sequence must always be a multiple of 2 */
                 return Err(PositionedError::new(
@@ -155,9 +157,16 @@ impl<'a> PatternParser<'a> {
         }
 
         let bytes_end = self.byte_sequence.len();
+        if bytes_start > u16::MAX as usize || bytes_end > u16::MAX as usize {
+            return Err(PositionedError::new(
+                token_range,
+                ParseError::SequenceTooLarge,
+            ));
+        }
+
         self.atoms.push(Atom::ByteSequence {
-            seq_start: bytes_start,
-            seq_end: bytes_end,
+            seq_start: bytes_start as u16,
+            seq_end: bytes_end as u16,
         });
 
         Ok(())
@@ -270,8 +279,15 @@ impl<'a> PatternParser<'a> {
             }
 
             let left_branch_len = self.atoms.len() - branch_atom_index - 1;
+            if left_branch_len > u16::MAX as usize {
+                return Err(PositionedError::new(
+                    self.lexer.token_range(),
+                    ParseError::SequenceTooLarge,
+                ));
+            }
+
             if let Atom::Branch { left_len, .. } = &mut self.atoms[branch_atom_index] {
-                *left_len = left_branch_len;
+                *left_len = left_branch_len as u16;
             } else {
                 unreachable!("atom should be a branch");
             }
@@ -286,7 +302,15 @@ impl<'a> PatternParser<'a> {
                 right_len,
             } = &mut self.atoms[branch_atom_index]
             {
-                *right_len = atom_count - *left_len - branch_atom_index - 1;
+                let right_branch_len = atom_count - *left_len as usize - branch_atom_index - 1;
+                if right_branch_len > u16::MAX as usize {
+                    return Err(PositionedError::new(
+                        self.lexer.token_range(),
+                        ParseError::SequenceTooLarge,
+                    ));
+                }
+
+                *right_len = right_branch_len as u16;
             } else {
                 unreachable!("atom should be a branch");
             }
@@ -310,7 +334,7 @@ impl<'a> PatternParser<'a> {
             ));
         };
 
-        let range_start = range_start.parse::<usize>().map_err(|err| {
+        let range_start = range_start.parse::<u16>().map_err(|err| {
             PositionedError::new(self.lexer.token_range(), ParseError::RangeBoundInvalid(err))
         })?;
 
@@ -327,7 +351,7 @@ impl<'a> PatternParser<'a> {
                     ));
                 };
 
-                let range_end = range_end.parse::<usize>().map_err(|err| {
+                let range_end = range_end.parse::<u16>().map_err(|err| {
                     PositionedError::new(
                         self.lexer.token_range(),
                         ParseError::RangeBoundInvalid(err),
diff --git a/bmatcher-core/src/matcher.rs b/bmatcher-core/src/matcher.rs
@@ -44,7 +44,8 @@ impl<'a> BinaryMatcher<'a> {
         while atom_cursor < atoms.len() {
             match atoms[atom_cursor] {
                 Atom::ByteSequence { seq_start, seq_end } => {
-                    let expected_bytes = &self.pattern.byte_sequence()[seq_start..seq_end];
+                    let expected_bytes =
+                        &self.pattern.byte_sequence()[seq_start as usize..seq_end as usize];
                     let actual_bytes = self.target.subrange(data_cursor, expected_bytes.len())?;
 
                     if expected_bytes != actual_bytes {
@@ -56,7 +57,7 @@ impl<'a> BinaryMatcher<'a> {
                 }
                 Atom::WildcardFixed(length) => {
                     atom_cursor += 1;
-                    data_cursor += length;
+                    data_cursor += length as usize;
                 }
                 Atom::WildcardRange { min, max } => {
                     let save_stack_size = self.save_stack.len();
@@ -65,8 +66,8 @@ impl<'a> BinaryMatcher<'a> {
                     for offset in min..=max {
                         self.save_stack.truncate(save_stack_size);
                         self.cursor_stack.truncate(cursor_stack_size);
-                        if let Some(data_cursor) =
-                            self.match_atoms(data_cursor + offset, &atoms[atom_cursor + 1..])
+                        if let Some(data_cursor) = self
+                            .match_atoms(data_cursor + offset as usize, &atoms[atom_cursor + 1..])
                         {
                             /* match :) */
                             return Some(data_cursor);
@@ -81,7 +82,7 @@ impl<'a> BinaryMatcher<'a> {
                     atom_cursor += 1;
                 }
                 Atom::CursorPop { advance } => {
-                    data_cursor = self.cursor_stack.pop().unwrap() + advance;
+                    data_cursor = self.cursor_stack.pop().unwrap() + advance as usize;
                     atom_cursor += 1;
                 }
 
@@ -94,7 +95,7 @@ impl<'a> BinaryMatcher<'a> {
 
                     data_cursor = if let Some(data_cursor) = self.match_atoms(
                         data_cursor,
-                        &atoms[atom_cursor + 1..atom_cursor + 1 + left_len],
+                        &atoms[atom_cursor + 1..atom_cursor + 1 + left_len as usize],
                     ) {
                         /* match for left hand side */
                         data_cursor
@@ -105,12 +106,12 @@ impl<'a> BinaryMatcher<'a> {
 
                         self.match_atoms(
                             data_cursor,
-                            &atoms[atom_cursor + 1 + left_len
-                                ..atom_cursor + 1 + left_len + right_len],
+                            &atoms[atom_cursor + 1 + left_len as usize
+                                ..atom_cursor + 1 + left_len as usize + right_len as usize],
                         )?
                     };
 
-                    atom_cursor += 1 + left_len + right_len;
+                    atom_cursor += 1 + left_len as usize + right_len as usize;
                 }
 
                 Atom::Jump(mode) => {

Original file line number	Diff line number	Diff line change
`@@ -41,8 +41,8 @@ impl Optimizer {`
`41`	`41`
`42`	`42`	`branches.push(Branch {`
`43`	`43`	`atom_index: index,`
`44`		`- right_index: index + 1 + *left_len,`
`45`		`- end_index: index + 1 + *left_len + *right_len,`
	`44`	`+ right_index: index + 1 + *left_len as usize,`
	`45`	`+ end_index: index + 1 + *left_len as usize + *right_len as usize,`
`46`	`46`	`});`
`47`	`47`	`}`
`48`	`48`
`@@ -83,8 +83,8 @@ impl Optimizer {`
`83`	`83`	`fn fixup_branches(&mut self) {`
`84`	`84`	`for branch in self.branches.iter() {`
`85`	`85`	`self.atoms[branch.atom_index] = Atom::Branch {`
`86`		`- left_len: branch.right_index - branch.atom_index - 1,`
`87`		`- right_len: branch.end_index - branch.right_index,`
	`86`	`+ left_len: (branch.right_index - branch.atom_index - 1) as u16,`
	`87`	`+ right_len: (branch.end_index - branch.right_index) as u16,`
`88`	`88`	`};`
`89`	`89`	`}`
`90`	`90`	`}`