From 4e52ee960a2a59323e9376eddb47cc4d72f0edc9 Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Tue, 16 Jul 2024 16:49:08 +0200 Subject: [PATCH 1/4] add an alternative API --- src/bytewise.rs | 13 +++++++++++-- src/bytewise/iter.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/src/bytewise.rs b/src/bytewise.rs index 3b300a3..3a089a5 100644 --- a/src/bytewise.rs +++ b/src/bytewise.rs @@ -16,8 +16,7 @@ use crate::utils::FromU32; use crate::{MatchKind, Output}; pub use builder::DoubleArrayAhoCorasickBuilder; use iter::{ - FindIterator, FindOverlappingIterator, FindOverlappingNoSuffixIterator, LestmostFindIterator, - U8SliceIterator, + FindIterator, FindOverlappingIterator, FindOverlappingNoSuffixIterator, LestmostFindIterator, OverlappingStepper, U8SliceIterator }; // The root index position. @@ -287,6 +286,16 @@ impl DoubleArrayAhoCorasick { } } + /// + pub fn overlapping_stepper(&self) -> OverlappingStepper { + OverlappingStepper { + pma: self, + state_id: ROOT_STATE_IDX, + output_pos: None, + pos: 0, + } + } + /// Returns an iterator of overlapping matches in the given haystack iterator. /// /// # Arguments diff --git a/src/bytewise/iter.rs b/src/bytewise/iter.rs index 58cae5d..fa94e27 100644 --- a/src/bytewise/iter.rs +++ b/src/bytewise/iter.rs @@ -84,6 +84,49 @@ where } } +/// In contrast to the iterator APIs, this one requires the caller to feed in bytes +/// and take out matches. +pub struct OverlappingStepper<'a, V> { + pub(crate) pma: &'a DoubleArrayAhoCorasick, + pub(crate) state_id: u32, + pub(crate) pos: usize, + pub(crate) output_pos: Option, +} + +impl<'a, V: Copy> OverlappingStepper<'a, V> { + /// + pub fn consume(&mut self, c: u8) { + // self.state_id is always smaller than self.pma.states.len() because + // self.pma.next_state_id_unchecked() ensures to return such a value. + self.state_id = unsafe { self.pma.next_state_id_unchecked(self.state_id, c) }; + self.output_pos = unsafe { + self.pma + .states + .get_unchecked(usize::from_u32(self.state_id)) + .output_pos() + }; + self.pos += 1; + } + + /// + pub fn next(&mut self) -> Option> { + let output_pos = self.output_pos?; + // output_pos.get() is always smaller than self.pma.outputs.len() because + // Output::parent() ensures to return such a value when it is Some. + let out = unsafe { + self.pma + .outputs + .get_unchecked(usize::from_u32(output_pos.get() - 1)) + }; + self.output_pos = out.parent(); + Some(Match { + length: usize::from_u32(out.length()), + end: self.pos, + value: out.value(), + }) + } +} + /// Iterator created by [`DoubleArrayAhoCorasick::find_overlapping_iter()`]. pub struct FindOverlappingIterator<'a, P, V> { pub(crate) pma: &'a DoubleArrayAhoCorasick, From 22f471532a25d90a320eae0902c759db2b8fe962 Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Tue, 16 Jul 2024 17:03:33 +0200 Subject: [PATCH 2/4] Update iter.rs --- src/bytewise/iter.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bytewise/iter.rs b/src/bytewise/iter.rs index fa94e27..c072a58 100644 --- a/src/bytewise/iter.rs +++ b/src/bytewise/iter.rs @@ -95,6 +95,7 @@ pub struct OverlappingStepper<'a, V> { impl<'a, V: Copy> OverlappingStepper<'a, V> { /// + #[inline(always)] pub fn consume(&mut self, c: u8) { // self.state_id is always smaller than self.pma.states.len() because // self.pma.next_state_id_unchecked() ensures to return such a value. @@ -109,6 +110,7 @@ impl<'a, V: Copy> OverlappingStepper<'a, V> { } /// + #[inline(always)] pub fn next(&mut self) -> Option> { let output_pos = self.output_pos?; // output_pos.get() is always smaller than self.pma.outputs.len() because From fdefb9c9b5302eb64d7c35b3790636096015cdac Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Wed, 14 Aug 2024 13:13:40 +0200 Subject: [PATCH 3/4] more barebone api --- src/bytewise.rs | 12 ++++-------- src/bytewise/iter.rs | 46 ++++++++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/bytewise.rs b/src/bytewise.rs index 3a089a5..12f0c9e 100644 --- a/src/bytewise.rs +++ b/src/bytewise.rs @@ -16,7 +16,8 @@ use crate::utils::FromU32; use crate::{MatchKind, Output}; pub use builder::DoubleArrayAhoCorasickBuilder; use iter::{ - FindIterator, FindOverlappingIterator, FindOverlappingNoSuffixIterator, LestmostFindIterator, OverlappingStepper, U8SliceIterator + FindIterator, FindOverlappingIterator, FindOverlappingNoSuffixIterator, LestmostFindIterator, + U8SliceIterator, }; // The root index position. @@ -287,13 +288,8 @@ impl DoubleArrayAhoCorasick { } /// - pub fn overlapping_stepper(&self) -> OverlappingStepper { - OverlappingStepper { - pma: self, - state_id: ROOT_STATE_IDX, - output_pos: None, - pos: 0, - } + pub fn start_state(&self) -> u32 { + ROOT_STATE_IDX } /// Returns an iterator of overlapping matches in the given haystack iterator. diff --git a/src/bytewise/iter.rs b/src/bytewise/iter.rs index c072a58..834522a 100644 --- a/src/bytewise/iter.rs +++ b/src/bytewise/iter.rs @@ -84,34 +84,46 @@ where } } -/// In contrast to the iterator APIs, this one requires the caller to feed in bytes -/// and take out matches. -pub struct OverlappingStepper<'a, V> { - pub(crate) pma: &'a DoubleArrayAhoCorasick, - pub(crate) state_id: u32, - pub(crate) pos: usize, - pub(crate) output_pos: Option, +/// Iterator returning all the matches at a given position. +pub struct OverlappingStepperIterator<'a, V> { + pma: &'a DoubleArrayAhoCorasick, + pos: usize, + output_pos: Option, } -impl<'a, V: Copy> OverlappingStepper<'a, V> { +impl DoubleArrayAhoCorasick { /// #[inline(always)] - pub fn consume(&mut self, c: u8) { + pub fn consume( + &mut self, + state_id: u32, + pos: usize, + c: u8, + ) -> (u32, OverlappingStepperIterator) { // self.state_id is always smaller than self.pma.states.len() because // self.pma.next_state_id_unchecked() ensures to return such a value. - self.state_id = unsafe { self.pma.next_state_id_unchecked(self.state_id, c) }; - self.output_pos = unsafe { - self.pma - .states - .get_unchecked(usize::from_u32(self.state_id)) + let state_id = unsafe { self.next_state_id_unchecked(state_id, c) }; + let output_pos = unsafe { + self.states + .get_unchecked(usize::from_u32(state_id)) .output_pos() }; - self.pos += 1; + ( + state_id, + OverlappingStepperIterator { + pma: self, + pos, + output_pos, + }, + ) } +} + +impl<'a, V: Copy> Iterator for OverlappingStepperIterator<'a, V> { + type Item = Match; - /// #[inline(always)] - pub fn next(&mut self) -> Option> { + fn next(&mut self) -> Option> { let output_pos = self.output_pos?; // output_pos.get() is always smaller than self.pma.outputs.len() because // Output::parent() ensures to return such a value when it is Some. From ac44a471a7be5a139535173073b8f1cd2e33bcbd Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Wed, 14 Aug 2024 13:20:33 +0200 Subject: [PATCH 4/4] make non-mutable --- src/bytewise/iter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bytewise/iter.rs b/src/bytewise/iter.rs index 834522a..5724683 100644 --- a/src/bytewise/iter.rs +++ b/src/bytewise/iter.rs @@ -95,7 +95,7 @@ impl DoubleArrayAhoCorasick { /// #[inline(always)] pub fn consume( - &mut self, + &self, state_id: u32, pos: usize, c: u8,