@@ -167,6 +167,74 @@ impl Default for Utf8CaseInsensitiveNeedleMetadata {
167167 }
168168}
169169
170+ /// Pre-compiled case-insensitive search pattern for UTF-8 strings.
171+ ///
172+ /// Caches metadata for efficient repeated searches with the same needle.
173+ /// Useful when searching multiple haystacks for the same pattern.
174+ ///
175+ /// # Examples
176+ ///
177+ /// ```
178+ /// use stringzilla::stringzilla::{utf8_case_insensitive_find, Utf8CaseInsensitiveNeedle};
179+ ///
180+ /// let needle = Utf8CaseInsensitiveNeedle::new(b"hello");
181+ /// let haystack1 = b"Hello World";
182+ /// let haystack2 = b"HELLO there";
183+ ///
184+ /// // Metadata is computed once on first search, reused for subsequent searches
185+ /// let result1 = utf8_case_insensitive_find(haystack1, &needle);
186+ /// let result2 = utf8_case_insensitive_find(haystack2, &needle);
187+ ///
188+ /// assert!(result1.is_some());
189+ /// assert!(result2.is_some());
190+ /// ```
191+ pub struct Utf8CaseInsensitiveNeedle < ' a > {
192+ needle : & ' a [ u8 ] ,
193+ metadata : UnsafeCell < Utf8CaseInsensitiveNeedleMetadata > ,
194+ }
195+
196+ impl < ' a > Utf8CaseInsensitiveNeedle < ' a > {
197+ /// Creates a new pre-compiled case-insensitive needle.
198+ ///
199+ /// The metadata will be computed lazily on first use.
200+ #[ inline]
201+ pub fn new ( needle : & ' a [ u8 ] ) -> Self {
202+ Self {
203+ needle,
204+ metadata : UnsafeCell :: new ( Utf8CaseInsensitiveNeedleMetadata :: default ( ) ) ,
205+ }
206+ }
207+
208+ /// Returns the needle bytes.
209+ #[ inline]
210+ pub fn as_bytes ( & self ) -> & [ u8 ] {
211+ self . needle
212+ }
213+
214+ /// Returns the length of the needle in bytes.
215+ #[ inline]
216+ pub fn len ( & self ) -> usize {
217+ self . needle . len ( )
218+ }
219+
220+ /// Returns true if the needle is empty.
221+ #[ inline]
222+ pub fn is_empty ( & self ) -> bool {
223+ self . needle . is_empty ( )
224+ }
225+
226+ /// Internal: returns a mutable pointer to the metadata for FFI calls.
227+ #[ inline]
228+ pub ( crate ) fn metadata_ptr ( & self ) -> * mut Utf8CaseInsensitiveNeedleMetadata {
229+ self . metadata . get ( )
230+ }
231+ }
232+
233+ // Safety: The metadata is only mutated through FFI during search operations,
234+ // which internally synchronize access. The needle reference is immutable.
235+ unsafe impl < ' a > Send for Utf8CaseInsensitiveNeedle < ' a > { }
236+ unsafe impl < ' a > Sync for Utf8CaseInsensitiveNeedle < ' a > { }
237+
170238/// Incremental hasher state for StringZilla's 64-bit hash.
171239///
172240/// Use `Hasher::new(seed)` to construct, then call `update(&mut self, data)`
@@ -319,6 +387,7 @@ impl<T: AsRef<[u8]>> From<T> for Byteset {
319387 }
320388}
321389
390+ use core:: cell:: UnsafeCell ;
322391use core:: cmp:: Ordering ;
323392use core:: ffi:: { c_char, c_void, CStr } ;
324393use core:: fmt:: { self , Write } ;
@@ -994,6 +1063,8 @@ where
9941063///
9951064/// # Examples
9961065///
1066+ /// Basic usage with string slices:
1067+ ///
9971068/// ```
9981069/// use stringzilla::stringzilla as sz;
9991070/// let haystack = "Hello WORLD";
@@ -1003,32 +1074,87 @@ where
10031074/// }
10041075/// ```
10051076///
1077+ /// With a pre-compiled needle for repeated searches:
1078+ ///
1079+ /// ```
1080+ /// use stringzilla::stringzilla::{utf8_case_insensitive_find, Utf8CaseInsensitiveNeedle};
1081+ ///
1082+ /// let needle = Utf8CaseInsensitiveNeedle::new(b"hello");
1083+ ///
1084+ /// // Metadata is computed once, reused for subsequent searches
1085+ /// let result1 = utf8_case_insensitive_find(b"Hello World", &needle);
1086+ /// let result2 = utf8_case_insensitive_find(b"HELLO there", &needle);
1087+ ///
1088+ /// assert_eq!(result1, Some((0, 5)));
1089+ /// assert_eq!(result2, Some((0, 5)));
1090+ /// ```
1091+ ///
10061092pub fn utf8_case_insensitive_find < H , N > ( haystack : H , needle : N ) -> Option < ( usize , usize ) >
10071093where
10081094 H : AsRef < [ u8 ] > ,
1009- N : AsRef < [ u8 ] > ,
1095+ N : Utf8CaseInsensitiveNeedleArg ,
10101096{
1011- let haystack_ref = haystack. as_ref ( ) ;
1012- let needle_ref = needle. as_ref ( ) ;
1013- let mut matched_length: usize = 0 ;
1097+ needle. find_case_insensitive_in ( haystack. as_ref ( ) )
1098+ }
10141099
1015- let mut needle_metadata = Utf8CaseInsensitiveNeedleMetadata :: default ( ) ;
1016- let result = unsafe {
1017- sz_utf8_case_insensitive_find (
1018- haystack_ref. as_ptr ( ) as * const c_void ,
1019- haystack_ref. len ( ) ,
1020- needle_ref. as_ptr ( ) as * const c_void ,
1021- needle_ref. len ( ) ,
1022- & mut needle_metadata,
1023- & mut matched_length,
1024- )
1025- } ;
/// A value that can serve as the needle of a case-insensitive UTF-8 search.
///
/// Implemented in this file for:
/// - every `T: AsRef<[u8]>` (strings, byte slices, etc.), which searches with a
///   freshly default-initialized metadata value on each call;
/// - `&Utf8CaseInsensitiveNeedle`, which passes the metadata stored in the needle
///   so it can be shared across searches.
pub trait Utf8CaseInsensitiveNeedleArg {
    /// Searches `haystack` for this needle, ignoring case.
    ///
    /// Returns `Some((offset, matched_length))` for a match, where `offset` is the
    /// byte position in `haystack`, or `None` when nothing matches.
    fn find_case_insensitive_in(self, haystack: &[u8]) -> Option<(usize, usize)>;
}
10261109
1027- if result. is_null ( ) {
1028- None
1029- } else {
1030- let offset = unsafe { result. offset_from ( haystack_ref. as_ptr ( ) as * const c_void ) } ;
1031- Some ( ( offset as usize , matched_length) )
1110+ impl < T : AsRef < [ u8 ] > > Utf8CaseInsensitiveNeedleArg for T {
1111+ fn find_case_insensitive_in ( self , haystack : & [ u8 ] ) -> Option < ( usize , usize ) > {
1112+ let needle_ref = self . as_ref ( ) ;
1113+ let mut matched_length: usize = 0 ;
1114+ let mut needle_metadata = Utf8CaseInsensitiveNeedleMetadata :: default ( ) ;
1115+
1116+ let result = unsafe {
1117+ sz_utf8_case_insensitive_find (
1118+ haystack. as_ptr ( ) as * const c_void ,
1119+ haystack. len ( ) ,
1120+ needle_ref. as_ptr ( ) as * const c_void ,
1121+ needle_ref. len ( ) ,
1122+ & mut needle_metadata,
1123+ & mut matched_length,
1124+ )
1125+ } ;
1126+
1127+ if result. is_null ( ) {
1128+ None
1129+ } else {
1130+ let offset = unsafe { result. offset_from ( haystack. as_ptr ( ) as * const c_void ) } ;
1131+ Some ( ( offset as usize , matched_length) )
1132+ }
1133+ }
1134+ }
1135+
1136+ impl < ' a , ' b > Utf8CaseInsensitiveNeedleArg for & ' b Utf8CaseInsensitiveNeedle < ' a > {
1137+ fn find_case_insensitive_in ( self , haystack : & [ u8 ] ) -> Option < ( usize , usize ) > {
1138+ let needle_bytes = self . as_bytes ( ) ;
1139+ let mut matched_length: usize = 0 ;
1140+
1141+ let result = unsafe {
1142+ sz_utf8_case_insensitive_find (
1143+ haystack. as_ptr ( ) as * const c_void ,
1144+ haystack. len ( ) ,
1145+ needle_bytes. as_ptr ( ) as * const c_void ,
1146+ needle_bytes. len ( ) ,
1147+ & mut * self . metadata_ptr ( ) ,
1148+ & mut matched_length,
1149+ )
1150+ } ;
1151+
1152+ if result. is_null ( ) {
1153+ None
1154+ } else {
1155+ let offset = unsafe { result. offset_from ( haystack. as_ptr ( ) as * const c_void ) } ;
1156+ Some ( ( offset as usize , matched_length) )
1157+ }
10321158 }
10331159}
10341160
0 commit comments