Skip to content

Commit 45e3c92

Browse files
committed
Add: Reusable case-insensitive needles with metadata for Rust & C++
1 parent 999ec64 commit 45e3c92

File tree

2 files changed

+209
-20
lines changed

2 files changed

+209
-20
lines changed

include/stringzilla/stringzilla.hpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ template <typename>
6161
class basic_string_slice;
6262
template <typename, typename>
6363
class basic_string;
64+
template <typename>
65+
class utf8_case_insensitive_needle;
6466

6567
using string_span = basic_string_slice<char>;
6668
using string_view = basic_string_slice<char const>;
@@ -1664,6 +1666,52 @@ struct concatenation {
16641666

16651667
#pragma endregion
16661668

1669+
#pragma region Case-Insensitive Search Pattern
1670+
1671+
/**
1672+
* @brief Pre-compiled case-insensitive search pattern for UTF-8 strings.
1673+
*
1674+
* Caches metadata for efficient repeated searches with the same needle. Non-owning: only the pointer and length are stored, so the needle bytes must outlive this object.
1675+
* Useful when searching multiple haystacks for the same pattern.
1676+
*
1677+
* @code{.cpp}
1678+
* sz::utf8_case_insensitive_needle pattern("hello");
1679+
* for (auto const& haystack : haystacks) {
1680+
* auto match = haystack.utf8_case_insensitive_find(pattern);
1681+
* if (match) { ... }
1682+
* }
1683+
* @endcode
1684+
*
1685+
* @tparam char_type_ The character type, usually `char const` or `char`; must be one byte wide (enforced by the static_assert below).
1686+
*/
1687+
template <typename char_type_ = char const>
1688+
class utf8_case_insensitive_needle {
1689+
static_assert(sizeof(char_type_) == 1, "Characters must be a single byte long");
1690+
1691+
using char_type = char_type_;
1692+
1693+
char_type *needle_;
1694+
std::size_t length_;
1695+
mutable sz_utf8_case_insensitive_needle_metadata_t metadata_; // NOTE(review): declared mutable, yet metadata_ref() only hands out a const reference - confirm how the search routine is meant to update it
1696+
1697+
public:
1698+
utf8_case_insensitive_needle(char_type *needle, std::size_t length) noexcept
1699+
: needle_(needle), length_(length), metadata_ {} {} // metadata starts value-initialized
1700+
1701+
utf8_case_insensitive_needle(basic_string_slice<char_type> needle) noexcept
1702+
: needle_(needle.data()), length_(needle.size()), metadata_ {} {}
1703+
1704+
template <std::size_t length_>
1705+
utf8_case_insensitive_needle(char_type (&needle)[length_]) noexcept
1706+
: needle_(needle), length_(length_ - 1), metadata_ {} {} // stores the array extent minus one; assumes the last element is the NUL terminator of a string literal
1707+
1708+
char_type *data() const noexcept { return needle_; }
1709+
std::size_t size() const noexcept { return length_; }
1710+
sz_utf8_case_insensitive_needle_metadata_t const &metadata_ref() const noexcept { return metadata_; }
1711+
};
1712+
1713+
#pragma endregion
1714+
16671715
#pragma region String Views and Spans
16681716

16691717
/**
@@ -2333,6 +2381,21 @@ class basic_string_slice {
23332381
return {static_cast<size_type>(ptr - start_), match_length};
23342382
}
23352383

2384+
/**
2385+
* @brief Find the byte offset of the first occurrence of a pre-compiled case-insensitive pattern.
2386+
* @param[in] needle A pre-compiled pattern; its data, size and metadata address are forwarded to `sz_utf8_case_insensitive_find`.
2387+
* @return Match info with offset and length, or @c npos offset with a zero length if not found.
2388+
*/
2389+
template <typename needle_char_type_>
2390+
sized_match_t utf8_case_insensitive_find(
2391+
utf8_case_insensitive_needle<needle_char_type_> const &needle) const noexcept {
2392+
sz_size_t match_length = 0;
2393+
auto ptr = sz_utf8_case_insensitive_find(start_, length_, needle.data(), needle.size(), &needle.metadata_ref(),
2394+
&match_length);
2395+
if (!ptr) return {npos, static_cast<size_type>(0)}; // a null result is reported as "not found"
2396+
return {static_cast<size_type>(ptr - start_), match_length}; // offset is measured from the start of this slice
2397+
}
2398+
23362399
/**
23372400
* @brief Iterate over UTF-8 characters (codepoints) in the string.
23382401
* @return A range view over UTF-32 codepoints decoded from UTF-8 bytes.

rust/stringzilla.rs

Lines changed: 146 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,74 @@ impl Default for Utf8CaseInsensitiveNeedleMetadata {
167167
}
168168
}
169169

170+
/// Pre-compiled case-insensitive search pattern for UTF-8 strings.
171+
///
172+
/// Borrows the needle bytes for `'a` and keeps search metadata next to them for repeated searches with the same needle.
173+
/// Useful when searching multiple haystacks for the same pattern.
174+
///
175+
/// # Examples
176+
///
177+
/// ```
178+
/// use stringzilla::stringzilla::{utf8_case_insensitive_find, Utf8CaseInsensitiveNeedle};
179+
///
180+
/// let needle = Utf8CaseInsensitiveNeedle::new(b"hello");
181+
/// let haystack1 = b"Hello World";
182+
/// let haystack2 = b"HELLO there";
183+
///
184+
/// // Metadata is computed once on first search, reused for subsequent searches
185+
/// let result1 = utf8_case_insensitive_find(haystack1, &needle);
186+
/// let result2 = utf8_case_insensitive_find(haystack2, &needle);
187+
///
188+
/// assert!(result1.is_some());
189+
/// assert!(result2.is_some());
190+
/// ```
191+
pub struct Utf8CaseInsensitiveNeedle<'a> {
192+
needle: &'a [u8],
193+
metadata: UnsafeCell<Utf8CaseInsensitiveNeedleMetadata>, // interior mutability: reachable as a raw `*mut` through `&self`
194+
}
195+
196+
impl<'a> Utf8CaseInsensitiveNeedle<'a> {
197+
/// Wraps `needle` without copying it, pairing it with default-initialized metadata.
198+
///
199+
/// The metadata will be computed lazily on first use.
200+
#[inline]
201+
pub fn new(needle: &'a [u8]) -> Self {
202+
Self {
203+
needle,
204+
metadata: UnsafeCell::new(Utf8CaseInsensitiveNeedleMetadata::default()),
205+
}
206+
}
207+
208+
/// Returns the borrowed needle bytes.
209+
#[inline]
210+
pub fn as_bytes(&self) -> &[u8] {
211+
self.needle
212+
}
213+
214+
/// Returns the length of the needle in bytes.
215+
#[inline]
216+
pub fn len(&self) -> usize {
217+
self.needle.len()
218+
}
219+
220+
/// Returns true if the needle is empty.
221+
#[inline]
222+
pub fn is_empty(&self) -> bool {
223+
self.needle.is_empty()
224+
}
225+
226+
/// Internal: returns the `UnsafeCell`'s raw mutable pointer to the metadata for FFI calls; dereferencing it soundly is the caller's responsibility.
227+
#[inline]
228+
pub(crate) fn metadata_ptr(&self) -> *mut Utf8CaseInsensitiveNeedleMetadata {
229+
self.metadata.get()
230+
}
231+
}
232+
233+
// Safety: NOTE(review) - `metadata` lives in an `UnsafeCell` and is handed to the FFI search as `&mut` through a shared `&self`.
234+
// `Sync` is therefore only sound if that FFI call synchronizes its metadata writes - TODO confirm; the borrowed needle bytes are immutable.
235+
unsafe impl<'a> Send for Utf8CaseInsensitiveNeedle<'a> {}
236+
unsafe impl<'a> Sync for Utf8CaseInsensitiveNeedle<'a> {}
237+
170238
/// Incremental hasher state for StringZilla's 64-bit hash.
171239
///
172240
/// Use `Hasher::new(seed)` to construct, then call `update(&mut self, data)`
@@ -319,6 +387,7 @@ impl<T: AsRef<[u8]>> From<T> for Byteset {
319387
}
320388
}
321389

390+
use core::cell::UnsafeCell;
322391
use core::cmp::Ordering;
323392
use core::ffi::{c_char, c_void, CStr};
324393
use core::fmt::{self, Write};
@@ -994,6 +1063,8 @@ where
9941063
///
9951064
/// # Examples
9961065
///
1066+
/// Basic usage with string slices:
1067+
///
9971068
/// ```
9981069
/// use stringzilla::stringzilla as sz;
9991070
/// let haystack = "Hello WORLD";
@@ -1003,32 +1074,87 @@ where
10031074
/// }
10041075
/// ```
10051076
///
1077+
/// With a pre-compiled needle for repeated searches:
1078+
///
1079+
/// ```
1080+
/// use stringzilla::stringzilla::{utf8_case_insensitive_find, Utf8CaseInsensitiveNeedle};
1081+
///
1082+
/// let needle = Utf8CaseInsensitiveNeedle::new(b"hello");
1083+
///
1084+
/// // Metadata is computed once, reused for subsequent searches
1085+
/// let result1 = utf8_case_insensitive_find(b"Hello World", &needle);
1086+
/// let result2 = utf8_case_insensitive_find(b"HELLO there", &needle);
1087+
///
1088+
/// assert_eq!(result1, Some((0, 5)));
1089+
/// assert_eq!(result2, Some((0, 5)));
1090+
/// ```
1091+
///
10061092
pub fn utf8_case_insensitive_find<H, N>(haystack: H, needle: N) -> Option<(usize, usize)>
10071093
where
10081094
H: AsRef<[u8]>,
1009-
N: AsRef<[u8]>,
1095+
N: Utf8CaseInsensitiveNeedleArg,
10101096
{
1011-
let haystack_ref = haystack.as_ref();
1012-
let needle_ref = needle.as_ref();
1013-
let mut matched_length: usize = 0;
1097+
needle.find_case_insensitive_in(haystack.as_ref())
1098+
}
10141099

1015-
let mut needle_metadata = Utf8CaseInsensitiveNeedleMetadata::default();
1016-
let result = unsafe {
1017-
sz_utf8_case_insensitive_find(
1018-
haystack_ref.as_ptr() as *const c_void,
1019-
haystack_ref.len(),
1020-
needle_ref.as_ptr() as *const c_void,
1021-
needle_ref.len(),
1022-
&mut needle_metadata,
1023-
&mut matched_length,
1024-
)
1025-
};
1100+
/// Trait for types that can be used as a case-insensitive search needle.
1101+
///
1102+
/// This trait is implemented for:
1103+
/// - Any type implementing `AsRef<[u8]>` (strings, byte slices, etc.); fresh metadata is built on every call
1104+
/// - [`Utf8CaseInsensitiveNeedle`] references, which reuse the metadata stored in the needle (that type must not itself implement `AsRef<[u8]>`, or the two impls would overlap)
1105+
pub trait Utf8CaseInsensitiveNeedleArg {
1106+
/// Searches `haystack` for this needle, consuming `self`; returns `Some((offset, matched_length))` with the offset relative to the haystack start, or `None` when the search yields a null pointer.
1107+
fn find_case_insensitive_in(self, haystack: &[u8]) -> Option<(usize, usize)>;
1108+
}
10261109

1027-
if result.is_null() {
1028-
None
1029-
} else {
1030-
let offset = unsafe { result.offset_from(haystack_ref.as_ptr() as *const c_void) };
1031-
Some((offset as usize, matched_length))
1110+
// One-shot path: any byte-like needle is searched with metadata that lives only for this call.
impl<T: AsRef<[u8]>> Utf8CaseInsensitiveNeedleArg for T {
1111+
fn find_case_insensitive_in(self, haystack: &[u8]) -> Option<(usize, usize)> {
1112+
let needle_ref = self.as_ref();
1113+
let mut matched_length: usize = 0; // out-parameter filled by the FFI call
1114+
let mut needle_metadata = Utf8CaseInsensitiveNeedleMetadata::default(); // fresh per call: nothing is cached between searches
1115+
1116+
let result = unsafe { // SAFETY: both pointer/length pairs come from slices that stay borrowed for the whole call
1117+
sz_utf8_case_insensitive_find(
1118+
haystack.as_ptr() as *const c_void,
1119+
haystack.len(),
1120+
needle_ref.as_ptr() as *const c_void,
1121+
needle_ref.len(),
1122+
&mut needle_metadata,
1123+
&mut matched_length,
1124+
)
1125+
};
1126+
1127+
if result.is_null() {
1128+
None
1129+
} else {
1130+
let offset = unsafe { result.offset_from(haystack.as_ptr() as *const c_void) }; // SAFETY (assumed): a non-null result points inside `haystack` - TODO confirm against the FFI contract
1131+
Some((offset as usize, matched_length))
1132+
}
1133+
}
1134+
}
1135+
1136+
// Reusable path: the metadata stored inside the needle is passed to the FFI search instead of a per-call temporary.
impl<'a, 'b> Utf8CaseInsensitiveNeedleArg for &'b Utf8CaseInsensitiveNeedle<'a> {
1137+
fn find_case_insensitive_in(self, haystack: &[u8]) -> Option<(usize, usize)> {
1138+
let needle_bytes = self.as_bytes();
1139+
let mut matched_length: usize = 0; // out-parameter filled by the FFI call
1140+
1141+
let result = unsafe { // SAFETY: both pointer/length pairs come from slices that stay borrowed for the whole call
1142+
sz_utf8_case_insensitive_find(
1143+
haystack.as_ptr() as *const c_void,
1144+
haystack.len(),
1145+
needle_bytes.as_ptr() as *const c_void,
1146+
needle_bytes.len(),
1147+
&mut *self.metadata_ptr(), // NOTE(review): forms a `&mut` to the needle's `UnsafeCell` contents from `&self`; concurrent searches on one needle would alias it - confirm the FFI contract
1148+
&mut matched_length,
1149+
)
1150+
};
1151+
1152+
if result.is_null() {
1153+
None
1154+
} else {
1155+
let offset = unsafe { result.offset_from(haystack.as_ptr() as *const c_void) }; // SAFETY (assumed): a non-null result points inside `haystack` - TODO confirm against the FFI contract
1156+
Some((offset as usize, matched_length))
1157+
}
10321158
}
10331159
}
10341160

0 commit comments

Comments
 (0)