diff --git a/c/stringzilla.c b/c/stringzilla.c
index 5bbe79b1..8c34dc8d 100644
--- a/c/stringzilla.c
+++ b/c/stringzilla.c
@@ -63,6 +63,7 @@ typedef struct sz_implementations_t {
 
     sz_sequence_argsort_t sequence_argsort;
     sz_sequence_intersect_t sequence_intersect;
+    sz_sequence_hashes_t sequence_hashes;
     sz_pgrams_sort_t pgrams_sort;
 
 } sz_implementations_t;
@@ -100,6 +101,7 @@ static void sz_dispatch_table_update_implementation_(sz_capability_t caps) {
 
     impl->sequence_argsort = sz_sequence_argsort_serial;
     impl->sequence_intersect = sz_sequence_intersect_serial;
+    impl->sequence_hashes = sz_sequence_hashes_serial;
     impl->pgrams_sort = sz_pgrams_sort_serial;
 
 #if SZ_USE_HASWELL
@@ -169,6 +171,7 @@ static void sz_dispatch_table_update_implementation_(sz_capability_t caps) {
         impl->fill_random = sz_fill_random_ice;
         impl->sequence_intersect = sz_sequence_intersect_ice;
+        impl->sequence_hashes = sz_sequence_hashes_ice;
     }
 #endif
@@ -220,6 +223,7 @@ static void sz_dispatch_table_update_implementation_(sz_capability_t caps) {
         impl->sequence_argsort = sz_sequence_argsort_sve;
         impl->sequence_intersect = sz_sequence_intersect_sve;
+        impl->sequence_hashes = sz_sequence_hashes_sve;
         impl->pgrams_sort = sz_pgrams_sort_sve;
     }
 #endif
@@ -385,6 +389,11 @@ SZ_DYNAMIC sz_status_t sz_sequence_intersect(sz_sequence_t const *first_array, s
                                          first_positions, second_positions);
 }
 
+SZ_DYNAMIC void sz_sequence_hashes(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count, sz_u64_t seed,
+                                   sz_u64_t *hashes) {
+    sz_dispatch_table.sequence_hashes(starts, lengths, count, seed, hashes);
+}
+
 // Provide overrides for the libc mem* functions
 #if SZ_OVERRIDE_LIBC && !defined(__CYGWIN__)

diff --git a/include/stringzilla/intersect.h b/include/stringzilla/intersect.h
index 85e8e5bc..3d4f3c1f 100644
--- a/include/stringzilla/intersect.h
+++ b/include/stringzilla/intersect.h
@@ -84,6 +84,22 @@ SZ_DYNAMIC sz_status_t sz_sequence_intersect(sz_sequence_t const *first_sequence
                                              sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size,
                                              sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions);
 
+/**
+ *  @brief Computes 64-bit hashes for a batch of strings provided as pointer arrays.
+ *
+ *  @param[in] starts Array of pointers to the start of each string.
+ *  @param[in] lengths Array of lengths for each string.
+ *  @param[in] count Number of strings to hash.
+ *  @param[in] seed Optional seed, randomizing the hashes to resist hash-flooding attacks.
+ *  @param[out] hashes Output array of 64-bit hashes (must fit `count` entries).
+ *
+ *  @note The algorithm has linear time complexity and uses SIMD batching for short strings (≤16 bytes).
+ *  @note Selects the fastest implementation at compile- or run-time based on `SZ_DYNAMIC_DISPATCH`.
+ *  @sa sz_sequence_hashes_serial, sz_sequence_hashes_ice, sz_sequence_hashes_sve
+ */
+SZ_DYNAMIC void sz_sequence_hashes(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count, sz_u64_t seed,
+                                   sz_u64_t *hashes);
+
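For orientation, here is a minimal usage sketch of the API declared above. It is illustrative only, not part of the patch: the function name and input literals are made up, and only `sz_sequence_hashes` itself comes from this change.

// Usage sketch (hypothetical helper): hash a small batch of strings in one call.
#include <stringzilla/stringzilla.h>

void hash_small_batch(void) {
    sz_cptr_t starts[3] = {"apple", "banana", "cherry"};
    sz_size_t lengths[3] = {5, 6, 6};
    sz_u64_t hashes[3];
    sz_sequence_hashes(starts, lengths, 3, /* seed */ 0, hashes);
    // hashes[i] now holds the 64-bit hash of the i-th string
}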
 /**
  * @brief Defines various JOIN semantics for string sequences, including handling of duplicates.
  * @sa sz_join_inner_strict_k, sz_join_inner_k, sz_join_left_outer_k, sz_join_right_outer_k, sz_join_full_outer_k,
@@ -237,6 +253,10 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice( //
     sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size, //
     sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions);
 
+/** @copydoc sz_sequence_hashes */
+SZ_PUBLIC void sz_sequence_hashes_ice(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count, sz_u64_t seed,
+                                      sz_u64_t *hashes);
+
 #endif
 
 #if SZ_USE_SVE
@@ -247,12 +267,21 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_sve( //
     sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size, //
     sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions);
 
+/** @copydoc sz_sequence_hashes */
+SZ_PUBLIC void sz_sequence_hashes_sve(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count, sz_u64_t seed,
+                                      sz_u64_t *hashes);
+
 #endif
 
 #pragma endregion
 
 #pragma region Serial Implementation
 
+SZ_PUBLIC void sz_sequence_hashes_serial(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count,
+                                         sz_u64_t seed, sz_u64_t *hashes) {
+    for (sz_size_t i = 0; i < count; ++i) { hashes[i] = sz_hash(starts[i], lengths[i], seed); }
+}
+
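The serial kernel above doubles as the reference contract: every accelerated backend must produce bit-identical output to per-string `sz_hash` calls. A self-check sketch under that assumption follows; it is illustrative, not part of the patch, the helper name is hypothetical, and it assumes `count <= 64` purely to avoid allocation.

#include <assert.h>
#include <stringzilla/stringzilla.h>

// Whatever backend the dispatcher picks, batched results must agree with scalar ones.
static void check_batched_matches_scalar(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count,
                                         sz_u64_t seed) {
    sz_u64_t batched[64]; // the sketch assumes count <= 64
    sz_sequence_hashes(starts, lengths, count, seed, batched);
    for (sz_size_t i = 0; i < count; ++i) assert(batched[i] == sz_hash(starts[i], lengths[i], seed));
}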
 SZ_PUBLIC sz_status_t sz_sequence_intersect_serial(                            //
     sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, //
     sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_count_ptr, //
@@ -379,14 +408,74 @@ SZ_INTERNAL int sz_u64x4_contains_collisions_haswell_(__m256i v) {
     return mask;
 }
 
+SZ_PUBLIC void sz_sequence_hashes_ice(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count, sz_u64_t seed,
+                                      sz_u64_t *hashes) {
+
+    // Conceptually the Ice Lake variant is similar to the serial one, except it takes advantage of:
+    // - computing 4x individual high-quality hashes with `_mm512_aesenc_epi128`.
+    sz_align_(64) sz_hash_minimal_x4_t_ batch_hashes_states_initial;
+    sz_hash_minimal_x4_init_ice_(&batch_hashes_states_initial, seed);
+
+    for (sz_size_t position = 0; position < count;) {
+        sz_string_view_t batch[4];
+        sz_size_t batch_indices[4];
+        sz_size_t batch_size = 0;
+
+        // Fill the batch with up to 4 short strings (≤16 bytes)
+        for (; position < count && batch_size < 4; position++) {
+            sz_size_t length = lengths[position];
+            sz_cptr_t start = starts[position];
+
+            if (length <= 16) {
+                batch[batch_size].start = start;
+                batch[batch_size].length = length;
+                batch_indices[batch_size] = position;
+                batch_size++;
+            }
+            else {
+                // Long string: hash immediately and continue scanning
+                hashes[position] = sz_hash(start, length, seed);
+            }
+        }
+
+        // Nothing short was collected in this pass - move on
+        if (batch_size == 0) continue;
+
+        // For partial batches, use the scalar fallback
+        if (batch_size < 4) {
+            for (sz_size_t i = 0; i < batch_size; i++)
+                hashes[batch_indices[i]] = sz_hash(batch[i].start, batch[i].length, seed);
+            continue;
+        }
+
+        // Vectorized path: exactly 4 short strings
+        sz_u256_vec_t batch_hashes;
+        sz_u512_vec_t batch_prefixes;
+        batch_prefixes.xmms[0] = _mm_maskz_loadu_epi8(sz_u16_mask_until_(batch[0].length), batch[0].start);
+        batch_prefixes.xmms[1] = _mm_maskz_loadu_epi8(sz_u16_mask_until_(batch[1].length), batch[1].start);
+        batch_prefixes.xmms[2] = _mm_maskz_loadu_epi8(sz_u16_mask_until_(batch[2].length), batch[2].start);
+        batch_prefixes.xmms[3] = _mm_maskz_loadu_epi8(sz_u16_mask_until_(batch[3].length), batch[3].start);
+
+        sz_align_(64) sz_hash_minimal_x4_t_ batch_hashes_states = batch_hashes_states_initial;
+        sz_hash_minimal_x4_update_ice_(&batch_hashes_states, batch_prefixes.zmm);
+        batch_hashes.ymm = sz_hash_minimal_x4_finalize_ice_(&batch_hashes_states, batch[0].length, batch[1].length,
+                                                            batch[2].length, batch[3].length);
+
+        hashes[batch_indices[0]] = batch_hashes.u64s[0];
+        hashes[batch_indices[1]] = batch_hashes.u64s[1];
+        hashes[batch_indices[2]] = batch_hashes.u64s[2];
+        hashes[batch_indices[3]] = batch_hashes.u64s[3];
+    }
+}
+
 SZ_PUBLIC sz_status_t sz_sequence_intersect_ice(                               //
     sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence, //
     sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_count_ptr, //
     sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) {
 
     // To join two unordered sets of strings, the simplest approach would be to hash them into a dynamically
-    // allocated hash table and then iterate over the second set, checking for the presence of each element in the
-    // hash table. This would require O(N) memory and O(N) time complexity, where N is the smaller set.
+    // allocated hash table and then iterate over the second set, checking for the presence of each element
+    // in the hash table. This would require O(N) memory and O(N) time complexity, where N is the smaller set.
     sz_sequence_t const *small_sequence, *large_sequence;
     sz_sorted_idx_t *small_positions, *large_positions;
     if (first_sequence->count <= second_sequence->count) {
@@ -431,7 +520,7 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice( //
 
     // For larger entries, we will use a separate loop afterwards to decrease the likelihood of collisions
     // on the shorter entries, which can benefit from vectorized processing.
-    sz_hash_minimal_x4_t_ batch_hashes_states_initial;
+    sz_align_(64) sz_hash_minimal_x4_t_ batch_hashes_states_initial;
     sz_hash_minimal_x4_init_ice_(&batch_hashes_states_initial, seed);
     sz_size_t count_longer = 0;
     for (sz_size_t small_position = 0; small_position < small_sequence->count;) {
@@ -476,7 +565,7 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice(
         batch_prefixes.xmms[3] = _mm_maskz_loadu_epi8(sz_u16_mask_until_(batch[3].length), batch[3].start);
 
         // Reuse the already computed state for hashes
-        sz_hash_minimal_x4_t_ batch_hashes_states = batch_hashes_states_initial;
+        sz_align_(64) sz_hash_minimal_x4_t_ batch_hashes_states = batch_hashes_states_initial;
         sz_hash_minimal_x4_update_ice_(&batch_hashes_states, batch_prefixes.zmm);
         batch_hashes.ymm = sz_hash_minimal_x4_finalize_ice_(&batch_hashes_states, batch[0].length, batch[1].length,
                                                             batch[2].length, batch[3].length);
@@ -578,7 +667,7 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice(
         batch_prefixes.xmms[3] = _mm_maskz_loadu_epi8(sz_u16_mask_until_(batch[3].length), batch[3].start);
 
         // Reuse the already computed state for hashes
-        sz_hash_minimal_x4_t_ batch_hashes_states = batch_hashes_states_initial;
+        sz_align_(64) sz_hash_minimal_x4_t_ batch_hashes_states = batch_hashes_states_initial;
         sz_hash_minimal_x4_update_ice_(&batch_hashes_states, batch_prefixes.zmm);
         batch_hashes.ymm = sz_hash_minimal_x4_finalize_ice_(&batch_hashes_states, batch[0].length, batch[1].length,
                                                             batch[2].length, batch[3].length);
@@ -742,6 +831,12 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice(
 #pragma GCC target("arch=armv8.2-a+sve")
 #endif
 
+SZ_PUBLIC void sz_sequence_hashes_sve(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count, sz_u64_t seed,
+                                      sz_u64_t *hashes) {
+    // TODO: Finalize `sz_hash_sve2_upto16x16_` and integrate it here
+    sz_sequence_hashes_serial(starts, lengths, count, seed, hashes);
+}
+
 SZ_PUBLIC sz_status_t sz_sequence_intersect_sve(sz_sequence_t const *first_sequence,
                                                 sz_sequence_t const *second_sequence, //
                                                 sz_memory_allocator_t *alloc, sz_u64_t seed,
@@ -768,6 +863,16 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_sve(sz_sequence_t const *first_seque
 
 #pragma region Compile Time Dispatching
 #if !SZ_DYNAMIC_DISPATCH
 
+SZ_DYNAMIC void sz_sequence_hashes(sz_cptr_t const *starts, sz_size_t const *lengths, sz_size_t count, sz_u64_t seed,
+                                   sz_u64_t *hashes) {
+#if SZ_USE_ICE
+    sz_sequence_hashes_ice(starts, lengths, count, seed, hashes);
+#elif SZ_USE_SVE
+    sz_sequence_hashes_sve(starts, lengths, count, seed, hashes);
+#else
+    sz_sequence_hashes_serial(starts, lengths, count, seed, hashes);
+#endif
+}
+
 SZ_DYNAMIC sz_status_t sz_sequence_intersect(sz_sequence_t const *first_sequence, sz_sequence_t const *second_sequence,
                                              sz_memory_allocator_t *alloc, sz_u64_t seed, sz_size_t *intersection_size,
                                              sz_sorted_idx_t *first_positions, sz_sorted_idx_t *second_positions) {
diff --git a/include/stringzilla/types.h b/include/stringzilla/types.h
index 92cd7e7c..156fa820 100644
--- a/include/stringzilla/types.h
+++ b/include/stringzilla/types.h
@@ -770,6 +770,9 @@ typedef sz_status_t (*sz_sequence_intersect_t)(struct sz_sequence_t const *, str
                                                sz_memory_allocator_t *, sz_u64_t, sz_size_t *, sz_sorted_idx_t *,
                                                sz_sorted_idx_t *);
 
+/** @brief Signature of `sz_sequence_hashes`. */
+typedef void (*sz_sequence_hashes_t)(sz_cptr_t const *, sz_size_t const *, sz_size_t, sz_u64_t, sz_u64_t *);
+
 #pragma endregion
 
 #pragma region Helper Structures
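One more illustrative note before the Rust bindings below: the `seed` argument re-keys the whole batch, so identical inputs hashed under different seeds should disagree; the `batched_hashing` test at the end of this patch asserts exactly that. A sketch reusing the public API above, with a hypothetical function name:

#include <stringzilla/stringzilla.h>

// Re-keying sketch (illustrative): different seeds are expected to yield different hashes.
void compare_seeds(void) {
    sz_cptr_t starts[2] = {"apple", "banana"};
    sz_size_t lengths[2] = {5, 6};
    sz_u64_t seeded_0[2], seeded_42[2];
    sz_sequence_hashes(starts, lengths, 2, 0, seeded_0);
    sz_sequence_hashes(starts, lengths, 2, 42, seeded_42);
    // Expect seeded_0[i] != seeded_42[i] for a well-behaved keyed hash
}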
diff --git a/rust/stringzilla.rs b/rust/stringzilla.rs
index 1beb3e8e..7ffbce0c 100644
--- a/rust/stringzilla.rs
+++ b/rust/stringzilla.rs
@@ -239,6 +239,14 @@ extern "C" {
         second_positions: *mut SortedIdx,
     ) -> Status;
 
+    pub(crate) fn sz_sequence_hashes(
+        starts: *const *const u8,
+        lengths: *const usize,
+        count: usize,
+        seed: u64,
+        hashes: *mut u64,
+    );
+
 }
 
 impl SemVer {
@@ -681,6 +689,134 @@ where
     hash_with_seed(text, 0)
 }
 
+/// Iterator adapter that computes hashes in batches for efficiency.
+/// Uses SIMD-accelerated batch hashing for strings ≤16 bytes.
+pub struct SzHashes<I, const BATCH_SIZE: usize>
+where
+    I: Iterator,
+    I::Item: AsRef<[u8]>,
+{
+    source: I,
+    seed: u64,
+    items: [Option<I::Item>; BATCH_SIZE],
+    starts: [*const u8; BATCH_SIZE],
+    lengths: [usize; BATCH_SIZE],
+    hashes: [u64; BATCH_SIZE],
+    batch_len: usize,
+    batch_pos: usize,
+}
+
+impl<I, const BATCH_SIZE: usize> SzHashes<I, BATCH_SIZE>
+where
+    I: Iterator,
+    I::Item: AsRef<[u8]>,
+{
+    fn new(source: I, seed: u64) -> Self {
+        Self {
+            source,
+            seed,
+            items: core::array::from_fn(|_| None),
+            starts: [core::ptr::null(); BATCH_SIZE],
+            lengths: [0; BATCH_SIZE],
+            hashes: [0; BATCH_SIZE],
+            batch_len: 0,
+            batch_pos: 0,
+        }
+    }
+
+    fn fill_batch(&mut self) -> bool {
+        self.batch_len = 0;
+        self.batch_pos = 0;
+
+        // Collect a batch of items, keeping them alive and extracting pointers upfront
+        for i in 0..BATCH_SIZE {
+            self.items[i] = self.source.next();
+            if let Some(ref item) = self.items[i] {
+                let slice = item.as_ref();
+                self.starts[i] = slice.as_ptr();
+                self.lengths[i] = slice.len();
+                self.batch_len += 1;
+            } else {
+                break;
+            }
+        }
+
+        if self.batch_len > 0 {
+            // Call the C function directly with pointer arrays - no callbacks!
+            unsafe {
+                sz_sequence_hashes(
+                    self.starts.as_ptr(),
+                    self.lengths.as_ptr(),
+                    self.batch_len,
+                    self.seed,
+                    self.hashes.as_mut_ptr(),
+                );
+            }
+            true
+        } else {
+            false
+        }
+    }
+}
+
+impl<I, const BATCH_SIZE: usize> Iterator for SzHashes<I, BATCH_SIZE>
+where
+    I: Iterator,
+    I::Item: AsRef<[u8]>,
+{
+    type Item = u64;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.batch_pos >= self.batch_len {
+            if !self.fill_batch() {
+                return None;
+            }
+        }
+
+        let hash = self.hashes[self.batch_pos];
+        self.batch_pos += 1;
+        Some(hash)
+    }
+}
+
+/// Extension trait for iterator-based batched hashing.
+pub trait SzHashExt: Iterator {
+    /// Compute hashes for iterator items in batches for efficiency.
+    /// Uses the default batch size of 32.
+    ///
+    /// # Examples
+    /// ```
+    /// use stringzilla::stringzilla::SzHashExt;
+    /// let strings = vec!["apple", "banana", "cherry"];
+    /// let hashes: Vec<u64> = strings.iter().sz_hashes(0).collect();
+    /// ```
+    fn sz_hashes(self, seed: u64) -> SzHashes<Self, 32>
+    where
+        Self: Sized,
+        Self::Item: AsRef<[u8]>,
+    {
+        SzHashes::new(self, seed)
+    }
+
+    /// Compute hashes for iterator items in batches with a custom batch size.
+    ///
+    /// # Examples
+    /// ```
+    /// use stringzilla::stringzilla::SzHashExt;
+    /// let strings = vec!["apple", "banana", "cherry"];
+    /// let hashes: Vec<u64> = strings.iter().sz_hashes_with_batch_size::<64>(0).collect();
+    /// ```
+    fn sz_hashes_with_batch_size<const BATCH_SIZE: usize>(self, seed: u64) -> SzHashes<Self, BATCH_SIZE>
+    where
+        Self: Sized,
+        Self::Item: AsRef<[u8]>,
+    {
+        SzHashes::new(self, seed)
+    }
+}
+
+impl<I> SzHashExt for I where I: Iterator {}
+
 /// Locates the first matching substring within `haystack` that equals `needle`.
 /// This function is similar to the `memmem()` function in LibC, but, unlike `strstr()`,
 /// it requires the length of both haystack and needle to be known beforehand.
@@ -1884,6 +2020,37 @@ mod tests {
         }
     }
 
+    #[test]
+    fn batched_hashing() {
+        use crate::stringzilla::SzHashExt;
+
+        let strings = vec!["apple", "banana", "cherry", "date", "elderberry"];
+
+        // Compute hashes using the batched iterator with the default batch size
+        let batched_hashes: Vec<u64> = strings.iter().sz_hashes(0).collect();
+
+        // Compute hashes individually for comparison
+        let individual_hashes: Vec<u64> = strings.iter().map(|s| sz::hash(s)).collect();
+
+        // They should match
+        assert_eq!(batched_hashes.len(), individual_hashes.len());
+        for (batched, individual) in batched_hashes.iter().zip(individual_hashes.iter()) {
+            assert_eq!(batched, individual);
+        }
+
+        // Test with a custom batch size
+        let batched_hashes_2: Vec<u64> = strings.iter().sz_hashes_with_batch_size::<2>(0).collect();
+        assert_eq!(batched_hashes, batched_hashes_2);
+
+        // Test with a non-zero seed
+        let batched_hashes_seed: Vec<u64> = strings.iter().sz_hashes(42).collect();
+        let individual_hashes_seed: Vec<u64> = strings.iter().map(|s| sz::hash_with_seed(s, 42)).collect();
+        assert_eq!(batched_hashes_seed, individual_hashes_seed);
+
+        // Different seeds should produce different hashes
+        assert_ne!(batched_hashes, batched_hashes_seed);
+    }
+
     #[test]
     fn streaming_hash() {
         let mut hasher = sz::Hasher::new(123);