Skip to content

Commit 6fa9c1a

Browse files
authored
Optimize hashing for StringView and ByteView (15-70% faster) (#19374)
## Which issue does this PR close? - builds on #19373 - part of #18411 - Broken out of #19344 - Closes #19344 ## Rationale for this change While looking at performance as part of #18411, I noticed we could speed up string view hashing by optimizing for small strings ## What changes are included in this PR? Optimize StringView hashing, specifically by using the inlined view for short strings ## Are these changes tested? Functionally by existing coverage Performance by benchmarks (added in #19373) which show * 15%-20% faster for mixed short/long strings * 50%-70% faster for "short" arrays where we know there are no strings longer than 12 bytes ``` utf8_view (small): multiple, no nulls 1.00 47.9±1.71µs ? ?/sec 4.00 191.6±1.15µs ? ?/sec utf8_view (small): multiple, nulls 1.00 78.4±0.48µs ? ?/sec 3.08 241.6±1.11µs ? ?/sec utf8_view (small): single, no nulls 1.00 13.9±0.19µs ? ?/sec 4.29 59.7±0.30µs ? ?/sec utf8_view (small): single, nulls 1.00 23.8±0.20µs ? ?/sec 3.10 73.7±1.03µs ? ?/sec utf8_view: multiple, no nulls 1.00 235.4±2.14µs ? ?/sec 1.11 262.2±1.34µs ? ?/sec utf8_view: multiple, nulls 1.00 227.2±2.11µs ? ?/sec 1.34 303.9±2.23µs ? ?/sec utf8_view: single, no nulls 1.00 71.6±0.74µs ? ?/sec 1.05 75.2±1.27µs ? ?/sec utf8_view: single, nulls 1.00 71.5±1.92µs ? ?/sec 1.28 91.6±4.65µs ``` <details><summary>Details</summary> <p> ``` Gnuplot not found, using plotters backend utf8_view: single, no nulls time: [20.872 µs 20.906 µs 20.944 µs] change: [−15.863% −15.614% −15.331%] (p = 0.00 < 0.05) Performance has improved. Found 13 outliers among 100 measurements (13.00%) 8 (8.00%) high mild 5 (5.00%) high severe utf8_view: single, nulls time: [22.968 µs 23.050 µs 23.130 µs] change: [−17.796% −17.384% −16.918%] (p = 0.00 < 0.05) Performance has improved. 
Found 7 outliers among 100 measurements (7.00%) 3 (3.00%) high mild 4 (4.00%) high severe utf8_view: multiple, no nulls time: [66.005 µs 66.155 µs 66.325 µs] change: [−19.077% −18.785% −18.512%] (p = 0.00 < 0.05) Performance has improved. utf8_view: multiple, nulls time: [72.155 µs 72.375 µs 72.649 µs] change: [−17.944% −17.612% −17.266%] (p = 0.00 < 0.05) Performance has improved. Found 11 outliers among 100 measurements (11.00%) 6 (6.00%) high mild 5 (5.00%) high severe utf8_view (small): single, no nulls time: [6.1401 µs 6.1563 µs 6.1747 µs] change: [−69.623% −69.484% −69.333%] (p = 0.00 < 0.05) Performance has improved. Found 6 outliers among 100 measurements (6.00%) 3 (3.00%) high mild 3 (3.00%) high severe utf8_view (small): single, nulls time: [10.234 µs 10.250 µs 10.270 µs] change: [−53.969% −53.815% −53.666%] (p = 0.00 < 0.05) Performance has improved. Found 5 outliers among 100 measurements (5.00%) 5 (5.00%) high severe utf8_view (small): multiple, no nulls time: [20.853 µs 20.905 µs 20.961 µs] change: [−66.006% −65.883% −65.759%] (p = 0.00 < 0.05) Performance has improved. Found 9 outliers among 100 measurements (9.00%) 7 (7.00%) high mild 2 (2.00%) high severe utf8_view (small): multiple, nulls time: [32.519 µs 32.600 µs 32.675 µs] change: [−53.937% −53.581% −53.232%] (p = 0.00 < 0.05) Performance has improved. Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high mild ``` </p> </details> ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 8cc8c11 commit 6fa9c1a

File tree

1 file changed

+125
-11
lines changed

1 file changed

+125
-11
lines changed

datafusion/common/src/hash_utils.rs

Lines changed: 125 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ macro_rules! hash_value {
162162
})+
163163
};
164164
}
165-
hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
165+
hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64, u128);
166166
hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
167167

168168
macro_rules! hash_float_value {
@@ -269,6 +269,127 @@ fn hash_array<T>(
269269
}
270270
}
271271

272+
/// Hash a StringView or BinaryView array
273+
///
274+
/// Templated to optimize inner loop based on presence of nulls and external buffers.
275+
///
276+
/// HAS_NULLS: do we have to check null in the inner loop
277+
/// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined (at most 12 bytes long)
278+
/// REHASH: if true, combining with existing hash, otherwise initializing
279+
#[inline(never)]
280+
fn hash_string_view_array_inner<
281+
T: ByteViewType,
282+
const HAS_NULLS: bool,
283+
const HAS_BUFFERS: bool,
284+
const REHASH: bool,
285+
>(
286+
array: &GenericByteViewArray<T>,
287+
random_state: &RandomState,
288+
hashes_buffer: &mut [u64],
289+
) {
290+
assert_eq!(
291+
hashes_buffer.len(),
292+
array.len(),
293+
"hashes_buffer and array should be of equal length"
294+
);
295+
296+
let buffers = array.data_buffers();
297+
let view_bytes = |view_len: u32, view: u128| {
298+
let view = ByteView::from(view);
299+
let offset = view.offset as usize;
300+
// SAFETY: view is a valid view as it came from the array
301+
unsafe {
302+
let data = buffers.get_unchecked(view.buffer_index as usize);
303+
data.get_unchecked(offset..offset + view_len as usize)
304+
}
305+
};
306+
307+
let hashes_and_views = hashes_buffer.iter_mut().zip(array.views().iter());
308+
for (i, (hash, &v)) in hashes_and_views.enumerate() {
309+
if HAS_NULLS && array.is_null(i) {
310+
continue;
311+
}
312+
let view_len = v as u32;
313+
// either the array has no external buffers or this view is inlined (len <= 12): hash the raw view, no need to access external buffers
314+
if !HAS_BUFFERS || view_len <= 12 {
315+
if REHASH {
316+
*hash = combine_hashes(v.hash_one(random_state), *hash);
317+
} else {
318+
*hash = v.hash_one(random_state);
319+
}
320+
continue;
321+
}
322+
// view is not inlined, so we need to hash the bytes as well
323+
let value = view_bytes(view_len, v);
324+
if REHASH {
325+
*hash = combine_hashes(value.hash_one(random_state), *hash);
326+
} else {
327+
*hash = value.hash_one(random_state);
328+
}
329+
}
330+
}
331+
332+
/// Builds hash values for array views and writes them into `hashes_buffer`.
333+
/// If `rehash==true` this combines the previous hash value in the buffer
334+
/// with the new hash using `combine_hashes`
335+
#[cfg(not(feature = "force_hash_collisions"))]
336+
fn hash_generic_byte_view_array<T: ByteViewType>(
337+
array: &GenericByteViewArray<T>,
338+
random_state: &RandomState,
339+
hashes_buffer: &mut [u64],
340+
rehash: bool,
341+
) {
342+
// instantiate the correct version based on presence of nulls and external buffers
343+
match (
344+
array.null_count() != 0,
345+
!array.data_buffers().is_empty(),
346+
rehash,
347+
) {
348+
// no nulls or buffers ==> hash the inlined views directly
349+
// don't call the inner function as Rust seems better able to inline this simpler code (2-3% faster)
350+
(false, false, false) => {
351+
for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
352+
*hash = view.hash_one(random_state);
353+
}
354+
}
355+
(false, false, true) => {
356+
for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
357+
*hash = combine_hashes(view.hash_one(random_state), *hash);
358+
}
359+
}
360+
(false, true, false) => hash_string_view_array_inner::<T, false, true, false>(
361+
array,
362+
random_state,
363+
hashes_buffer,
364+
),
365+
(false, true, true) => hash_string_view_array_inner::<T, false, true, true>(
366+
array,
367+
random_state,
368+
hashes_buffer,
369+
),
370+
(true, false, false) => hash_string_view_array_inner::<T, true, false, false>(
371+
array,
372+
random_state,
373+
hashes_buffer,
374+
),
375+
(true, false, true) => hash_string_view_array_inner::<T, true, false, true>(
376+
array,
377+
random_state,
378+
hashes_buffer,
379+
),
380+
(true, true, false) => hash_string_view_array_inner::<T, true, true, false>(
381+
array,
382+
random_state,
383+
hashes_buffer,
384+
),
385+
(true, true, true) => hash_string_view_array_inner::<T, true, true, true>(
386+
array,
387+
random_state,
388+
hashes_buffer,
389+
),
390+
}
391+
}
392+
272393
/// Helper function to update hash for a dictionary key if the value is valid
273394
#[cfg(not(feature = "force_hash_collisions"))]
274395
#[inline]
@@ -568,10 +689,10 @@ fn hash_single_array(
568689
DataType::Null => hash_null(random_state, hashes_buffer, rehash),
569690
DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash),
570691
DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash),
571-
DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash),
692+
DataType::Utf8View => hash_generic_byte_view_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
572693
DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash),
573694
DataType::Binary => hash_array(&as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
574-
DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
695+
DataType::BinaryView => hash_generic_byte_view_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
575696
DataType::LargeBinary => hash_array(&as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
576697
DataType::FixedSizeBinary(_) => {
577698
let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
@@ -767,18 +888,13 @@ mod tests {
767888

768889
let binary_array: ArrayRef =
769890
Arc::new(binary.iter().cloned().collect::<$ARRAY>());
770-
let ref_array: ArrayRef =
771-
Arc::new(binary.iter().cloned().collect::<BinaryArray>());
772891

773892
let random_state = RandomState::with_seeds(0, 0, 0, 0);
774893

775894
let mut binary_hashes = vec![0; binary.len()];
776895
create_hashes(&[binary_array], &random_state, &mut binary_hashes)
777896
.unwrap();
778897

779-
let mut ref_hashes = vec![0; binary.len()];
780-
create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
781-
782898
// Null values result in a zero hash,
783899
for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
784900
match val {
@@ -787,9 +903,6 @@ mod tests {
787903
}
788904
}
789905

790-
// same logical values should hash to the same hash value
791-
assert_eq!(binary_hashes, ref_hashes);
792-
793906
// Same values should map to same hash values
794907
assert_eq!(binary[0], binary[5]);
795908
assert_eq!(binary[4], binary[6]);
@@ -801,6 +914,7 @@ mod tests {
801914
}
802915

803916
create_hash_binary!(binary_array, BinaryArray);
917+
create_hash_binary!(large_binary_array, LargeBinaryArray);
804918
create_hash_binary!(binary_view_array, BinaryViewArray);
805919

806920
#[test]

0 commit comments

Comments
 (0)