Skip to content

Commit c53c593

Browse files
committed
datafusion/common: Add support for hashing ListView arrays
1 parent 79869a7 commit c53c593

File tree

1 file changed

+144
-2
lines changed

1 file changed

+144
-2
lines changed

datafusion/common/src/hash_utils.rs

Lines changed: 144 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array};
2727
#[cfg(not(feature = "force_hash_collisions"))]
2828
use crate::cast::{
2929
as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
30-
as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31-
as_string_array, as_string_view_array, as_struct_array, as_union_array,
30+
as_generic_binary_array, as_large_list_array, as_large_list_view_array,
31+
as_list_array, as_list_view_array, as_map_array, as_string_array,
32+
as_string_view_array, as_struct_array, as_union_array,
3233
};
3334
use crate::error::Result;
3435
use crate::error::{_internal_datafusion_err, _internal_err};
@@ -538,6 +539,45 @@ where
538539
Ok(())
539540
}
540541

542+
#[cfg(not(feature = "force_hash_collisions"))]
543+
fn hash_list_view_array<OffsetSize>(
544+
array: &GenericListViewArray<OffsetSize>,
545+
random_state: &RandomState,
546+
hashes_buffer: &mut [u64],
547+
) -> Result<()>
548+
where
549+
OffsetSize: OffsetSizeTrait,
550+
{
551+
let values = array.values();
552+
let offsets = array.value_offsets();
553+
let sizes = array.value_sizes();
554+
let nulls = array.nulls();
555+
let mut values_hashes = vec![0u64; values.len()];
556+
create_hashes([values], random_state, &mut values_hashes)?;
557+
if let Some(nulls) = nulls {
558+
for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
559+
if nulls.is_valid(i) {
560+
let hash = &mut hashes_buffer[i];
561+
let start = offset.as_usize();
562+
let end = start + size.as_usize();
563+
for values_hash in &values_hashes[start..end] {
564+
*hash = combine_hashes(*hash, *values_hash);
565+
}
566+
}
567+
}
568+
} else {
569+
for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
570+
let hash = &mut hashes_buffer[i];
571+
let start = offset.as_usize();
572+
let end = start + size.as_usize();
573+
for values_hash in &values_hashes[start..end] {
574+
*hash = combine_hashes(*hash, *values_hash);
575+
}
576+
}
577+
}
578+
Ok(())
579+
}
580+
541581
#[cfg(not(feature = "force_hash_collisions"))]
542582
fn hash_union_array(
543583
array: &UnionArray,
@@ -714,6 +754,14 @@ fn hash_single_array(
714754
let array = as_large_list_array(array)?;
715755
hash_list_array(array, random_state, hashes_buffer)?;
716756
}
757+
DataType::ListView(_) => {
758+
let array = as_list_view_array(array)?;
759+
hash_list_view_array(array, random_state, hashes_buffer)?;
760+
}
761+
DataType::LargeListView(_) => {
762+
let array = as_large_list_view_array(array)?;
763+
hash_list_view_array(array, random_state, hashes_buffer)?;
764+
}
717765
DataType::Map(_, _) => {
718766
let array = as_map_array(array)?;
719767
hash_map_array(array, random_state, hashes_buffer)?;
@@ -1128,6 +1176,100 @@ mod tests {
11281176
assert_eq!(hashes[1], hashes[6]); // null vs empty list
11291177
}
11301178

1179+
#[test]
// Exercises the real hashing path, which is compiled out when collisions are forced
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_list_view_arrays() {
    use arrow::buffer::{NullBuffer, ScalarBuffer};

    // Child values shared by all rows: [0, 1, 2, 3, null, 5]
    let child: ArrayRef = Arc::new(Int32Array::from(vec![
        Some(0),
        Some(1),
        Some(2),
        Some(3),
        None,
        Some(5),
    ]));
    let item_field = Arc::new(Field::new("item", DataType::Int32, true));

    // Logical rows described by the (offset, size, validity) triples below:
    //   row 0: [0, 1, 2]     offset=0, size=3
    //   row 1: null          marked null in the validity buffer
    //   row 2: [3, null, 5]  offset=3, size=3
    //   row 3: [3, null, 5]  offset=3, size=3 (same window as row 2)
    //   row 4: null          marked null in the validity buffer
    //   row 5: [0, 1, 2]     offset=0, size=3 (same window as row 0)
    //   row 6: []            offset=0, size=0 (valid but empty)
    let row_offsets = ScalarBuffer::from(vec![0i32, 0, 3, 3, 0, 0, 0]);
    let row_sizes = ScalarBuffer::from(vec![3i32, 0, 3, 3, 0, 3, 0]);
    let validity =
        NullBuffer::from(vec![true, false, true, true, false, true, true]);

    let array: ArrayRef = Arc::new(ListViewArray::new(
        item_field,
        row_offsets,
        row_sizes,
        child,
        Some(validity),
    ));

    let random_state = RandomState::with_seeds(0, 0, 0, 0);
    let mut hashes = vec![0; array.len()];
    create_hashes(&[array], &random_state, &mut hashes).unwrap();

    // Pairs of rows whose hashes must be equal, with the reason for each.
    let expected_equal = [
        (0usize, 5usize, "same content [0, 1, 2]"),
        (1, 4, "both null"),
        (2, 3, "same content [3, null, 5]"),
        (1, 6, "null vs empty list"),
    ];
    for (left, right, why) in expected_equal {
        assert_eq!(hashes[left], hashes[right], "{why}");
    }
}
1225+
1226+
#[test]
// Exercises the real hashing path, which is compiled out when collisions are forced
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_large_list_view_arrays() {
    use arrow::buffer::{NullBuffer, ScalarBuffer};

    // Child values shared by all rows: [0, 1, 2, 3, null, 5]
    let child: ArrayRef = Arc::new(Int32Array::from(vec![
        Some(0),
        Some(1),
        Some(2),
        Some(3),
        None,
        Some(5),
    ]));
    let item_field = Arc::new(Field::new("item", DataType::Int32, true));

    // Logical rows described by the 64-bit (offset, size, validity) triples below:
    //   row 0: [0, 1, 2]     offset=0, size=3
    //   row 1: null          marked null in the validity buffer
    //   row 2: [3, null, 5]  offset=3, size=3
    //   row 3: [3, null, 5]  offset=3, size=3 (same window as row 2)
    //   row 4: null          marked null in the validity buffer
    //   row 5: [0, 1, 2]     offset=0, size=3 (same window as row 0)
    //   row 6: []            offset=0, size=0 (valid but empty)
    let row_offsets = ScalarBuffer::from(vec![0i64, 0, 3, 3, 0, 0, 0]);
    let row_sizes = ScalarBuffer::from(vec![3i64, 0, 3, 3, 0, 3, 0]);
    let validity =
        NullBuffer::from(vec![true, false, true, true, false, true, true]);

    let array: ArrayRef = Arc::new(LargeListViewArray::new(
        item_field,
        row_offsets,
        row_sizes,
        child,
        Some(validity),
    ));

    let random_state = RandomState::with_seeds(0, 0, 0, 0);
    let mut hashes = vec![0; array.len()];
    create_hashes(&[array], &random_state, &mut hashes).unwrap();

    // Pairs of rows whose hashes must be equal, with the reason for each.
    let expected_equal = [
        (0usize, 5usize, "same content [0, 1, 2]"),
        (1, 4, "both null"),
        (2, 3, "same content [3, null, 5]"),
        (1, 6, "null vs empty list"),
    ];
    for (left, right, why) in expected_equal {
        assert_eq!(hashes[left], hashes[right], "{why}");
    }
}
1272+
11311273
#[test]
11321274
// Tests actual values of hashes, which are different if forcing collisions
11331275
#[cfg(not(feature = "force_hash_collisions"))]

0 commit comments

Comments
 (0)