Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -343,16 +343,8 @@ impl PhysicalExpr for HashTableLookupExpr {

// Check each hash against the hash table
let mut buf = MutableBuffer::from_len_zeroed(bit_util::ceil(num_rows, 8));
for (idx, hash_value) in hash_array.values().iter().enumerate() {
// Use get_matched_indices to check - if it returns any indices, the hash exists
let (matched_indices, _) = self
.hash_map
.get_matched_indices(Box::new(std::iter::once((idx, hash_value))), None);

if !matched_indices.is_empty() {
bit_util::set_bit(buf.as_slice_mut(), idx);
}
}
self.hash_map
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of initializing a zeroed buffer here and passing a mutable slice, this could build the result with `BooleanBuffer::collect_bool`, which is faster.

.set_bits_if_exists(hash_array.values(), buf.as_slice_mut());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could use `with_hashes` to reuse the hashes buffer instead of allocating a new one each time.


Ok(ColumnarValue::Array(Arc::new(
BooleanArray::new_from_packed(buf, 0, num_rows),
Expand Down
58 changes: 58 additions & 0 deletions datafusion/physical-plan/src/joins/join_hash_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use std::fmt::{self, Debug};
use std::ops::Sub;

use arrow::datatypes::ArrowNativeType;
use arrow::util::bit_util;
use hashbrown::HashTable;
use hashbrown::hash_table::Entry::{Occupied, Vacant};

Expand Down Expand Up @@ -124,6 +125,9 @@ pub trait JoinHashMapType: Send + Sync {
match_indices: &mut Vec<u64>,
) -> Option<JoinHashMapOffset>;

/// Sets bits in the provided buffer if the corresponding hash exists in the map.
///
/// Bit `i` of `buffer` corresponds to `hash_values[i]`.
///
/// NOTE(review): the implementations visible in this change only ever set
/// bits and never clear them, so callers presumably pass a zero-initialized
/// buffer holding at least `hash_values.len()` bits — confirm.
fn set_bits_if_exists(&self, hash_values: &[u64], buffer: &mut [u8]);

/// Returns `true` if the join hash map contains no entries.
fn is_empty(&self) -> bool;

Expand Down Expand Up @@ -196,6 +200,10 @@ impl JoinHashMapType for JoinHashMapU32 {
)
}

// Delegates to the free function `set_bits_if_exists`, probing `self.map`
// with `u32` as the type of the value stored next to each hash.
fn set_bits_if_exists(&self, hash_values: &[u64], buffer: &mut [u8]) {
set_bits_if_exists::<u32>(&self.map, hash_values, buffer);
}

// Reports whether the underlying `map` currently holds no entries.
fn is_empty(&self) -> bool {
self.map.is_empty()
}
Expand Down Expand Up @@ -270,6 +278,10 @@ impl JoinHashMapType for JoinHashMapU64 {
)
}

// Delegates to the free function `set_bits_if_exists`, probing `self.map`
// with `u64` as the type of the value stored next to each hash.
fn set_bits_if_exists(&self, hash_values: &[u64], buffer: &mut [u8]) {
set_bits_if_exists::<u64>(&self.map, hash_values, buffer);
}

// Reports whether the underlying `map` currently holds no entries.
fn is_empty(&self) -> bool {
self.map.is_empty()
}
Expand Down Expand Up @@ -496,3 +508,49 @@ where
}
None
}

/// Flags, in a packed bitmap, which probe hashes are present in `map`.
///
/// For every position `i`, bit `i` of `buffer` is set when `hash_values[i]`
/// equals the hash stored in at least one entry of `map`. Bits belonging to
/// hashes that are not found are left exactly as they were on entry.
///
/// Only the stored hash (the first tuple field) is compared; the associated
/// `T` value is never inspected.
///
/// NOTE(review): `buffer` presumably must provide at least
/// `hash_values.len()` bits — confirm against `bit_util::set_bit`.
pub fn set_bits_if_exists<T>(
    map: &HashTable<(u64, T)>,
    hash_values: &[u64],
    buffer: &mut [u8],
) {
    hash_values
        .iter()
        .copied()
        .enumerate()
        .filter(|&(_, probe)| map.find(probe, |entry| entry.0 == probe).is_some())
        .for_each(|(pos, _)| bit_util::set_bit(buffer, pos));
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Verifies that `set_bits_if_exists` sets the bit for each probe hash
    /// that was inserted on the build side and leaves the others cleared.
    #[test]
    fn test_set_bits_if_exists() {
        // Build side: the map contains the hashes 10, 20 and 30.
        let mut hash_map = JoinHashMapU32::with_capacity(10);
        hash_map.update_from_iter(
            Box::new([10u64, 20u64, 30u64].iter().enumerate()),
            0,
        );

        // Probe side: alternate between hashes that were inserted and
        // neighbours that were not. Six probes fit into a single byte.
        let probe_hashes = vec![10, 11, 20, 21, 30, 31];
        let mut buffer = vec![0u8; 1];
        hash_map.set_bits_if_exists(&probe_hashes, &mut buffer);

        for (pos, &hash) in probe_hashes.iter().enumerate() {
            let is_set = bit_util::get_bit(&buffer, pos);
            match hash {
                10 | 20 | 30 => {
                    assert!(is_set, "Hash {hash} should exist in the map")
                }
                _ => {
                    assert!(!is_set, "Hash {hash} should NOT exist in the map")
                }
            }
        }
    }
}
6 changes: 5 additions & 1 deletion datafusion/physical-plan/src/joins/stream_join_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use std::sync::Arc;

use crate::joins::join_hash_map::{
JoinHashMapOffset, get_matched_indices, get_matched_indices_with_limit_offset,
update_from_iter,
set_bits_if_exists, update_from_iter,
};
use crate::joins::utils::{JoinFilter, JoinHashMapType};
use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricBuilder};
Expand Down Expand Up @@ -94,6 +94,10 @@ impl JoinHashMapType for PruningJoinHashMap {
)
}

// Delegates to `set_bits_if_exists` imported from
// `crate::joins::join_hash_map`, probing `self.map` with `u64` as the type
// of the value stored next to each hash.
fn set_bits_if_exists(&self, hash_values: &[u64], buffer: &mut [u8]) {
set_bits_if_exists::<u64>(&self.map, hash_values, buffer);
}

// Reports whether the underlying `map` currently holds no entries.
fn is_empty(&self) -> bool {
self.map.is_empty()
}
Expand Down
Loading