Skip to content

Commit 080b67e

Browse files
Implement String View (Utf8View/BinaryView) Optimizations
Introduces a two-stage filter for ByteView types. Stage 1 uses a fast DirectProbeFilter on masked views (length + prefix) for quick rejection; Stage 2 performs full verification only for potential long-string matches. The filter is used for the Utf8View and BinaryView data types.
1 parent f1d36a9 commit 080b67e

File tree

4 files changed

+344
-7
lines changed

4 files changed

+344
-7
lines changed

datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,16 @@ where
370370
))
371371
}
372372

373+
/// Creates a DirectProbeFilter from an iterator of values.
374+
///
375+
/// This is useful when building the filter from pre-processed values
376+
/// (e.g., masked views for Utf8View).
377+
pub(crate) fn from_values(values: impl Iterator<Item = T::Native>) -> Self {
378+
// Collect into HashSet for deduplication
379+
let unique_values: HashSet<_> = values.collect();
380+
Self::from_values_inner(unique_values.into_iter(), 0)
381+
}
382+
373383
/// Internal constructor from deduplicated values
374384
fn from_values_inner(
375385
unique_values: impl Iterator<Item = T::Native>,

datafusion/physical-expr/src/expressions/in_list/result.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121
//! from IN list membership tests, handling null propagation correctly
2222
//! according to SQL three-valued logic.
2323
24-
#![expect(dead_code)]
25-
2624
use arrow::array::BooleanArray;
2725
use arrow::buffer::{BooleanBuffer, NullBuffer};
2826

datafusion/physical-expr/src/expressions/in_list/strategy.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ use super::primitive_filter::*;
2828
use super::result::handle_dictionary;
2929
use super::static_filter::StaticFilter;
3030
use super::transform::{
31-
make_bitmap_filter, make_branchless_filter, reinterpret_any_primitive_to,
31+
make_bitmap_filter, make_branchless_filter, make_byte_view_masked_filter,
32+
make_utf8view_branchless_filter, make_utf8view_hash_filter,
33+
reinterpret_any_primitive_to, utf8view_all_short_strings,
3234
};
3335

3436
// =============================================================================
@@ -104,6 +106,16 @@ pub(crate) fn instantiate_static_filter(
104106

105107
let len = in_array.len();
106108
let dt = in_array.data_type();
109+
110+
// Special case: Utf8View with short strings can be reinterpreted as i128
111+
if matches!(dt, DataType::Utf8View) && utf8view_all_short_strings(in_array.as_ref()) {
112+
return if len <= BRANCHLESS_MAX_16B {
113+
make_utf8view_branchless_filter(&in_array)
114+
} else {
115+
make_utf8view_hash_filter(&in_array)
116+
};
117+
}
118+
107119
let strategy = select_strategy(dt, len);
108120

109121
match (dt, strategy) {
@@ -124,6 +136,14 @@ pub(crate) fn instantiate_static_filter(
124136
exec_datafusion_err!("Hashed strategy selected but no filter for {:?}", dt)
125137
})?,
126138

139+
// Byte view filters (Utf8View, BinaryView)
140+
(DataType::Utf8View, Generic) => {
141+
make_byte_view_masked_filter::<StringViewType>(in_array)
142+
}
143+
(DataType::BinaryView, Generic) => {
144+
make_byte_view_masked_filter::<BinaryViewType>(in_array)
145+
}
146+
127147
// Fallback for nested/complex types and strings (Phase 4: Strings use fallback)
128148
(_, Generic) => Ok(Arc::new(NestedTypeFilter::try_new(in_array)?)),
129149
}

0 commit comments

Comments
 (0)