Skip to content

Commit 116ae12

Browse files
authored
[Arrow]Configure max deduplication length for StringView (#8990)
Configure max deduplication length when deduplicating strings while building the array

# Which issue does this PR close?

Configure max deduplication length when deduplicating strings while building the array

- Closes #7187.

# Rationale for this change

Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes.

# What changes are included in this PR?

There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR.

# Are these changes tested?

We typically require tests for all PRs in order to:

1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code

If tests are not included in your PR, please explain why (for example, are they covered by existing tests)?

# Are there any user-facing changes?

If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out.
1 parent 1cc659d commit 116ae12

File tree

1 file changed

+98
-27
lines changed

1 file changed

+98
-27
lines changed

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 98 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787
/// Some if deduplicating strings
8888
/// map `<string hash> -> <index to the views>`
8989
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
90+
max_deduplication_len: Option<u32>,
9091
phantom: PhantomData<T>,
9192
}
9293

@@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108
current_size: STARTING_BLOCK_SIZE,
108109
},
109110
string_tracker: None,
111+
max_deduplication_len: None,
110112
phantom: Default::default(),
111113
}
112114
}
113115

116+
/// Configure max deduplication length when deduplicating strings while building the array.
117+
/// Default is `None`, meaning there is no upper limit on the length of strings considered for deduplication.
118+
///
119+
/// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
120+
/// any strings longer than 12 bytes. However, since it takes time proportional to the length
121+
/// of the string to deduplicate, strings longer than this limit skip deduplication to bound the CPU overhead.
122+
pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self {
123+
debug_assert!(
124+
max_deduplication_len > 0,
125+
"max_deduplication_len must be greater than 0"
126+
);
127+
Self {
128+
max_deduplication_len: Some(max_deduplication_len),
129+
..self
130+
}
131+
}
132+
114133
/// Set a fixed buffer size for variable length strings
115134
///
116135
/// The block size is the size of the buffer used to store values greater
@@ -334,35 +353,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334353

335354
// Deduplication if:
336355
// (1) deduplication is enabled.
337-
// (2) len > 12
338-
if let Some((mut ht, hasher)) = self.string_tracker.take() {
339-
let hash_val = hasher.hash_one(v);
340-
let hasher_fn = |v: &_| hasher.hash_one(v);
341-
342-
let entry = ht.entry(
343-
hash_val,
344-
|idx| {
345-
let stored_value = self.get_value(*idx);
346-
v == stored_value
347-
},
348-
hasher_fn,
349-
);
350-
match entry {
351-
Entry::Occupied(occupied) => {
352-
// If the string already exists, we will directly use the view
353-
let idx = occupied.get();
354-
self.views_buffer.push(self.views_buffer[*idx]);
355-
self.null_buffer_builder.append_non_null();
356-
self.string_tracker = Some((ht, hasher));
357-
return Ok(());
358-
}
359-
Entry::Vacant(vacant) => {
360-
// o.w. we insert the (string hash -> view index)
361-
// the idx is current length of views_builder, as we are inserting a new view
362-
vacant.insert(self.views_buffer.len());
356+
// (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
357+
let can_deduplicate = self.string_tracker.is_some()
358+
&& self
359+
.max_deduplication_len
360+
.map(|max_length| length <= max_length)
361+
.unwrap_or(true);
362+
if can_deduplicate {
363+
if let Some((mut ht, hasher)) = self.string_tracker.take() {
364+
let hash_val = hasher.hash_one(v);
365+
let hasher_fn = |v: &_| hasher.hash_one(v);
366+
367+
let entry = ht.entry(
368+
hash_val,
369+
|idx| {
370+
let stored_value = self.get_value(*idx);
371+
v == stored_value
372+
},
373+
hasher_fn,
374+
);
375+
match entry {
376+
Entry::Occupied(occupied) => {
377+
// If the string already exists, we will directly use the view
378+
let idx = occupied.get();
379+
self.views_buffer.push(self.views_buffer[*idx]);
380+
self.null_buffer_builder.append_non_null();
381+
self.string_tracker = Some((ht, hasher));
382+
return Ok(());
383+
}
384+
Entry::Vacant(vacant) => {
385+
// o.w. we insert the (string hash -> view index)
386+
// the idx is the current length of `views_buffer`, as we are inserting a new view
387+
vacant.insert(self.views_buffer.len());
388+
}
363389
}
390+
self.string_tracker = Some((ht, hasher));
364391
}
365-
self.string_tracker = Some((ht, hasher));
366392
}
367393

368394
let required_cap = self.in_progress.len() + v.len();
@@ -636,8 +662,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636662
mod tests {
637663
use core::str;
638664

665+
use arrow_buffer::ArrowNativeType;
666+
639667
use super::*;
640668

669+
#[test]
670+
fn test_string_max_deduplication_len() {
671+
let value_1 = "short";
672+
let value_2 = "not so similar string but long";
673+
let value_3 = "1234567890123";
674+
675+
let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
676+
677+
let mut builder = StringViewBuilder::new()
678+
.with_deduplicate_strings()
679+
.with_max_deduplication_len(max_deduplication_len);
680+
681+
assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
682+
assert!(value_2.len() > max_deduplication_len.as_usize());
683+
assert!(
684+
value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
685+
&& value_3.len() < max_deduplication_len.as_usize()
686+
);
687+
688+
// append value1 (short), expect it is inlined and not deduplicated
689+
builder.append_value(value_1); // view 0
690+
builder.append_value(value_1); // view 1
691+
// append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len
692+
builder.append_value(value_2); // view 2
693+
builder.append_value(value_2); // view 3
694+
// append value3, expect second copy is deduplicated
695+
builder.append_value(value_3); // view 4
696+
builder.append_value(value_3); // view 5
697+
698+
let array = builder.finish();
699+
700+
// verify
701+
let v2 = ByteView::from(array.views()[2]);
702+
let v3 = ByteView::from(array.views()[3]);
703+
assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer
704+
assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated
705+
706+
let v4 = ByteView::from(array.views()[4]);
707+
let v5 = ByteView::from(array.views()[5]);
708+
assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer
709+
assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
710+
}
711+
641712
#[test]
642713
fn test_string_view_deduplicate() {
643714
let value_1 = "long string to test string view";

0 commit comments

Comments
 (0)