Skip to content

Commit ec9d962

Browse files
authored
feat: Add optimize ArrayOp with VBView implementation (#3825)
Adds a new OperationVTable method `optimize` which will return a new array with unused data garbage-collected. This is similar to the arrow-rs `StringViewArray::gc` method. See doc comments in the PR for more info. This implements a simple optimize operation that will fully rebuild a VarBinViewArray if there is any wasted space in the buffers. Signed-off-by: Andrew Duffy <[email protected]>
1 parent be9c2fd commit ec9d962

File tree

4 files changed

+308
-22
lines changed

4 files changed

+308
-22
lines changed

vortex-array/src/array/mod.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor
6262
/// Fetch the scalar at the given index.
6363
fn scalar_at(&self, index: usize) -> VortexResult<Scalar>;
6464

65+
/// Return an optimized version of the same array.
66+
///
67+
/// See [`OperationsVTable::optimize`] for more details.
68+
fn optimize(&self) -> VortexResult<ArrayRef>;
69+
6570
/// Returns whether the array is of the given encoding.
6671
fn is_encoding(&self, encoding: EncodingId) -> bool {
6772
self.encoding_id() == encoding
@@ -183,6 +188,10 @@ impl Array for Arc<dyn Array> {
183188
self.as_ref().scalar_at(index)
184189
}
185190

191+
fn optimize(&self) -> VortexResult<ArrayRef> {
192+
self.as_ref().optimize()
193+
}
194+
186195
fn is_valid(&self, index: usize) -> VortexResult<bool> {
187196
self.as_ref().is_valid(index)
188197
}
@@ -430,6 +439,39 @@ impl<V: VTable> Array for ArrayAdapter<V> {
430439
Ok(scalar)
431440
}
432441

442+
fn optimize(&self) -> VortexResult<ArrayRef> {
443+
let result = <V::OperationsVTable as OperationsVTable<V>>::optimize(&self.0)?.into_array();
444+
445+
#[cfg(debug_assertions)]
446+
{
447+
let nbytes = self.0.nbytes();
448+
let result_nbytes = result.nbytes();
449+
assert!(
450+
result_nbytes <= nbytes,
451+
"optimize() made the array larger: {} bytes -> {} bytes",
452+
nbytes,
453+
result_nbytes
454+
);
455+
}
456+
457+
assert_eq!(
458+
self.dtype(),
459+
result.dtype(),
460+
"optimize() changed DType from {} to {}",
461+
self.dtype(),
462+
result.dtype()
463+
);
464+
assert_eq!(
465+
result.len(),
466+
self.len(),
467+
"optimize() changed len from {} to {}",
468+
self.len(),
469+
result.len()
470+
);
471+
472+
Ok(result)
473+
}
474+
433475
fn is_valid(&self, index: usize) -> VortexResult<bool> {
434476
if index >= self.len() {
435477
vortex_bail!(OutOfBounds: index, 0, self.len());

vortex-array/src/arrays/varbinview/ops.rs

Lines changed: 219 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
use vortex_error::VortexResult;
4+
use vortex_error::{VortexResult, VortexUnwrap};
55
use vortex_scalar::Scalar;
66

77
use crate::arrays::{VarBinViewArray, VarBinViewVTable, varbin_scalar};
8+
use crate::builders::{ArrayBuilder, VarBinViewBuilder};
9+
use crate::validity::Validity;
810
use crate::vtable::{OperationsVTable, ValidityHelper};
911
use crate::{ArrayRef, IntoArray};
1012

@@ -24,4 +26,220 @@ impl OperationsVTable<VarBinViewVTable> for VarBinViewVTable {
2426
fn scalar_at(array: &VarBinViewArray, index: usize) -> VortexResult<Scalar> {
2527
Ok(varbin_scalar(array.bytes_at(index), array.dtype()))
2628
}
29+
30+
fn optimize(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
31+
// If there is nothing to be gained by compaction, return the original array untouched.
32+
if !should_compact(array) {
33+
return Ok(array.clone());
34+
}
35+
36+
// Compaction pathways, depend on the validity
37+
match array.validity {
38+
// The array contains no values, all buffers can be dropped.
39+
Validity::AllInvalid => Ok(VarBinViewArray::try_new(
40+
array.views().clone(),
41+
vec![],
42+
array.dtype().clone(),
43+
array.validity().clone(),
44+
)?),
45+
// Non-null pathway
46+
Validity::NonNullable | Validity::AllValid => rebuild_nonnull(array),
47+
// Nullable pathway, requires null-checks for each value
48+
Validity::Array(_) => rebuild_nullable(array),
49+
}
50+
}
51+
}
52+
53+
fn should_compact(array: &VarBinViewArray) -> bool {
54+
// If the array is entirely inlined strings, do not attempt to compact.
55+
if array.nbuffers() == 0 {
56+
return false;
57+
}
58+
59+
let bytes_referenced: u64 = count_referenced_bytes(array);
60+
let buffer_total_bytes: u64 = array.buffers.iter().map(|buf| buf.len() as u64).sum();
61+
62+
// If there is any wasted space, we want to repack.
63+
// This is very aggressive.
64+
bytes_referenced < buffer_total_bytes
65+
}
66+
67+
// count the number of bytes addressed by the views, not including null
68+
// values or any inlined strings.
69+
fn count_referenced_bytes(array: &VarBinViewArray) -> u64 {
70+
match array.validity() {
71+
Validity::AllInvalid => 0u64,
72+
_ => {
73+
array
74+
.views()
75+
.iter()
76+
.enumerate()
77+
.map(|(idx, &view)| {
78+
if !array.is_valid(idx).vortex_unwrap() || view.is_inlined() {
79+
0u64
80+
} else {
81+
// SAFETY: in this branch the view is not inlined.
82+
unsafe { view._ref }.size as u64
83+
}
84+
})
85+
.sum()
86+
}
87+
}
88+
}
89+
90+
// Nullable string array compaction pathway.
91+
// This requires a null check on every append.
92+
fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
93+
let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
94+
for i in 0..array.len() {
95+
if !array.is_valid(i)? {
96+
builder.append_null();
97+
} else {
98+
let bytes = array.bytes_at(i);
99+
builder.append_value(bytes.as_slice());
100+
}
101+
}
102+
103+
Ok(builder.finish_into_varbinview())
104+
}
105+
106+
// Compaction for string arrays that contain no null values. Saves a branch
107+
// for every string element.
108+
fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
109+
let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
110+
for i in 0..array.len() {
111+
builder.append_value(array.bytes_at(i).as_ref());
112+
}
113+
Ok(builder.finish_into_varbinview())
114+
}
115+
116+
#[cfg(test)]
117+
mod tests {
118+
use vortex_buffer::buffer;
119+
120+
use crate::IntoArray;
121+
use crate::arrays::{VarBinViewArray, VarBinViewVTable};
122+
use crate::compute::take;
123+
124+
#[test]
125+
fn test_optimize_compacts_buffers() {
126+
// Create a VarBinViewArray with some long strings that will create multiple buffers
127+
let original = VarBinViewArray::from_iter_nullable_str([
128+
Some("short"),
129+
Some("this is a longer string that will be stored in a buffer"),
130+
Some("medium length string"),
131+
Some("another very long string that definitely needs a buffer to store it"),
132+
Some("tiny"),
133+
]);
134+
135+
// Verify it has buffers
136+
assert!(original.nbuffers() > 0);
137+
let original_buffers = original.nbuffers();
138+
139+
// Take only the first and last elements (indices 0 and 4)
140+
let indices = buffer![0u32, 4u32].into_array();
141+
let taken = take(original.as_ref(), &indices).unwrap();
142+
let taken_array = taken.as_::<VarBinViewVTable>();
143+
144+
// The taken array should still have the same number of buffers
145+
assert_eq!(taken_array.nbuffers(), original_buffers);
146+
147+
// Now optimize the taken array
148+
let optimized = taken_array.optimize().unwrap();
149+
let optimized_array = optimized.as_::<VarBinViewVTable>();
150+
151+
// The optimized array should have compacted buffers
152+
// Since both remaining strings are short, they should be inlined
153+
// so we might have 0 buffers, or 1 buffer if any were not inlined
154+
assert!(optimized_array.nbuffers() <= 1);
155+
156+
// Verify the data is still correct
157+
assert_eq!(optimized_array.len(), 2);
158+
assert_eq!(optimized_array.scalar_at(0).unwrap(), "short".into());
159+
assert_eq!(optimized_array.scalar_at(1).unwrap(), "tiny".into());
160+
}
161+
162+
#[test]
163+
fn test_optimize_with_long_strings() {
164+
// Create strings that are definitely longer than 12 bytes
165+
let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
166+
let long_string_2 = "another extremely long string that also needs external buffer storage";
167+
let long_string_3 = "yet another long string for testing buffer compaction functionality";
168+
169+
let original = VarBinViewArray::from_iter_str([
170+
long_string_1,
171+
long_string_2,
172+
long_string_3,
173+
"short1",
174+
"short2",
175+
]);
176+
177+
// Take only the first and third long strings (indices 0 and 2)
178+
let indices = buffer![0u32, 2u32].into_array();
179+
let taken = take(original.as_ref(), &indices).unwrap();
180+
let taken_array = taken.as_::<VarBinViewVTable>();
181+
182+
// Optimize the taken array
183+
let optimized = taken_array.optimize().unwrap();
184+
let optimized_array = optimized.as_::<VarBinViewVTable>();
185+
186+
// The optimized array should have exactly 1 buffer (consolidated)
187+
assert_eq!(optimized_array.nbuffers(), 1);
188+
189+
// Verify the data is still correct
190+
assert_eq!(optimized_array.len(), 2);
191+
assert_eq!(optimized_array.scalar_at(0).unwrap(), long_string_1.into());
192+
assert_eq!(optimized_array.scalar_at(1).unwrap(), long_string_3.into());
193+
}
194+
195+
#[test]
196+
fn test_optimize_no_buffers() {
197+
// Create an array with only short strings (all inlined)
198+
let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
199+
200+
// This should have no buffers
201+
assert_eq!(original.nbuffers(), 0);
202+
203+
// Optimize should return the same array
204+
let optimized = original.optimize().unwrap();
205+
let optimized_array = optimized.as_::<VarBinViewVTable>();
206+
207+
assert_eq!(optimized_array.nbuffers(), 0);
208+
assert_eq!(optimized_array.len(), 4);
209+
210+
// Verify all values are preserved
211+
for i in 0..4 {
212+
assert_eq!(
213+
optimized_array.scalar_at(i).unwrap(),
214+
original.scalar_at(i).unwrap()
215+
);
216+
}
217+
}
218+
219+
#[test]
220+
fn test_optimize_single_buffer() {
221+
// Create an array that naturally has only one buffer
222+
let str1 = "this is a long string that goes into a buffer";
223+
let str2 = "another long string in the same buffer";
224+
let original = VarBinViewArray::from_iter_str([str1, str2]);
225+
226+
// Should have 1 compact buffer
227+
assert_eq!(original.nbuffers(), 1);
228+
assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
229+
230+
// Optimize should return the same array (no change needed)
231+
let optimized = original.optimize().unwrap();
232+
let optimized_array = optimized.as_::<VarBinViewVTable>();
233+
234+
assert_eq!(optimized_array.nbuffers(), 1);
235+
assert_eq!(optimized_array.len(), 2);
236+
237+
// Verify all values are preserved
238+
for i in 0..2 {
239+
assert_eq!(
240+
optimized_array.scalar_at(i).unwrap(),
241+
original.scalar_at(i).unwrap()
242+
);
243+
}
244+
}
27245
}

vortex-array/src/builders/varbinview.rs

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,31 @@ impl VarBinViewBuilder {
120120
debug_assert_eq!(self.null_buffer_builder.len(), self.views_builder.len())
121121
}
122122

123+
pub fn finish_into_varbinview(&mut self) -> VarBinViewArray {
124+
self.flush_in_progress();
125+
let buffers = std::mem::take(&mut self.completed);
126+
127+
assert_eq!(
128+
self.views_builder.len(),
129+
self.null_buffer_builder.len(),
130+
"View and validity length must match"
131+
);
132+
133+
let validity = self
134+
.null_buffer_builder
135+
.finish_with_nullability(self.nullability);
136+
137+
VarBinViewArray::try_new(
138+
std::mem::take(&mut self.views_builder).freeze(),
139+
buffers,
140+
std::mem::replace(&mut self.dtype, DType::Null),
141+
validity,
142+
)
143+
.vortex_expect("VarBinViewArray components should be valid.")
144+
}
145+
}
146+
147+
impl VarBinViewBuilder {
123148
// Pushes a validity mask into the builder not affecting the views or buffers
124149
fn push_only_validity_mask(&mut self, validity_mask: Mask) {
125150
self.null_buffer_builder.append_validity_mask(validity_mask);
@@ -191,27 +216,7 @@ impl ArrayBuilder for VarBinViewBuilder {
191216
}
192217

193218
fn finish(&mut self) -> ArrayRef {
194-
self.flush_in_progress();
195-
let buffers = std::mem::take(&mut self.completed);
196-
197-
assert_eq!(
198-
self.views_builder.len(),
199-
self.null_buffer_builder.len(),
200-
"View and validity length must match"
201-
);
202-
203-
let validity = self
204-
.null_buffer_builder
205-
.finish_with_nullability(self.nullability);
206-
207-
VarBinViewArray::try_new(
208-
std::mem::take(&mut self.views_builder).freeze(),
209-
buffers,
210-
std::mem::replace(&mut self.dtype, DType::Null),
211-
validity,
212-
)
213-
.vortex_expect("VarBinViewArray components should be valid.")
214-
.into_array()
219+
self.finish_into_varbinview().into_array()
215220
}
216221
}
217222

vortex-array/src/vtable/operations.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,25 @@ pub trait OperationsVTable<V: VTable> {
2828
/// Bounds-checking has already been performed by the time this function is called,
2929
/// and the index is guaranteed to be non-null.
3030
fn scalar_at(array: &V::Array, index: usize) -> VortexResult<Scalar>;
31+
32+
/// Return an optimized copy of an Array, with any unreferenced data blocks freed and
33+
/// any extraneous information removed.
34+
///
35+
/// Many simple contiguous array types do not benefit from this operation, but it is
36+
/// especially useful for variable-length types which keep a variable number of buffers
37+
/// that can be dereferenced by other internal data structures via simple zero-copy
38+
/// filter and take operations. After multiple operations are applied to these arrays, it is
39+
/// common for the majority of owned buffer data to no longer logically be referenced.
40+
///
41+
/// This operation can be called to return a new copy of an array with the same encoding,
42+
/// but with all unreferenced data unlinked.
43+
///
44+
/// ## Default behavior
45+
///
46+
/// For most arrays that do not contain variable buffer counts, such as the canonical
47+
/// arrays, the default implementation will not attempt to perform compaction and instead
48+
/// return the original array.
49+
fn optimize(array: &V::Array) -> VortexResult<V::Array> {
50+
Ok(array.clone())
51+
}
3152
}

0 commit comments

Comments
 (0)