11// SPDX-License-Identifier: Apache-2.0
22// SPDX-FileCopyrightText: Copyright the Vortex contributors
33
4- use vortex_error:: VortexResult ;
4+ use vortex_error:: { VortexResult , VortexUnwrap } ;
55use vortex_scalar:: Scalar ;
66
77use crate :: arrays:: { VarBinViewArray , VarBinViewVTable , varbin_scalar} ;
8+ use crate :: builders:: { ArrayBuilder , VarBinViewBuilder } ;
9+ use crate :: validity:: Validity ;
810use crate :: vtable:: { OperationsVTable , ValidityHelper } ;
911use crate :: { ArrayRef , IntoArray } ;
1012
@@ -24,4 +26,220 @@ impl OperationsVTable<VarBinViewVTable> for VarBinViewVTable {
2426 fn scalar_at ( array : & VarBinViewArray , index : usize ) -> VortexResult < Scalar > {
2527 Ok ( varbin_scalar ( array. bytes_at ( index) , array. dtype ( ) ) )
2628 }
29+
30+ fn optimize ( array : & VarBinViewArray ) -> VortexResult < VarBinViewArray > {
31+ // If there is nothing to be gained by compaction, return the original array untouched.
32+ if !should_compact ( array) {
33+ return Ok ( array. clone ( ) ) ;
34+ }
35+
36+ // Compaction pathways, depend on the validity
37+ match array. validity {
38+ // The array contains no values, all buffers can be dropped.
39+ Validity :: AllInvalid => Ok ( VarBinViewArray :: try_new (
40+ array. views ( ) . clone ( ) ,
41+ vec ! [ ] ,
42+ array. dtype ( ) . clone ( ) ,
43+ array. validity ( ) . clone ( ) ,
44+ ) ?) ,
45+ // Non-null pathway
46+ Validity :: NonNullable | Validity :: AllValid => rebuild_nonnull ( array) ,
47+ // Nullable pathway, requires null-checks for each value
48+ Validity :: Array ( _) => rebuild_nullable ( array) ,
49+ }
50+ }
51+ }
52+
53+ fn should_compact ( array : & VarBinViewArray ) -> bool {
54+ // If the array is entirely inlined strings, do not attempt to compact.
55+ if array. nbuffers ( ) == 0 {
56+ return false ;
57+ }
58+
59+ let bytes_referenced: u64 = count_referenced_bytes ( array) ;
60+ let buffer_total_bytes: u64 = array. buffers . iter ( ) . map ( |buf| buf. len ( ) as u64 ) . sum ( ) ;
61+
62+ // If there is any wasted space, we want to repack.
63+ // This is very aggressive.
64+ bytes_referenced < buffer_total_bytes
65+ }
66+
67+ // count the number of bytes addressed by the views, not including null
68+ // values or any inlined strings.
69+ fn count_referenced_bytes ( array : & VarBinViewArray ) -> u64 {
70+ match array. validity ( ) {
71+ Validity :: AllInvalid => 0u64 ,
72+ _ => {
73+ array
74+ . views ( )
75+ . iter ( )
76+ . enumerate ( )
77+ . map ( |( idx, & view) | {
78+ if !array. is_valid ( idx) . vortex_unwrap ( ) || view. is_inlined ( ) {
79+ 0u64
80+ } else {
81+ // SAFETY: in this branch the view is not inlined.
82+ unsafe { view. _ref } . size as u64
83+ }
84+ } )
85+ . sum ( )
86+ }
87+ }
88+ }
89+
90+ // Nullable string array compaction pathway.
91+ // This requires a null check on every append.
92+ fn rebuild_nullable ( array : & VarBinViewArray ) -> VortexResult < VarBinViewArray > {
93+ let mut builder = VarBinViewBuilder :: with_capacity ( array. dtype ( ) . clone ( ) , array. len ( ) ) ;
94+ for i in 0 ..array. len ( ) {
95+ if !array. is_valid ( i) ? {
96+ builder. append_null ( ) ;
97+ } else {
98+ let bytes = array. bytes_at ( i) ;
99+ builder. append_value ( bytes. as_slice ( ) ) ;
100+ }
101+ }
102+
103+ Ok ( builder. finish_into_varbinview ( ) )
104+ }
105+
106+ // Compaction for string arrays that contain no null values. Saves a branch
107+ // for every string element.
108+ fn rebuild_nonnull ( array : & VarBinViewArray ) -> VortexResult < VarBinViewArray > {
109+ let mut builder = VarBinViewBuilder :: with_capacity ( array. dtype ( ) . clone ( ) , array. len ( ) ) ;
110+ for i in 0 ..array. len ( ) {
111+ builder. append_value ( array. bytes_at ( i) . as_ref ( ) ) ;
112+ }
113+ Ok ( builder. finish_into_varbinview ( ) )
114+ }
115+
#[cfg(test)]
mod tests {
    use vortex_buffer::buffer;

    use crate::IntoArray;
    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
    use crate::compute::take;

    // Taking only the short strings out of a buffer-backed array and then
    // optimizing should leave at most one buffer behind.
    #[test]
    fn test_optimize_compacts_buffers() {
        // A mix of short and long strings; the long ones force buffer storage.
        let source = VarBinViewArray::from_iter_nullable_str([
            Some("short"),
            Some("this is a longer string that will be stored in a buffer"),
            Some("medium length string"),
            Some("another very long string that definitely needs a buffer to store it"),
            Some("tiny"),
        ]);

        // The source must be backed by at least one buffer.
        let buffers_before = source.nbuffers();
        assert!(buffers_before > 0);

        // Keep just the first and last elements (positions 0 and 4).
        let selection = buffer![0u32, 4u32].into_array();
        let picked = take(source.as_ref(), &selection).unwrap();
        let picked_view = picked.as_::<VarBinViewVTable>();

        // `take` alone is expected to leave the buffer count unchanged.
        assert_eq!(picked_view.nbuffers(), buffers_before);

        // Run the optimization on the reduced array.
        let compacted = picked_view.optimize().unwrap();
        let compacted_view = compacted.as_::<VarBinViewVTable>();

        // Both surviving strings are short and should be inlined, so either
        // zero buffers remain, or one if anything was not inlined.
        assert!(compacted_view.nbuffers() <= 1);

        // The contents must be unchanged.
        assert_eq!(compacted_view.len(), 2);
        assert_eq!(compacted_view.scalar_at(0).unwrap(), "short".into());
        assert_eq!(compacted_view.scalar_at(1).unwrap(), "tiny".into());
    }

    // Long strings that survive a take should end up in exactly one buffer.
    #[test]
    fn test_optimize_with_long_strings() {
        // Each of these is well over 12 bytes long.
        let first_long = "this is definitely a very long string that exceeds the inline limit";
        let second_long = "another extremely long string that also needs external buffer storage";
        let third_long = "yet another long string for testing buffer compaction functionality";

        let source = VarBinViewArray::from_iter_str([
            first_long,
            second_long,
            third_long,
            "short1",
            "short2",
        ]);

        // Keep the first and third long strings (positions 0 and 2).
        let selection = buffer![0u32, 2u32].into_array();
        let picked = take(source.as_ref(), &selection).unwrap();
        let picked_view = picked.as_::<VarBinViewVTable>();

        // Run the optimization on the reduced array.
        let compacted = picked_view.optimize().unwrap();
        let compacted_view = compacted.as_::<VarBinViewVTable>();

        // The remaining data should be consolidated into a single buffer.
        assert_eq!(compacted_view.nbuffers(), 1);

        // The contents must be unchanged.
        assert_eq!(compacted_view.len(), 2);
        assert_eq!(compacted_view.scalar_at(0).unwrap(), first_long.into());
        assert_eq!(compacted_view.scalar_at(1).unwrap(), third_long.into());
    }

    // An array with no buffers at all must pass through unchanged.
    #[test]
    fn test_optimize_no_buffers() {
        // Only short strings, all of which are inlined.
        let source = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);

        // No buffers are expected for this input.
        assert_eq!(source.nbuffers(), 0);

        // Optimizing should yield an equivalent array.
        let compacted = source.optimize().unwrap();
        let compacted_view = compacted.as_::<VarBinViewVTable>();

        assert_eq!(compacted_view.nbuffers(), 0);
        assert_eq!(compacted_view.len(), 4);

        // Every value must match the source.
        (0..4).for_each(|idx| {
            assert_eq!(
                compacted_view.scalar_at(idx).unwrap(),
                source.scalar_at(idx).unwrap()
            );
        });
    }

    // An array whose single buffer is already fully used needs no change.
    #[test]
    fn test_optimize_single_buffer() {
        // Two long strings that land in the same buffer.
        let first = "this is a long string that goes into a buffer";
        let second = "another long string in the same buffer";
        let source = VarBinViewArray::from_iter_str([first, second]);

        // Exactly one buffer, holding exactly the bytes of both strings.
        assert_eq!(source.nbuffers(), 1);
        assert_eq!(source.buffer(0).len(), first.len() + second.len());

        // Optimizing should yield an equivalent array (nothing to reclaim).
        let compacted = source.optimize().unwrap();
        let compacted_view = compacted.as_::<VarBinViewVTable>();

        assert_eq!(compacted_view.nbuffers(), 1);
        assert_eq!(compacted_view.len(), 2);

        // Every value must match the source.
        (0..2).for_each(|idx| {
            assert_eq!(
                compacted_view.scalar_at(idx).unwrap(),
                source.scalar_at(idx).unwrap()
            );
        });
    }
}
0 commit comments