//! Define the `SpillManager` struct, which is responsible for reading and writing
//! `RecordBatch`es to raw files based on the provided configurations.
20+ use arrow:: array:: StringViewArray ;
2021use arrow:: datatypes:: SchemaRef ;
2122use arrow:: record_batch:: RecordBatch ;
2223use datafusion_execution:: runtime_env:: RuntimeEnv ;
@@ -185,6 +186,8 @@ impl SpillManager {
185186
186187pub ( crate ) trait GetSlicedSize {
187188 /// Returns the size of the `RecordBatch` when sliced.
189+ /// Note: if multiple arrays or even a single array share the same data buffers, we may double count each buffer.
190+ /// Therefore, make sure we call gc() or organize_stringview_arrays() before using this method.
188191 fn get_sliced_size ( & self ) -> Result < usize > ;
189192}
190193
@@ -194,7 +197,83 @@ impl GetSlicedSize for RecordBatch {
194197 for array in self . columns ( ) {
195198 let data = array. to_data ( ) ;
196199 total += data. get_slice_memory_size ( ) ?;
200+
201+ // While StringViewArray holds large data buffer for non inlined string, the Arrow layout (BufferSpec)
202+ // does not include any data buffers. Currently, ArrayData::get_slice_memory_size()
203+ // under-counts memory size by accounting only views buffer although data buffer is cloned during slice()
204+ //
205+ // Therefore, we manually add the sum of the lengths used by all non inlined views
206+ // on top of the sliced size for views buffer. This matches the intended semantics of
207+ // "bytes needed if we materialized exactly this slice into fresh buffers".
208+ // This is a workaround until https://github.com/apache/arrow-rs/issues/8230
209+ if let Some ( sv) = array. as_any ( ) . downcast_ref :: < StringViewArray > ( ) {
210+ for buffer in sv. data_buffers ( ) {
211+ total += buffer. capacity ( ) ;
212+ }
213+ }
197214 }
198215 Ok ( total)
199216 }
200217}
218+
#[cfg(test)]
mod tests {
    use crate::spill::{get_record_batch_memory_size, spill_manager::GetSlicedSize};
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::{
        array::{ArrayRef, StringViewArray},
        record_batch::RecordBatch,
    };
    use datafusion_common::Result;
    use std::sync::Arc;

    #[test]
    fn check_sliced_size_for_string_view_array() -> Result<()> {
        const ARRAY_LEN: usize = 50;
        const SHORT_LEN: usize = 8; // short enough to be stored inline in the view
        const LONG_LEN: usize = 25; // long enough to spill into a data buffer

        // Build a StringViewArray that alternates inline strings and
        // non-inlined strings, so both representations are exercised.
        let values: Vec<String> = (0..ARRAY_LEN)
            .map(|i| {
                if i % 2 == 0 {
                    "a".repeat(SHORT_LEN)
                } else {
                    "b".repeat(LONG_LEN)
                }
            })
            .collect();

        let schema = Arc::new(Schema::new(vec![Field::new(
            "strings",
            DataType::Utf8View,
            false,
        )]));
        let array: ArrayRef = Arc::new(StringViewArray::from(values));
        let batch = RecordBatch::try_new(schema, vec![array]).unwrap();

        // The batch is unsliced, so both size estimates must agree exactly.
        assert_eq!(
            batch.get_sliced_size().unwrap(),
            get_record_batch_memory_size(&batch)
        );

        // Slice the batch in half: the sliced size becomes smaller than the
        // full memory size because the views buffer is sliced.
        let half_batch = batch.slice(0, ARRAY_LEN / 2);
        assert!(
            half_batch.get_sliced_size().unwrap()
                < get_record_batch_memory_size(&half_batch)
        );

        // ...but it must still exceed the views-buffer-only size, since the
        // data buffers backing non-inlined strings are added on top.
        let data = arrow::array::Array::to_data(&half_batch.column(0));
        let views_sliced_size = data.get_slice_memory_size()?;
        assert!(views_sliced_size < half_batch.get_sliced_size().unwrap());

        Ok(())
    }
}
0 commit comments