Skip to content

Commit 3f33d3e

Browse files
committed
fix incorrect memory accounting for StringViewArray
1 parent 99daafd commit 3f33d3e

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed

datafusion/physical-plan/src/spill/spill_manager.rs

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
//! Define the `SpillManager` struct, which is responsible for reading and writing `RecordBatch`es to raw files based on the provided configurations.
1919
20+
use arrow::array::StringViewArray;
2021
use arrow::datatypes::SchemaRef;
2122
use arrow::record_batch::RecordBatch;
2223
use datafusion_execution::runtime_env::RuntimeEnv;
@@ -185,6 +186,8 @@ impl SpillManager {
185186

186187
/// Computes the memory footprint of a (possibly sliced) `RecordBatch`,
/// used by the spill manager to account for batches before spilling.
pub(crate) trait GetSlicedSize {
    /// Returns the size of the `RecordBatch` when sliced.
    /// Note: if multiple arrays or even a single array share the same data buffers, we may
    /// double-count each buffer. Therefore, make sure we call gc() or
    /// organize_stringview_arrays() before using this method.
    fn get_sliced_size(&self) -> Result<usize>;
}

@@ -194,7 +197,83 @@ impl GetSlicedSize for RecordBatch {
194197
for array in self.columns() {
195198
let data = array.to_data();
196199
total += data.get_slice_memory_size()?;
200+
201+
// While StringViewArray holds large data buffer for non inlined string, the Arrow layout (BufferSpec)
202+
// does not include any data buffers. Currently, ArrayData::get_slice_memory_size()
203+
// under-counts memory size by accounting only views buffer although data buffer is cloned during slice()
204+
//
205+
// Therefore, we manually add the sum of the lengths used by all non inlined views
206+
// on top of the sliced size for views buffer. This matches the intended semantics of
207+
// "bytes needed if we materialized exactly this slice into fresh buffers".
208+
// This is a workaround until https://github.com/apache/arrow-rs/issues/8230
209+
if let Some(sv) = array.as_any().downcast_ref::<StringViewArray>() {
210+
for buffer in sv.data_buffers() {
211+
total += buffer.capacity();
212+
}
213+
}
197214
}
198215
Ok(total)
199216
}
200217
}
218+
219+
#[cfg(test)]
mod tests {
    use crate::spill::{get_record_batch_memory_size, spill_manager::GetSlicedSize};
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::{
        array::{ArrayRef, StringViewArray},
        record_batch::RecordBatch,
    };
    use datafusion_common::Result;
    use std::sync::Arc;

    /// Sliced-size accounting for `StringViewArray`: equal to the full batch
    /// memory size when nothing is sliced, and strictly between the
    /// views-buffer-only size and the full memory size once the batch is sliced.
    #[test]
    fn check_sliced_size_for_string_view_array() -> Result<()> {
        let num_rows = 50;
        let inline_len = 8; // short enough to be stored inline in the view
        let spilled_len = 25; // long enough to require a data buffer

        // Alternate short and long values so the array contains both inline
        // and non-inlined strings.
        let mut values = Vec::with_capacity(num_rows);
        for row in 0..num_rows {
            let value = if row % 2 == 0 {
                "a".repeat(inline_len)
            } else {
                "b".repeat(spilled_len)
            };
            values.push(value);
        }

        let column: ArrayRef = Arc::new(StringViewArray::from(values));
        let schema = Schema::new(vec![Field::new("strings", DataType::Utf8View, false)]);
        let batch = RecordBatch::try_new(Arc::new(schema), vec![column]).unwrap();

        // Unsliced batch: both accounting methods must agree exactly.
        assert_eq!(
            batch.get_sliced_size().unwrap(),
            get_record_batch_memory_size(&batch)
        );

        // Slice the batch in half: the views buffer shrinks, so the sliced
        // size drops below the full memory size of the original buffers...
        let half_batch = batch.slice(0, num_rows / 2);
        assert!(
            half_batch.get_sliced_size().unwrap() < get_record_batch_memory_size(&half_batch)
        );

        // ...while still exceeding the sliced views-buffer size alone, because
        // the retained data buffers are counted on top of it.
        let data = arrow::array::Array::to_data(&half_batch.column(0));
        let views_sliced_size = data.get_slice_memory_size()?;
        assert!(views_sliced_size < half_batch.get_sliced_size().unwrap());

        Ok(())
    }
}

0 commit comments

Comments
 (0)