@@ -24,7 +24,7 @@ use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
-use crate::common::{spawn_buffered, IPCWriter};
+use crate::common::spawn_buffered;
 use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType};
 use crate::expressions::PhysicalSortExpr;
 use crate::limit::LimitStream;
@@ -44,7 +44,9 @@ use crate::{
     Statistics,
 };
 
-use arrow::array::{Array, RecordBatch, RecordBatchOptions, UInt32Array};
+use arrow::array::{
+    Array, RecordBatch, RecordBatchOptions, StringViewArray, UInt32Array,
+};
 use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn};
 use arrow::datatypes::{DataType, SchemaRef};
 use arrow::row::{RowConverter, SortField};
@@ -300,6 +302,7 @@ impl ExternalSorter {
         if input.num_rows() == 0 {
             return Ok(());
         }
+
         self.reserve_memory_for_merge()?;
 
         let size = get_reserved_byte_for_record_batch(&input);
@@ -397,6 +400,8 @@ impl ExternalSorter {
             return Ok(0);
         }
 
+        self.organize_stringview_arrays()?;
+
         debug!("Spilling sort data of ExternalSorter to disk whilst inserting");
 
         let spill_file = self.runtime.disk_manager.create_tmp_file("Sorting")?;
@@ -414,6 +419,69 @@ impl ExternalSorter {
         Ok(used)
     }
 
+    /// Reconstruct `self.in_mem_batches` to organize the payload buffers of each
+    /// `StringViewArray` in sequential order by calling `gc()` on them.
+    ///
+    /// Note this is a workaround until <https://github.com/apache/arrow-rs/issues/7185> is
+    /// available.
+    ///
+    /// # Rationale
+    /// After (merge-based) sorting, all batches will be sorted into a single run,
+    /// but physically this sorted run is chunked into many small batches. For
+    /// `StringViewArray`s inside each sorted run, their inner buffers are not
+    /// reconstructed by default, leading to non-sequential payload locations
+    /// (permuted by the `interleave()` Arrow kernel). A single payload buffer might
+    /// be shared by multiple `RecordBatch`es.
+    /// When spilling each batch to disk, the writer has to write out every buffer the
+    /// batch references, because spilled batches are read back one at a time to keep
+    /// memory usage low. This causes extra disk reads and writes, and potentially execution failure.
+    ///
+    /// # Example
+    /// Before sorting:
+    /// batch1 -> buffer1
+    /// batch2 -> buffer2
+    ///
+    /// sorted_batch1 -> buffer1
+    ///               -> buffer2
+    /// sorted_batch2 -> buffer1
+    ///               -> buffer2
+    ///
+    /// Then when spilling each batch, the writer has to write all referenced buffers
+    /// repeatedly.
+    fn organize_stringview_arrays(&mut self) -> Result<()> {
+        let mut organized_batches = Vec::with_capacity(self.in_mem_batches.len());
+
+        for batch in self.in_mem_batches.drain(..) {
+            let mut new_columns: Vec<Arc<dyn Array>> =
+                Vec::with_capacity(batch.num_columns());
+
+            let mut arr_mutated = false;
+            for array in batch.columns() {
+                if let Some(string_view_array) =
+                    array.as_any().downcast_ref::<StringViewArray>()
+                {
+                    let new_array = string_view_array.gc();
+                    new_columns.push(Arc::new(new_array));
+                    arr_mutated = true;
+                } else {
+                    new_columns.push(Arc::clone(array));
+                }
+            }
+
+            let organized_batch = if arr_mutated {
+                RecordBatch::try_new(batch.schema(), new_columns)?
+            } else {
+                batch
+            };
+
+            organized_batches.push(organized_batch);
+        }
+
+        self.in_mem_batches = organized_batches;
+
+        Ok(())
+    }
+
     /// Sorts the in_mem_batches in place
     ///
     /// Sorting may have freed memory, especially if fetch is `Some`. If
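For readers unfamiliar with `StringViewArray::gc()`, the following standalone sketch illustrates the compaction the new method relies on. It is not part of the patch; the array contents and printed buffer counts are made up for illustration, using only arrow-rs APIs referenced above (`interleave`, `gc`, `data_buffers`).

    use arrow::array::{Array, StringViewArray};
    use arrow::compute::interleave;

    fn main() {
        // Two inputs, each backed by its own payload buffer (the strings are longer
        // than 12 bytes, so they live in payload buffers rather than inline views).
        let a = StringViewArray::from(vec!["a payload string long enough 1", "a payload string long enough 2"]);
        let b = StringViewArray::from(vec!["a payload string long enough 3", "a payload string long enough 4"]);

        // A "sorted run" produced by interleave() references buffers from *both*
        // inputs, which is the non-sequential layout described in the doc comment.
        let run = interleave(&[&a, &b], &[(0, 0), (1, 0), (0, 1), (1, 1)]).unwrap();
        let run_views = run.as_any().downcast_ref::<StringViewArray>().unwrap();
        println!("buffers before gc: {}", run_views.data_buffers().len()); // typically 2

        // gc() copies only the referenced bytes into a fresh, compact buffer, so a
        // spill writer no longer has to persist every shared buffer repeatedly.
        let compacted = run_views.gc();
        println!("buffers after gc: {}", compacted.data_buffers().len()); // 1
    }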
@@ -439,54 +507,29 @@ impl ExternalSorter {
         // `self.in_mem_batches` is already taken away by the sort_stream, now it is empty.
         // We'll gradually collect the sorted stream into self.in_mem_batches, or directly
         // write sorted batches to disk when the memory is insufficient.
-        let mut spill_writer: Option<IPCWriter> = None;
         while let Some(batch) = sorted_stream.next().await {
             let batch = batch?;
-            match &mut spill_writer {
-                None => {
-                    let sorted_size = get_reserved_byte_for_record_batch(&batch);
-                    if self.reservation.try_grow(sorted_size).is_err() {
-                        // Directly write in_mem_batches as well as all the remaining batches in
-                        // sorted_stream to disk. Further batches fetched from `sorted_stream` will
-                        // be handled by the `Some(writer)` matching arm.
-                        let spill_file =
-                            self.runtime.disk_manager.create_tmp_file("Sorting")?;
-                        let mut writer = IPCWriter::new(spill_file.path(), &self.schema)?;
-                        // Flush everything in memory to the spill file
-                        for batch in self.in_mem_batches.drain(..) {
-                            writer.write(&batch)?;
-                        }
-                        // as well as the newly sorted batch
-                        writer.write(&batch)?;
-                        spill_writer = Some(writer);
-                        self.reservation.free();
-                        self.spills.push(spill_file);
-                    } else {
-                        self.in_mem_batches.push(batch);
-                        self.in_mem_batches_sorted = true;
-                    }
-                }
-                Some(writer) => {
-                    writer.write(&batch)?;
-                }
+            let sorted_size = get_reserved_byte_for_record_batch(&batch);
+            if self.reservation.try_grow(sorted_size).is_err() {
+                // Although the reservation is not enough, the batch is
+                // already in memory, so it's okay to combine it with previously
+                // sorted batches, and spill them together.
+                self.in_mem_batches.push(batch);
+                self.spill().await?; // reservation is freed in spill()
+            } else {
+                self.in_mem_batches.push(batch);
+                self.in_mem_batches_sorted = true;
             }
         }
 
         // Drop early to free up memory reserved by the sorted stream, otherwise the
         // upcoming `self.reserve_memory_for_merge()` may fail due to insufficient memory.
         drop(sorted_stream);
 
-        if let Some(writer) = &mut spill_writer {
-            writer.finish()?;
-            self.metrics.spill_count.add(1);
-            self.metrics.spilled_rows.add(writer.num_rows);
-            self.metrics.spilled_bytes.add(writer.num_bytes);
-        }
-
         // Sorting may free up some memory especially when fetch is `Some`. If we have
         // not freed more than 50% of the memory, then we have to spill to free up more
         // memory for inserting more batches.
-        if spill_writer.is_none() && self.reservation.size() > before / 2 {
+        if self.reservation.size() > before / 2 {
             // We have not freed more than 50% of the memory, so we have to spill to
             // free up more memory
             self.spill().await?;
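The simplified control flow above leans on `MemoryReservation` back-pressure: a failed `try_grow` is the signal to spill, and `spill()` later frees the reservation. A minimal standalone sketch of those semantics (the pool size and consumer name are made up for illustration and are not taken from this patch):

    use std::sync::Arc;

    use datafusion_execution::memory_pool::{GreedyMemoryPool, MemoryConsumer, MemoryPool};

    fn main() {
        // A deliberately tiny pool so the second grow fails, mirroring the spill trigger.
        let pool: Arc<dyn MemoryPool> = Arc::new(GreedyMemoryPool::new(1024));
        let mut reservation = MemoryConsumer::new("example_sorter").register(&pool);

        assert!(reservation.try_grow(800).is_ok()); // fits: keep the sorted batch in memory
        assert!(reservation.try_grow(800).is_err()); // does not fit: take the spill path

        // spill() frees the reservation after the batches are written to disk,
        // making room for the batches still arriving from the sorted stream.
        reservation.free();
        assert_eq!(reservation.size(), 0);
    }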
@@ -1422,10 +1465,14 @@ mod tests {
         let spill_count = metrics.spill_count().unwrap();
         let spilled_rows = metrics.spilled_rows().unwrap();
         let spilled_bytes = metrics.spilled_bytes().unwrap();
-        // Processing 840 KB of data using 400 KB of memory requires at least 2 spills
-        // It will spill roughly 18000 rows and 800 KBytes.
-        // We leave a little wiggle room for the actual numbers.
-        assert!((2..=10).contains(&spill_count));
+
+        // This test case processes 840KB of data using 400KB of memory. Note
+        // that buffered batches can't be dropped until all sorted batches are
+        // generated, so we can only buffer up to `sort_spill_reservation_bytes`
+        // of sorted batches.
+        // The number of spills is roughly calculated as:
+        // `number_of_batches / (sort_spill_reservation_bytes / batch_size)`
+        assert!((12..=18).contains(&spill_count));
         assert!((15000..=20000).contains(&spilled_rows));
         assert!((700000..=900000).contains(&spilled_bytes));
 
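To make the estimate in the new comment concrete with purely hypothetical numbers (not this test's actual configuration): if the input arrives as 100 batches of roughly 6KB each and `sort_spill_reservation_bytes` is 40KB, then only about 40KB / 6KB ≈ 6 sorted batches can be buffered between spills, giving roughly 100 / 6 ≈ 16 spills. The widened assertion range (12..=18 instead of 2..=10) reflects that spilling per filled reservation produces more, smaller spills than the old single-writer path.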