@@ -33,6 +33,7 @@ use arrow::compute::{
3333use arrow:: datatypes:: SchemaRef ;
3434use arrow:: error:: Result as ArrowResult ;
3535use arrow:: record_batch:: RecordBatch ;
36+ use itertools:: Itertools ;
3637
3738use super :: { RecordBatchStream , SendableRecordBatchStream } ;
3839use crate :: error:: { DataFusionError , Result } ;
@@ -101,9 +102,77 @@ impl ExecutionPlan for MergeSortExec {
101102 }
102103
103104 fn output_hints ( & self ) -> OptimizerHints {
105+ // We do want to retain approximate sorting information. Note that the sorting algorithm's
106+ // index field in struct Key<'a> makes us see that each input stream's unused sort keys
107+ // result in sawtoothed runs.
108+
109+ // For example, if the input streams are sorted by columns A, B, C, D, E, and the sort key
110+ // is A, B, C, then we want the approximate_sort_order to be [[A, B, C], [D, E]], because
111+ // for a given value under ABC, the sort order will have multiple increasing (sawtoothing)
112+ // runs of columns DE the way the input streams get merged (due to the index field usage in
113+ // struct Key<'a>).
114+
115+ let mut hints: OptimizerHints = self . input . output_hints ( ) ;
116+ let sort_order: Vec < usize > = self . columns . iter ( ) . map ( |c| c. index ( ) ) . collect ( ) ;
117+
118+ ' fallback: {
119+ if !hints. approximate_sort_order_is_prefix || hints. approximate_sort_order . is_empty ( ) {
120+ break ' fallback;
121+ }
122+ let first_seg: & Vec < usize > = & hints. approximate_sort_order [ 0 ] ;
123+
124+ let mut sort_order_index: usize = 0 ;
125+ let mut approx_index: usize = 0 ;
126+ while sort_order_index < sort_order. len ( ) {
127+ if first_seg[ approx_index] == sort_order[ sort_order_index] {
128+ sort_order_index += 1 ;
129+ approx_index += 1 ;
130+ if approx_index == first_seg. len ( ) {
131+ break ;
132+ }
133+ } else if hints. single_value_columns . contains ( & first_seg[ approx_index] ) {
134+ approx_index += 1 ;
135+ if approx_index == first_seg. len ( ) {
136+ break ;
137+ }
138+ } else if hints. single_value_columns . contains ( & sort_order[ sort_order_index] ) {
139+ sort_order_index += 1 ;
140+ } else {
141+ // This should not happen.
142+ break ' fallback;
143+ }
144+ }
145+
146+ if approx_index > 0 {
147+ if approx_index != first_seg. len ( ) {
148+ let second_seg = first_seg[ approx_index..] . iter ( ) . map ( |& x| x) . collect_vec ( ) ;
149+ hints. approximate_sort_order . insert ( 1 , second_seg) ;
150+ hints. approximate_sort_order [ 0 ] . truncate ( approx_index) ;
151+ } else {
152+ // It would be weird if sort_order_index is not equal to sort_order.len() --
153+ // another instance of single value columns (we hope).
154+
155+ // Nothing to do here.
156+ }
157+ hints. approximate_sort_order_is_prefix = true ;
158+ } else {
159+ // approx_index == 0
160+
161+ // It's possible we sorted by some single value column, and this means subsequent
162+ // columns are sawtoothing in separate columns. Or is it? Either the input_hints's
163+ // sort_order is inconsistent with the approximate_sort_order, or we have some
164+ // particular treatment of single_value_columns in different code deciding whether
165+ // we can use a MergeExec node, that leads to this case.
166+ hints. approximate_sort_order_is_prefix = false ;
167+ }
168+
169+ return hints;
170+
171+ }
172+
104173 OptimizerHints :: new_sorted (
105- Some ( self . columns . iter ( ) . map ( |c| c . index ( ) ) . collect ( ) ) ,
106- self . input . output_hints ( ) . single_value_columns ,
174+ Some ( sort_order ) ,
175+ hints . single_value_columns ,
107176 )
108177 }
109178
@@ -616,7 +685,6 @@ impl ExecutionPlan for LastRowByUniqueKeyExec {
616685 }
617686
618687 fn output_hints ( & self ) -> OptimizerHints {
619- // Possibly, this is abandoning approximate sort order information.
620688 let input_hints = self . input . output_hints ( ) ;
621689 OptimizerHints :: new_sorted (
622690 input_hints. sort_order ,
0 commit comments