Sortedness using approximate sort order

srh · srh · commit 28892154913f · 2024-10-29T09:53:51.000-07:00
diff --git a/datafusion/src/physical_plan/coalesce_partitions.rs b/datafusion/src/physical_plan/coalesce_partitions.rs
@@ -29,6 +29,7 @@ use async_trait::async_trait;
 use arrow::record_batch::RecordBatch;
 use arrow::{datatypes::SchemaRef, error::Result as ArrowResult};
 
+use super::merge::MergeExec;
 use super::RecordBatchStream;
 use crate::error::{DataFusionError, Result};
 use crate::physical_plan::{
@@ -38,7 +39,6 @@ use crate::physical_plan::{
 use super::SendableRecordBatchStream;
 use crate::physical_plan::common::spawn_execution;
 use pin_project_lite::pin_project;
-use std::option::Option::None;
 
 /// Merge execution plan executes partitions in parallel and combines them into a single
 /// partition. No guarantees are made about the order of the resulting partition.
@@ -132,15 +132,7 @@ impl ExecutionPlan for CoalescePartitionsExec {
     }
 
     fn output_hints(&self) -> OptimizerHints {
-        let input_hints = self.input.output_hints();
-        let sort_order;
-        if self.input.output_partitioning().partition_count() <= 1 {
-            sort_order = input_hints.sort_order
-        } else {
-            sort_order = None
-        }
-        // TODO: This could do approximate sort order, no?
-        OptimizerHints::new_sorted(sort_order, input_hints.single_value_columns)
+        MergeExec::output_hints_from_input_hints(self.input.as_ref())
     }
 
     fn fmt_as(
diff --git a/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs
@@ -128,7 +128,13 @@ impl ExecutionPlan for FilterExec {
         single_value_columns.sort_unstable();
         single_value_columns.dedup();
 
-        OptimizerHints::new_sorted(inputs_hints.sort_order, single_value_columns)
+        OptimizerHints {
+            sort_order: inputs_hints.sort_order,
+            approximate_sort_order: inputs_hints.approximate_sort_order,
+            approximate_sort_order_is_strict: inputs_hints.approximate_sort_order_is_strict,
+            approximate_sort_order_is_prefix: inputs_hints.approximate_sort_order_is_prefix,
+            single_value_columns,
+        }
     }
 
     async fn execute(&self, partition: usize) -> Result<SendableRecordBatchStream> {
diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs
@@ -325,6 +325,9 @@ impl ExecutionPlan for HashAggregateExec {
             AggregateStrategy::Hash => None,
             AggregateStrategy::InplaceSorted => self.output_sort_order.clone(),
         };
+        // TODO: This could pass up self.approximate_sort_order (after saving it like
+        // self.output_sort_order).  (It is possible for self.output_sort_order to be None when
+        // there is an approximate sort order.)
         OptimizerHints::new_sorted(
             sort_order,
             Vec::new(),
diff --git a/datafusion/src/physical_plan/merge.rs b/datafusion/src/physical_plan/merge.rs
@@ -62,6 +62,28 @@ impl MergeExec {
     pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
         &self.input
     }
+
+    /// Derives a merging node's output hints from those of `input`.  With more than one input
+    /// partition the exact sort order and the strict/prefix flags are dropped (prefix is also
+    /// dropped whenever strictness is); everything else passes through unchanged.
+    pub fn output_hints_from_input_hints(input: &dyn ExecutionPlan) -> OptimizerHints {
+        let hints = input.output_hints();
+        let single_partition = input.output_partitioning().partition_count() <= 1;
+        let (sort_order, approximate_sort_order_is_strict) = if single_partition {
+            (hints.sort_order, hints.approximate_sort_order_is_strict)
+        } else {
+            (None, false)
+        };
+        let approximate_sort_order_is_prefix =
+            hints.approximate_sort_order_is_prefix && approximate_sort_order_is_strict;
+        OptimizerHints {
+            sort_order,
+            approximate_sort_order: hints.approximate_sort_order,
+            approximate_sort_order_is_prefix,
+            approximate_sort_order_is_strict,
+            single_value_columns: hints.single_value_columns,
+        }
+    }
 }
 
 #[async_trait]
@@ -157,17 +179,7 @@ impl ExecutionPlan for MergeExec {
     }
 
     fn output_hints(&self) -> OptimizerHints {
-        let input_hints = self.input.output_hints();
-        let sort_order;
-        if self.input.output_partitioning().partition_count() <= 1 {
-            sort_order = input_hints.sort_order
-        } else {
-            sort_order = None
-        }
-        OptimizerHints::new_sorted(
-            sort_order,
-            input_hints.single_value_columns,
-        )
+        MergeExec::output_hints_from_input_hints(self.input.as_ref())
     }
 }
 
diff --git a/datafusion/src/physical_plan/merge_sort.rs b/datafusion/src/physical_plan/merge_sort.rs
@@ -616,9 +616,11 @@ impl ExecutionPlan for LastRowByUniqueKeyExec {
     }
 
     fn output_hints(&self) -> OptimizerHints {
+        // Possibly, this is abandoning approximate sort order information.
+        let input_hints = self.input.output_hints();
         OptimizerHints::new_sorted(
-            self.input.output_hints().sort_order,
-            self.input.output_hints().single_value_columns,
+            input_hints.sort_order,
+            input_hints.single_value_columns,
         )
     }
 
diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs
@@ -157,9 +157,9 @@ pub struct OptimizerHints {
 }
 
 impl OptimizerHints {
-    /// Use with None for sort_order is arguably deprecated.  Used to adapt code that preceded
-    /// approximate_sort_order information.
-    fn new_sorted(sort_order: Option<Vec<usize>>, single_value_columns: Vec<usize>) -> OptimizerHints {
+    /// Mostly used to adapt code that preceded the creation of approximate_sort_order fields --
+    /// callers may be throwing away information about approximate sort order.
+    pub fn new_sorted(sort_order: Option<Vec<usize>>, single_value_columns: Vec<usize>) -> OptimizerHints {
         let mut approximate_sort_order = Vec::new();
         let mut approximate_sort_order_is_strict = false;
         let mut approximate_sort_order_is_prefix = false;
@@ -177,6 +177,7 @@ impl OptimizerHints {
         };
         hints
     }
+
 }
 
 /// `ExecutionPlan` represent nodes in the DataFusion Physical Plan.
diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs
@@ -517,12 +517,7 @@ impl DefaultPhysicalPlanner {
                     match input_sortedness.sawtooth_levels() {
                         Some(0) => {
                             log::error!("DefaultPhysicalExpr: Perfect match for inplace aggregation");
-                            let order = input_sortedness.sort_order[0]
-                                .iter()
-                                .map(|(_sort_key_offset, group_key_offset)| {
-                                    *group_key_offset
-                                })
-                                .collect_vec();
+                            let order = input_sortedness.sort_order[0].clone();  // TODO: No clone?
                             (AggregateStrategy::InplaceSorted, AggregateStrategy::InplaceSorted, Some(order))
                         }
                         Some(n) => {
@@ -1695,13 +1690,12 @@ pub fn evaluate_const(expr: Arc<dyn PhysicalExpr>) -> Result<Arc<dyn PhysicalExp
 /// Return value of input_sortedness_by_group_key.  If succeeded, every group key offset appears in
 /// sort_order or unsorted exactly once.
 pub struct SortednessByGroupKey {
-    /// Elems are (offset into the sort key, offset into the group key), with sort key offsets
-    /// strictly increasing.  Each Vec<(usize, usize)> is a clump of adjacent columns, with
+    /// Elems are offsets into the group key.  Each Vec<usize> is a clump of adjacent columns, with
     /// adjacency considered after ignoring single value columns.
     ///
-    /// Each column clump sees the input ordering in sawtoothing runs of rows, sawtoothing with different
-    /// granularity.
-    pub sort_order: Vec<Vec<(usize, usize)>>,
+    /// Each column clump sees the input ordering in sawtoothing runs of rows, sawtoothing with
+    /// different granularity.
+    pub sort_order: Vec<Vec<usize>>,
     /// Indexes into the group key.
     pub unsorted: Vec<usize>,
     /// true if the first clump of sort_order is detached from the prefix of the sort key (ignoring
@@ -1741,10 +1735,7 @@ impl SortednessByGroupKey {
     /// existing compute_aggregate_strategy function.
     pub fn compute_aggregate_strategy(&self) -> (AggregateStrategy, Option<Vec<usize>>) {
         if self.is_sorted_by_group_key() {
-            let order = self.sort_order[0]
-                .iter()
-                .map(|&(_sort_i, group_i)| group_i)
-                .collect_vec();
+            let order = self.sort_order[0].clone();
             (AggregateStrategy::InplaceSorted, Some(order))
         } else {
             (AggregateStrategy::Hash, None)
@@ -1804,14 +1795,14 @@ pub fn input_sortedness_by_group_key(
         };
     }
 
-    let mut clumps = Vec::<Vec<(usize, usize)>>::new();
+    let mut clumps = Vec::<Vec<usize>>::new();
     // At this point we walk through the sort_key_hit vec.
-    let mut clump = Vec::<(usize, usize)>::new();
+    let mut clump = Vec::<usize>::new();
     // Are our clumps detached from the sort prefix?
     let mut detached_from_prefix = false;
     for (i, &hit) in sort_key_hit.iter().enumerate() {
         if hit {
-            clump.push((i, sort_to_group[i]));
+            clump.push(sort_to_group[i]);
         } else if hints.single_value_columns.contains(&sort_key[i]) {
             // Don't end the clump.
         } else {
@@ -1835,6 +1826,101 @@ pub fn input_sortedness_by_group_key(
     }
 }
 
+/// Computes how well `input`'s approximate sort order covers `group_key`.
+///
+/// Only group keys that are plain `Column` expressions resolvable in `input`'s schema are
+/// supported; an empty group key or any other expression yields
+/// `SortednessByGroupKey::failed()`.
+///
+/// Each segment of the input's approximate sort order is walked column by column: group key
+/// columns are appended to the current clump, single value columns are skipped without
+/// ending the clump, and any other column ends the clump and abandons the rest of that
+/// segment.  Group key offsets that end up in no clump are returned in `unsorted`.
+pub fn input_sortedness_by_group_key_using_approximate(
+    input: &dyn ExecutionPlan,
+    group_key: &[(Arc<dyn PhysicalExpr>, String)],
+) -> SortednessByGroupKey {
+    if group_key.is_empty() {
+        // The caller has to deal with it (and in fact it wants to).
+        return SortednessByGroupKey::failed();
+    }
+
+    let hints = input.output_hints();
+    let input_schema = input.schema();
+    let mut input_to_group = vec![None; input_schema.fields().len()];
+
+    for (group_i, (g, _)) in group_key.iter().enumerate() {
+        let col = match g.as_any().downcast_ref::<Column>() {
+            Some(col) => col,
+            None => return SortednessByGroupKey::failed(),
+        };
+        let input_col = match input_schema.index_of(col.name()) {
+            Ok(input_col) => input_col,
+            Err(_) => return SortednessByGroupKey::failed(),
+        };
+        // If we have two group by exprs for the same input column, only the last one is
+        // recorded here, so the earlier ones end up reported as unsorted.
+        input_to_group[input_col] = Some(group_i);
+    }
+
+    let mut group_key_used = vec![false; group_key.len()];
+    // Decided by the first column that is not a skipped single value column: Some(true) if
+    // it is a group key column, Some(false) otherwise.
+    let mut prefix_maintained = None::<bool>;
+    let mut approximate_sort_order = Vec::new();
+    for in_segment in hints.approximate_sort_order {
+        let mut out_segment = Vec::new();
+        for in_col in in_segment {
+            if let Some(group_i) = input_to_group[in_col] {
+                if prefix_maintained.is_none() {
+                    prefix_maintained = Some(true);
+                }
+                out_segment.push(group_i);
+                group_key_used[group_i] = true;
+            } else if hints.single_value_columns.contains(&in_col) {
+                continue;
+            } else {
+                // A column that is neither a group key nor single valued ends the clump, and
+                // only this case stops the walk over the segment.
+                if !out_segment.is_empty() {
+                    approximate_sort_order.push(std::mem::take(&mut out_segment));
+                }
+                if prefix_maintained.is_none() {
+                    prefix_maintained = Some(false);
+                }
+
+                break;
+            }
+        }
+        if prefix_maintained.is_none() {
+            // The whole first segment was single value columns.
+            prefix_maintained = Some(false);
+        }
+        if !out_segment.is_empty() {
+            approximate_sort_order.push(out_segment);
+        }
+    }
+
+    // NOTE(review): hints.approximate_sort_order_is_strict is not carried into the result --
+    // SortednessByGroupKey is built without it here; confirm callers do not need it.
+    let approximate_sort_order_is_prefix =
+        hints.approximate_sort_order_is_prefix && prefix_maintained == Some(true);
+    let unsorted = group_key_used
+        .into_iter()
+        .enumerate()
+        .filter(|(_, used)| !*used)
+        .map(|(group_i, _)| group_i)
+        .collect::<Vec<usize>>();
+
+    SortednessByGroupKey {
+        sort_order: approximate_sort_order,
+        unsorted,
+        // The clumps are detached exactly when they no longer start at the sort prefix.
+        detached_from_prefix: !approximate_sort_order_is_prefix,
+        succeeded: true,
+    }
+}
+
 fn tuple_err<T, R>(value: (Result<T>, Result<R>)) -> Result<(T, R)> {
     match value {
         (Ok(e), Ok(e1)) => Ok((e, e1)),
diff --git a/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs
@@ -149,6 +149,7 @@ impl ExecutionPlan for ProjectionExec {
             } else {
                 continue;
             }
+            // If we project input to two output columns, we just end up picking one (and have incomplete analysis).
             input_to_output[column.index()] = Some(out_i);
         }
 
@@ -170,14 +171,54 @@ impl ExecutionPlan for ProjectionExec {
             }
         };
 
-        OptimizerHints::new_sorted(
-            if sort_order.is_empty() {
-                None
-            } else {
-                Some(sort_order)
-            },
+        // Becomes Some(true) if the first column of the first segment is mapped.
+        let mut prefix_maintained = None::<bool>;
+        let mut approximate_sort_order = Vec::new();
+        for in_segment in input_hints.approximate_sort_order {
+            let mut out_segment = Vec::new();
+            for in_col in in_segment {
+                if let Some(out_col) = input_to_output[in_col] {
+                    if prefix_maintained.is_none() {
+                        prefix_maintained = Some(true);
+                    }
+                    out_segment.push(out_col);
+                } else if input_hints.single_value_columns.contains(&in_col) {
+                    continue;
+                } else {
+                    // Some column is missing.  Note that handling this case right here --
+                    // projections missing columns, and splitting up the sort order into multiple
+                    // segments -- is the main purpose of approximate_sort_order.
+                    if !out_segment.is_empty() {
+                        approximate_sort_order.push(out_segment);
+                        out_segment = Vec::new();
+                    }
+                    if prefix_maintained.is_none() {
+                        prefix_maintained = Some(false);
+                    }
+
+                    break;
+                }
+            }
+            if prefix_maintained.is_none() {
+                // The whole first segment was single-value columns and it's gone now.
+                prefix_maintained = Some(false);
+            }
+
+            if !out_segment.is_empty() {
+                approximate_sort_order.push(out_segment);
+                out_segment = Vec::new();
+            }
+        }
+        let approximate_sort_order_is_strict = input_hints.approximate_sort_order_is_strict;
+        let approximate_sort_order_is_prefix = input_hints.approximate_sort_order_is_prefix && prefix_maintained == Some(true);
+
+        OptimizerHints {
+            sort_order: if sort_order.is_empty() { None } else { Some(sort_order) },
+            approximate_sort_order,
+            approximate_sort_order_is_prefix,
+            approximate_sort_order_is_strict,
             single_value_columns,
-        )
+        }
     }
 
     fn fmt_as(
diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs
@@ -190,11 +190,9 @@ impl ExecutionPlan for SortExec {
 
     fn output_hints(&self) -> OptimizerHints {
         let mut order = Vec::with_capacity(self.expr.len());
-        // let mut sort_order_truncated = false;
         for s in &self.expr {
             let column = s.expr.as_any().downcast_ref::<Column>();
             if column.is_none() {
-                // sort_order_truncated = true;
                 break;
             }
             let column = column.unwrap();
@@ -207,7 +205,6 @@ impl ExecutionPlan for SortExec {
         }
 
         let input_hints = self.input.output_hints();
-        // TODO: If sort_order_truncated is false, we can combine input_hints.sort_order.  Do this.
 
         OptimizerHints::new_sorted(
             Some(order),

Original file line number	Diff line number	Diff line change
`@@ -616,9 +616,11 @@ impl ExecutionPlan for LastRowByUniqueKeyExec {`
`616`	`616`	`}`
`617`	`617`
`618`	`618`	`fn output_hints(&self) -> OptimizerHints {`
	`619`	`+ // Possibly, this is abandoning approximate sort order information.`
	`620`	`+ let input_hints = self.input.output_hints();`
`619`	`621`	`OptimizerHints::new_sorted(`
`620`		`- self.input.output_hints().sort_order,`
`621`		`- self.input.output_hints().single_value_columns,`
	`622`	`+ input_hints.sort_order,`
	`623`	`+ input_hints.single_value_columns,`
`622`	`624`	`)`
`623`	`625`	`}`
`624`	`626`
Original file line number	Diff line number	Diff line change
`@@ -157,9 +157,9 @@ pub struct OptimizerHints {`
`157`	`157`	`}`
`158`	`158`
`159`	`159`	`impl OptimizerHints {`
`160`		`- /// Use with None for sort_order is arguably deprecated. Used to adapt code that preceded`
`161`		`- /// approximate_sort_order information.`
`162`		`- fn new_sorted(sort_order: Option<Vec<usize>>, single_value_columns: Vec<usize>) -> OptimizerHints {`
	`160`	`+ /// Mostly used to adapt code that preceded the creation of approximate_sort_order fields --`
	`161`	`+ /// callers may be throwing away information about approximate sort order.`
	`162`	`+ pub fn new_sorted(sort_order: Option<Vec<usize>>, single_value_columns: Vec<usize>) -> OptimizerHints {`
`163`	`163`	`let mut approximate_sort_order = Vec::new();`
`164`	`164`	`let mut approximate_sort_order_is_strict = false;`
`165`	`165`	`let mut approximate_sort_order_is_prefix = false;`
`@@ -177,6 +177,7 @@ impl OptimizerHints {`
`177`	`177`	`};`
`178`	`178`	`hints`
`179`	`179`	`}`
	`180`	`+`
`180`	`181`	`}`
`181`	`182`
`182`	`183`	/// `ExecutionPlan` represent nodes in the DataFusion Physical Plan.