@@ -16,38 +16,11 @@
 // under the License.
 
 use arrow::compute::{cast_with_options, CastOptions};
-use futures::{Stream, StreamExt};
-use std::{
-    any::Any,
-    pin::Pin,
-    sync::Arc,
-    task::{Context, Poll},
-};
+use std::sync::Arc;
 
-use arrow::array::{
-    downcast_dictionary_array, make_array, Array, ArrayRef, MutableArrayData, RecordBatch,
-    RecordBatchOptions,
-};
-use arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef};
+use arrow::array::{downcast_dictionary_array, make_array, Array, ArrayRef, MutableArrayData};
+use arrow::datatypes::DataType;
 use arrow::error::ArrowError;
-use datafusion::common::{arrow_datafusion_err, DataFusionError, Result as DataFusionResult};
-use datafusion::physical_plan::execution_plan::{Boundedness, CardinalityEffect, EmissionType};
-use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
-use datafusion::{execution::TaskContext, physical_expr::*, physical_plan::*};
-
-/// A utility execution node that makes deep copies of input batches.
-///
-/// In certain scenarios, such as sort, DataFusion execution nodes only make a shallow copy of
-/// input batches. This can cause issues for Comet, since we re-use column vectors across
-/// different batches. For those scenarios, this node can be used as an adapter.
-#[derive(Debug)]
-pub struct CopyExec {
-    input: Arc<dyn ExecutionPlan>,
-    schema: SchemaRef,
-    cache: PlanProperties,
-    metrics: ExecutionPlanMetricsSet,
-    mode: CopyMode,
-}
 
 #[derive(Debug, PartialEq, Clone)]
 pub enum CopyMode {
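The doc comment on the removed `CopyExec` above states the invariant this operator protected: Comet reuses column vectors across batches, so any operator that buffers its input (sort, for example) must be fed deep copies. Below is a minimal sketch of such a deep copy using arrow-rs `MutableArrayData`, which appears to be the same mechanism as the retained `copy_array` helper at the end of this diff (its `MutableArrayData` import survives the change); the `deep_copy` name and the `Int32Array` input are illustrative, not part of Comet's API:

    use arrow::array::{make_array, Array, ArrayRef, Int32Array, MutableArrayData};

    // Copy the array's buffers into freshly allocated memory so the result
    // no longer aliases the reused input vectors.
    fn deep_copy(array: &dyn Array) -> ArrayRef {
        let data = array.to_data();
        let mut mutable = MutableArrayData::new(vec![&data], false, array.len());
        mutable.extend(0, 0, array.len()); // copy rows [0, len) from source 0
        make_array(mutable.freeze())
    }

    fn main() {
        let src = Int32Array::from(vec![1, 2, 3]);
        let copied: ArrayRef = deep_copy(&src);
        assert_eq!(copied.len(), src.len()); // same values, independent buffers
    }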
@@ -57,193 +30,6 @@ pub enum CopyMode {
     UnpackOrClone,
 }
 
-impl CopyExec {
-    pub fn new(input: Arc<dyn ExecutionPlan>, mode: CopyMode) -> Self {
-        // change schema to remove dictionary types because CopyExec always unpacks
-        // dictionaries
-
-        let fields: Vec<Field> = input
-            .schema()
-            .fields
-            .iter()
-            .map(|f: &FieldRef| match f.data_type() {
-                DataType::Dictionary(_, value_type) => {
-                    Field::new(f.name(), value_type.as_ref().clone(), f.is_nullable())
-                }
-                _ => f.as_ref().clone(),
-            })
-            .collect();
-
-        let schema = Arc::new(Schema::new(fields));
-
-        let cache = PlanProperties::new(
-            EquivalenceProperties::new(Arc::clone(&schema)),
-            Partitioning::UnknownPartitioning(1),
-            EmissionType::Final,
-            Boundedness::Bounded,
-        );
-
-        Self {
-            input,
-            schema,
-            cache,
-            metrics: ExecutionPlanMetricsSet::default(),
-            mode,
-        }
-    }
-
-    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
-        &self.input
-    }
-
-    pub fn mode(&self) -> &CopyMode {
-        &self.mode
-    }
-}
-
-impl DisplayAs for CopyExec {
-    fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match t {
-            DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                write!(f, "CopyExec [{:?}]", self.mode)
-            }
-            DisplayFormatType::TreeRender => unimplemented!(),
-        }
-    }
-}
-
-impl ExecutionPlan for CopyExec {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
-    }
-
-    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        vec![&self.input]
-    }
-
-    fn with_new_children(
-        self: Arc<Self>,
-        children: Vec<Arc<dyn ExecutionPlan>>,
-    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
-        let input = Arc::clone(&self.input);
-        let new_input = input.with_new_children(children)?;
-        Ok(Arc::new(CopyExec {
-            input: new_input,
-            schema: Arc::clone(&self.schema),
-            cache: self.cache.clone(),
-            metrics: self.metrics.clone(),
-            mode: self.mode.clone(),
-        }))
-    }
-
-    fn execute(
-        &self,
-        partition: usize,
-        context: Arc<TaskContext>,
-    ) -> DataFusionResult<SendableRecordBatchStream> {
-        let child_stream = self.input.execute(partition, context)?;
-        Ok(Box::pin(CopyStream::new(
-            self,
-            self.schema(),
-            child_stream,
-            partition,
-            self.mode.clone(),
-        )))
-    }
-
-    fn partition_statistics(&self, partition: Option<usize>) -> DataFusionResult<Statistics> {
-        self.input.partition_statistics(partition)
-    }
-
-    fn properties(&self) -> &PlanProperties {
-        &self.cache
-    }
-
-    fn name(&self) -> &str {
-        "CopyExec"
-    }
-
-    fn metrics(&self) -> Option<MetricsSet> {
-        Some(self.metrics.clone_inner())
-    }
-
-    fn maintains_input_order(&self) -> Vec<bool> {
-        vec![true; self.children().len()]
-    }
-
-    fn supports_limit_pushdown(&self) -> bool {
-        true
-    }
-
-    fn cardinality_effect(&self) -> CardinalityEffect {
-        CardinalityEffect::Equal
-    }
-}
-
-struct CopyStream {
-    schema: SchemaRef,
-    child_stream: SendableRecordBatchStream,
-    baseline_metrics: BaselineMetrics,
-    mode: CopyMode,
-}
-
-impl CopyStream {
-    fn new(
-        exec: &CopyExec,
-        schema: SchemaRef,
-        child_stream: SendableRecordBatchStream,
-        partition: usize,
-        mode: CopyMode,
-    ) -> Self {
-        Self {
-            schema,
-            child_stream,
-            baseline_metrics: BaselineMetrics::new(&exec.metrics, partition),
-            mode,
-        }
-    }
-
-    // TODO: replace copy_or_unpack_array with copy_array if the upstream sort kernel fixes
-    // the dictionary array sorting issue.
-    fn copy(&self, batch: RecordBatch) -> DataFusionResult<RecordBatch> {
-        let mut timer = self.baseline_metrics.elapsed_compute().timer();
-        let vectors = batch
-            .columns()
-            .iter()
-            .map(|v| copy_or_unpack_array(v, &self.mode))
-            .collect::<Result<Vec<ArrayRef>, _>>()?;
-
-        let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
-        let maybe_batch =
-            RecordBatch::try_new_with_options(Arc::clone(&self.schema), vectors, &options)
-                .map_err(|e| arrow_datafusion_err!(e));
-        timer.stop();
-        self.baseline_metrics.record_output(batch.num_rows());
-        maybe_batch
-    }
-}
-
-impl Stream for CopyStream {
-    type Item = DataFusionResult<RecordBatch>;
-
-    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        self.child_stream.poll_next_unpin(cx).map(|x| match x {
-            Some(Ok(batch)) => Some(self.copy(batch)),
-            other => other,
-        })
-    }
-}
-
-impl RecordBatchStream for CopyStream {
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
-    }
-}
-
 /// Copy an Arrow Array
 pub(crate) fn copy_array(array: &dyn Array) -> ArrayRef {
     let capacity = array.len();
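The `cast_with_options` import kept by this change points at how dictionary unpacking worked: `CopyExec::new` rewrote every `Dictionary(_, value_type)` field to its plain value type, and each dictionary column was cast to match (the helper that does this, `copy_or_unpack_array`, is referenced above but sits below this excerpt). A minimal sketch of that unpacking step, assuming current arrow-rs APIs; `unpack_if_dictionary` is an illustrative name, and it omits the `CopyMode`-dependent deep copy the real helper also performs:

    use std::sync::Arc;

    use arrow::array::{Array, ArrayRef, DictionaryArray};
    use arrow::compute::{cast_with_options, CastOptions};
    use arrow::datatypes::{DataType, Int32Type};
    use arrow::error::ArrowError;

    // Cast a dictionary-encoded array to its value type; pass everything
    // else through unchanged (cloning the Arc, not the underlying data).
    fn unpack_if_dictionary(array: &ArrayRef) -> Result<ArrayRef, ArrowError> {
        match array.data_type() {
            DataType::Dictionary(_, value_type) => {
                cast_with_options(array, value_type, &CastOptions::default())
            }
            _ => Ok(Arc::clone(array)),
        }
    }

    fn main() -> Result<(), ArrowError> {
        // Dictionary<Int32, Utf8> holding ["a", "b", "a"]
        let dict: DictionaryArray<Int32Type> = vec!["a", "b", "a"].into_iter().collect();
        let array: ArrayRef = Arc::new(dict);
        let unpacked = unpack_if_dictionary(&array)?;
        assert_eq!(unpacked.data_type(), &DataType::Utf8); // now a plain StringArray
        Ok(())
    }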