@@ -6,9 +6,8 @@ use arrow_array::RecordBatch;
66use arrow_schema:: SchemaRef ;
77pub use executor:: * ;
88use futures:: executor:: LocalPool ;
9- use futures:: future:: ok;
109use futures:: task:: LocalSpawnExt ;
11- use futures:: { FutureExt , Stream , StreamExt , stream} ;
10+ use futures:: { Stream , StreamExt , stream} ;
1211use itertools:: Itertools ;
1312pub use selection:: * ;
1413pub use split_by:: * ;
@@ -18,18 +17,21 @@ use vortex_array::stream::{ArrayStream, ArrayStreamAdapter};
1817use vortex_array:: { ArrayRef , ToCanonical } ;
1918use vortex_buffer:: Buffer ;
2019use vortex_dtype:: { DType , Field , FieldMask , FieldName , FieldPath } ;
21- use vortex_error:: { VortexError , VortexExpect , VortexResult , vortex_err} ;
20+ use vortex_error:: { VortexExpect , VortexResult , vortex_err} ;
2221use vortex_expr:: transform:: immediate_access:: immediate_scope_access;
2322use vortex_expr:: transform:: simplify_typed:: simplify_typed;
2423use vortex_expr:: { ExprRef , ScopeDType , root} ;
2524use vortex_metrics:: VortexMetrics ;
2625
2726use crate :: LayoutReader ;
2827use crate :: layouts:: filter:: FilterLayoutReader ;
28+ use crate :: scan:: tasks:: { TaskContext , split_exec} ;
29+
2930mod executor;
3031pub mod row_mask;
3132mod selection;
3233mod split_by;
34+ mod tasks;
3335
3436/// A struct for building a scan operation.
3537pub struct ScanBuilder < A > {
@@ -53,7 +55,7 @@ pub struct ScanBuilder<A> {
5355 file_stats : Option < Arc < [ StatsSet ] > > ,
5456}
5557
56- impl < A : ' static + Send > ScanBuilder < A > {
58+ impl < A : ' static + Send + Sync > ScanBuilder < A > {
5759 pub fn with_filter ( mut self , filter : ExprRef ) -> Self {
5860 self . filter = Some ( filter) ;
5961 self
@@ -153,10 +155,12 @@ impl<A: 'static + Send> ScanBuilder<A> {
153155 pub fn build ( self ) -> VortexResult < Vec < impl Future < Output = VortexResult < Option < A > > > > > {
154156 // Spin up the root layout reader, and wrap it in a FilterLayoutReader to perform
155157 // conjunction splitting if a filter is provided.
156- let mut layout_reader = self . layout_reader ;
157- if self . filter . is_some ( ) {
158- layout_reader = Arc :: new ( FilterLayoutReader :: new ( layout_reader) ) ;
159- }
158+ let layout_reader = if self . filter . is_some ( ) {
159+ Arc :: new ( FilterLayoutReader :: new ( self . layout_reader ) )
160+ } else {
161+ self . layout_reader
162+ } ;
163+
160164 let ctx = ScopeDType :: new ( layout_reader. dtype ( ) . clone ( ) ) ;
161165
162166 // Normalize and simplify the expressions.
@@ -177,98 +181,25 @@ impl<A: 'static + Send> ScanBuilder<A> {
177181 . collect ( ) ;
178182 let splits = self . split_by . splits ( layout_reader. as_ref ( ) , & field_mask) ?;
179183
180- let row_masks = splits
184+ // Create a task that executes the full scan pipeline for each split.
185+ let split_tasks = splits
181186 . into_iter ( )
182- . filter_map ( |row_range| {
183- if let Some ( scan_range) = & self . row_range {
184- // If the row range lies entirely outside the scan range, skip this split.
185- if row_range. start >= scan_range. end || row_range. end < scan_range. start {
186- return None ;
187- }
188- // Otherwise, take the intersection of the range.
189- return Some (
190- row_range. start . max ( scan_range. start ) ..row_range. end . min ( scan_range. end ) ,
191- ) ;
192- } else {
193- Some ( row_range)
194- }
187+ . map ( move |split_range| {
188+ let ctx = Arc :: new ( TaskContext {
189+ row_range : self . row_range . clone ( ) ,
190+ selection : self . selection . clone ( ) ,
191+ filter : self . filter . clone ( ) ,
192+ reader : layout_reader. clone ( ) ,
193+ projection : projection. clone ( ) ,
194+ mapper : self . map_fn . clone ( ) ,
195+ task_executor : None ,
196+ } ) ;
197+
198+ split_exec ( ctx, split_range)
195199 } )
196- . map ( |row_range| self . selection . row_mask ( & row_range) )
197- . filter ( |mask| !mask. mask ( ) . all_false ( ) )
198- . map ( |row_mask| {
199- let row_range = row_mask. row_range ( ) ;
200- ( row_range, ok ( row_mask. mask ( ) . clone ( ) ) . boxed ( ) )
201- } )
202- . collect_vec ( ) ;
203-
204- // NOTE(ngates): since segment prefetching occurs in insertion order, we construct
205- // all pruning tasks, then all filter tasks, then all projection tasks. When a task
206- // explicitly polls a segment, it jumps to the front of the queue so this shouldn't
207- // impact the time-to-first-chunk latency.
208-
209- // If a filter expression is provided, then we set up pruning and filter evaluations.
210- let row_masks = if let Some ( filter) = & filter {
211- // Map the row masks through the pruning evaluation
212- let row_masks: Vec < _ > = row_masks
213- . into_iter ( )
214- . map ( |( row_range, mask_fut) | {
215- let eval = layout_reader. pruning_evaluation ( & row_range, filter) ?;
216- let mask_fut = async move {
217- let mask = mask_fut. await ?;
218- if mask. all_false ( ) {
219- Ok ( mask)
220- } else {
221- eval. invoke ( mask) . await
222- }
223- }
224- . boxed ( ) ;
225- Ok :: < _ , VortexError > ( ( row_range, mask_fut) )
226- } )
227- . try_collect ( ) ?;
228-
229- // Map the row masks through the filter evaluation
230- row_masks
231- . into_iter ( )
232- . map ( |( row_range, mask_fut) | {
233- let eval = layout_reader. filter_evaluation ( & row_range, filter) ?;
234- let mask_fut = async move {
235- let mask = mask_fut. await ?;
236- if mask. all_false ( ) {
237- Ok ( mask)
238- } else {
239- eval. invoke ( mask) . await
240- }
241- }
242- . boxed ( ) ;
243- Ok :: < _ , VortexError > ( ( row_range, mask_fut) )
244- } )
245- . try_collect ( ) ?
246- } else {
247- row_masks
248- } ;
200+ . try_collect ( ) ?;
249201
250- // Finally, map the row masks through the projection evaluation and spawn.
251- row_masks
252- . into_iter ( )
253- . map ( |( row_range, mask_fut) | {
254- let map_fn = self . map_fn . clone ( ) ;
255- let eval = layout_reader. projection_evaluation ( & row_range, & projection) ?;
256- let array_fut = async move {
257- let mask = mask_fut. await ?;
258- if mask. all_false ( ) {
259- Ok ( None )
260- } else {
261- map_fn ( eval. invoke ( mask) . await ?) . map ( Some )
262- }
263- }
264- . boxed ( ) ;
265-
266- Ok ( match & self . executor {
267- None => array_fut,
268- Some ( executor) => executor. spawn ( array_fut) ,
269- } )
270- } )
271- . try_collect ( )
202+ Ok ( split_tasks)
272203 }
273204
274205 /// Returns a stream over the scan objects.
0 commit comments