chore: cleanup scan task execution to be clearer (#3489)

a10y · web-flow · commit bdb8cf8b14f6 · 2025-06-09T10:10:23.000-07:00
diff --git a/vortex-expr/src/transform/immediate_access.rs b/vortex-expr/src/transform/immediate_access.rs
@@ -65,7 +65,7 @@ impl<'a> NodeVisitor<'a> for ImmediateScopeAccessesAnalysis<'a> {
     fn visit_down(&mut self, node: &'a Self::NodeTy) -> VortexResult<TraversalOrder> {
         assert!(
             !node.as_any().is::<Select>(),
-            "cannot analyse select, simply the expression"
+            "cannot analyze select, simplify the expression"
         );
         if let Some(get_item) = node.as_any().downcast_ref::<GetItem>() {
             if is_root(get_item.child()) {
diff --git a/vortex-layout/src/scan/mod.rs b/vortex-layout/src/scan/mod.rs
@@ -6,9 +6,8 @@ use arrow_array::RecordBatch;
 use arrow_schema::SchemaRef;
 pub use executor::*;
 use futures::executor::LocalPool;
-use futures::future::ok;
 use futures::task::LocalSpawnExt;
-use futures::{FutureExt, Stream, StreamExt, stream};
+use futures::{Stream, StreamExt, stream};
 use itertools::Itertools;
 pub use selection::*;
 pub use split_by::*;
@@ -18,18 +17,21 @@ use vortex_array::stream::{ArrayStream, ArrayStreamAdapter};
 use vortex_array::{ArrayRef, ToCanonical};
 use vortex_buffer::Buffer;
 use vortex_dtype::{DType, Field, FieldMask, FieldName, FieldPath};
-use vortex_error::{VortexError, VortexExpect, VortexResult, vortex_err};
+use vortex_error::{VortexExpect, VortexResult, vortex_err};
 use vortex_expr::transform::immediate_access::immediate_scope_access;
 use vortex_expr::transform::simplify_typed::simplify_typed;
 use vortex_expr::{ExprRef, ScopeDType, root};
 use vortex_metrics::VortexMetrics;
 
 use crate::LayoutReader;
 use crate::layouts::filter::FilterLayoutReader;
+use crate::scan::tasks::{TaskContext, split_exec};
+
 mod executor;
 pub mod row_mask;
 mod selection;
 mod split_by;
+mod tasks;
 
 /// A struct for building a scan operation.
 pub struct ScanBuilder<A> {
@@ -53,7 +55,7 @@ pub struct ScanBuilder<A> {
     file_stats: Option<Arc<[StatsSet]>>,
 }
 
-impl<A: 'static + Send> ScanBuilder<A> {
+impl<A: 'static + Send + Sync> ScanBuilder<A> {
     pub fn with_filter(mut self, filter: ExprRef) -> Self {
         self.filter = Some(filter);
         self
@@ -153,10 +155,12 @@ impl<A: 'static + Send> ScanBuilder<A> {
     pub fn build(self) -> VortexResult<Vec<impl Future<Output = VortexResult<Option<A>>>>> {
         // Spin up the root layout reader, and wrap it in a FilterLayoutReader to perform
         // conjunction splitting if a filter is provided.
-        let mut layout_reader = self.layout_reader;
-        if self.filter.is_some() {
-            layout_reader = Arc::new(FilterLayoutReader::new(layout_reader));
-        }
+        let layout_reader = if self.filter.is_some() {
+            Arc::new(FilterLayoutReader::new(self.layout_reader))
+        } else {
+            self.layout_reader
+        };
+
         let ctx = ScopeDType::new(layout_reader.dtype().clone());
 
         // Normalize and simplify the expressions.
@@ -177,98 +181,25 @@ impl<A: 'static + Send> ScanBuilder<A> {
             .collect();
         let splits = self.split_by.splits(layout_reader.as_ref(), &field_mask)?;
 
-        let row_masks = splits
+        // Create a task that executes the full scan pipeline for each split.
+        let split_tasks = splits
             .into_iter()
-            .filter_map(|row_range| {
-                if let Some(scan_range) = &self.row_range {
-                    // If the row range is fully within the scan range, return it.
-                    if row_range.start >= scan_range.end || row_range.end < scan_range.start {
-                        return None;
-                    }
-                    // Otherwise, take the intersection of the range.
-                    return Some(
-                        row_range.start.max(scan_range.start)..row_range.end.min(scan_range.end),
-                    );
-                } else {
-                    Some(row_range)
-                }
+            .map(move |split_range| {
+                // Use the simplified filter and the configured executor, as the scan did before.
+                let ctx = TaskContext {
+                    row_range: self.row_range.clone(),
+                    selection: self.selection.clone(),
+                    filter: filter.clone(),
+                    reader: layout_reader.clone(),
+                    projection: projection.clone(),
+                    mapper: self.map_fn.clone(),
+                    task_executor: self.executor.clone(),
+                };
+                split_exec(Arc::new(ctx), split_range)
+            })
-            .map(|row_range| self.selection.row_mask(&row_range))
-            .filter(|mask| !mask.mask().all_false())
-            .map(|row_mask| {
-                let row_range = row_mask.row_range();
-                (row_range, ok(row_mask.mask().clone()).boxed())
-            })
-            .collect_vec();
-
-        // NOTE(ngates): since segment prefetching occurs in insertion order, we construct
-        //  all pruning tasks, then all filter tasks, then all projection tasks. When a task
-        //  explicitly polls a segment, it jumps to the front of the queue so this shouldn't
-        //  impact the time-to-first-chunk latency.
-
-        // If a filter expression is provided, then we set up pruning and filter evaluations.
-        let row_masks = if let Some(filter) = &filter {
-            // Map the row masks through the pruning evaluation
-            let row_masks: Vec<_> = row_masks
-                .into_iter()
-                .map(|(row_range, mask_fut)| {
-                    let eval = layout_reader.pruning_evaluation(&row_range, filter)?;
-                    let mask_fut = async move {
-                        let mask = mask_fut.await?;
-                        if mask.all_false() {
-                            Ok(mask)
-                        } else {
-                            eval.invoke(mask).await
-                        }
-                    }
-                    .boxed();
-                    Ok::<_, VortexError>((row_range, mask_fut))
-                })
-                .try_collect()?;
-
-            // Map the row masks through the filter evaluation
-            row_masks
-                .into_iter()
-                .map(|(row_range, mask_fut)| {
-                    let eval = layout_reader.filter_evaluation(&row_range, filter)?;
-                    let mask_fut = async move {
-                        let mask = mask_fut.await?;
-                        if mask.all_false() {
-                            Ok(mask)
-                        } else {
-                            eval.invoke(mask).await
-                        }
-                    }
-                    .boxed();
-                    Ok::<_, VortexError>((row_range, mask_fut))
-                })
-                .try_collect()?
-        } else {
-            row_masks
-        };
+            .try_collect()?;
 
-        // Finally, map the row masks through the projection evaluation and spawn.
-        row_masks
-            .into_iter()
-            .map(|(row_range, mask_fut)| {
-                let map_fn = self.map_fn.clone();
-                let eval = layout_reader.projection_evaluation(&row_range, &projection)?;
-                let array_fut = async move {
-                    let mask = mask_fut.await?;
-                    if mask.all_false() {
-                        Ok(None)
-                    } else {
-                        map_fn(eval.invoke(mask).await?).map(Some)
-                    }
-                }
-                .boxed();
-
-                Ok(match &self.executor {
-                    None => array_fut,
-                    Some(executor) => executor.spawn(array_fut),
-                })
-            })
-            .try_collect()
+        Ok(split_tasks)
     }
 
     /// Returns a stream over the scan objects.
diff --git a/vortex-layout/src/scan/selection.rs b/vortex-layout/src/scan/selection.rs
@@ -8,7 +8,7 @@ use crate::scan::row_mask::RowMask;
 
 /// A selection identifies a set of rows to include in the scan (in addition to applying any
 /// filter predicates).
-#[derive(Default)]
+#[derive(Default, Clone)]
 pub enum Selection {
     /// No selection, all rows are included.
     #[default]
diff --git a/vortex-layout/src/scan/tasks.rs b/vortex-layout/src/scan/tasks.rs
@@ -0,0 +1,119 @@
+//! Split scanning task implementation.
+
+use std::ops::Range;
+use std::sync::Arc;
+
+use futures::FutureExt;
+use futures::future::{BoxFuture, ok};
+use vortex_array::ArrayRef;
+use vortex_error::VortexResult;
+use vortex_expr::ExprRef;
+
+use crate::LayoutReader;
+use crate::scan::{Selection, TaskExecutor, TaskExecutorExt};
+
+pub type TaskFuture<A> = BoxFuture<'static, VortexResult<A>>;
+
+/// Logic for executing a single split reading task.
+///
+/// # Task execution flow
+///
+/// First, the task's row range (split) is intersected with the global file row-range requested,
+/// if any. A split with no overlap, or with no selected rows, resolves immediately to `None`.
+///
+/// The intersected row range is then further reduced via expression-based pruning. After pruning
+/// has eliminated more blocks, the full filter is executed over the remainder of the split.
+///
+/// The final mask drives a filtered projection over the split data whose output is mapped into
+/// some result type `A`; a split in which no row survives the filter resolves to `None`.
+pub(super) fn split_exec<A: 'static + Send + Sync>(
+    ctx: Arc<TaskContext<A>>,
+    split: Range<u64>,
+) -> VortexResult<TaskFuture<Option<A>>> {
+    // Step 1: using the caller-provided row range and selection, attempt to disregard this split.
+    let read_range = match &ctx.row_range {
+        None => split,
+        Some(row_range) => {
+            // Ranges are half-open, so a range that only touches the split does not overlap it.
+            if row_range.start >= split.end || row_range.end <= split.start {
+                return Ok(ok(None).boxed());
+            }
+            row_range.start.max(split.start)..row_range.end.min(split.end)
+        }
+    };
+
+    // Apply the selection to calculate a read mask
+    let read_mask = ctx.selection.row_mask(&read_range);
+    let row_range = read_mask.row_range();
+    let row_mask = read_mask.mask().clone();
+    if row_mask.all_false() {
+        return Ok(ok(None).boxed());
+    }
+
+    let filter = match ctx.filter.as_ref() {
+        // No filter == immediate task
+        None => ok(row_mask).boxed(),
+        Some(filter) => {
+            // Step 2: if there is a filter provided, attempt to prune this range based on the filter.
+            // NOTE: the pruning and filter evaluations must be built OUTSIDE of the future, since
+            // registering these row ranges eagerly hints the IO system to start prefetching.
+            let prune = ctx.reader.pruning_evaluation(&row_range, filter)?;
+            let eval = ctx.reader.filter_evaluation(&row_range, filter)?;
+            async move {
+                let pruned_mask = prune.invoke(row_mask).await?;
+                // Step 3: apply exact filtering, unless pruning already eliminated every row.
+                if pruned_mask.all_false() {
+                    Ok(pruned_mask)
+                } else {
+                    eval.invoke(pruned_mask).await
+                }
+            }
+            .boxed()
+        }
+    };
+
+    // Step 4: execute the projection, only at the mask for rows which match the filter
+    let exec = ctx
+        .reader
+        .projection_evaluation(&row_range, &ctx.projection)?;
+    let mapper = ctx.mapper.clone();
+    let array_fut = async move {
+        let filtered_mask = filter.await?;
+        if filtered_mask.all_false() {
+            // No row matched: skip the projection and mapping, this split yields nothing.
+            Ok(None)
+        } else {
+            mapper(exec.invoke(filtered_mask).await?).map(Some)
+        }
+    };
+
+    match &ctx.task_executor {
+        None => Ok(array_fut.boxed()),
+        // If caller provided an executor for the CPU work, spawn onto that and await the result
+        Some(executor) => Ok(executor.clone().spawn(array_fut.boxed())),
+    }
+}
+
+/// Information needed to execute a single split task.
+pub(super) struct TaskContext<A> {
+    /// Optional caller-provided row range; each split is intersected with it before reading.
+    pub(super) row_range: Option<Range<u64>>,
+
+    /// A row selection, applied to the read range to compute the initial row mask.
+    pub(super) selection: Selection,
+
+    /// Optional filter expression, used for both pruning and exact filter evaluation.
+    pub(super) filter: Option<ExprRef>,
+
+    /// The layout reader that builds the pruning, filter and projection evaluations.
+    pub(super) reader: Arc<dyn LayoutReader>,
+
+    /// The projection expression to apply to gather the scanned rows.
+    pub(super) projection: ExprRef,
+
+    /// Function that maps the projected array into an `A`.
+    pub(super) mapper: Arc<dyn Fn(ArrayRef) -> VortexResult<A> + Send + Sync>,
+
+    /// Optional executor to spawn the task onto; with `None` the task future is returned as-is.
+    pub(super) task_executor: Option<Arc<dyn TaskExecutor>>,
+}