vortex-data
diff --git a/‎vortex-array/src/pipeline/bit_view.rs‎
Lines changed: 2 additions & 0 deletions b/‎vortex-array/src/pipeline/bit_view.rs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎vortex-array/src/pipeline/driver/allocation.rs‎
Lines changed: 1 addition & 1 deletion b/‎vortex-array/src/pipeline/driver/allocation.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎vortex-array/src/pipeline/driver/bind.rs‎
Lines changed: 31 additions & 17 deletions b/‎vortex-array/src/pipeline/driver/bind.rs‎
Lines changed: 31 additions & 17 deletions
diff --git a/‎vortex-array/src/pipeline/driver/input.rs‎
Lines changed: 116 additions & 0 deletions b/‎vortex-array/src/pipeline/driver/input.rs‎
Lines changed: 116 additions & 0 deletions
diff --git a/‎vortex-array/src/pipeline/driver/mod.rs‎
Lines changed: 51 additions & 38 deletions b/‎vortex-array/src/pipeline/driver/mod.rs‎
Lines changed: 51 additions & 38 deletions
@@ -216,6 +216,8 @@ impl<'a> BitView<'a> {
     ///
     /// The function `f` receives a [`BitSlice`] containing the inclusive `start` bit as well as
     /// the length.
+    ///
+    /// FIXME(ngates): this is still broken.
     pub fn iter_slices<F>(&self, mut f: F)
     where
         F: FnMut(BitSlice),
 
@@ -86,7 +86,7 @@ pub(super) fn allocate_vectors(
             .collect(),
         vectors: allocation_types
             .into_iter()
-            .map(|dtype| PipelineVector::Compact(VectorMut::with_capacity(dtype, 2 * N)))
+            .map(|dtype| PipelineVector::Compact(VectorMut::with_capacity(dtype, N)))
             .collect(),
     })
 }
@@ -2,17 +2,18 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use vortex_error::{VortexExpect, VortexResult};
-use vortex_vector::Vector;
+use vortex_vector::{Vector, VectorOps};
 
 use crate::array::ArrayOperator;
-use crate::pipeline::driver::Node;
 use crate::pipeline::driver::allocation::VectorAllocation;
+use crate::pipeline::driver::input::InputKernel;
+use crate::pipeline::driver::{Node, NodeKind};
 use crate::pipeline::{BindContext, Kernel, VectorId};
 
 pub(crate) fn bind_kernels(
     dag: &[Node],
     allocation_plan: &VectorAllocation,
-    all_batch_inputs: &[Vector],
+    mut all_batch_inputs: Vec<Option<Vector>>,
 ) -> VortexResult<Vec<Box<dyn Kernel>>> {
     let mut kernels = Vec::with_capacity(dag.len());
     for node in dag {
@@ -26,38 +27,51 @@ pub(crate) fn bind_kernels(
             })
             .collect::<Vec<_>>();
 
-        let batch_inputs: Vec<_> = node
+        let mut batch_inputs: Vec<_> = node
             .batch_inputs
             .iter()
-            .map(|idx| all_batch_inputs[*idx].clone())
+            .map(|idx| all_batch_inputs[*idx].take())
             .collect();
 
-        let bind_context = PipelineBindContext {
-            children: &input_ids,
-            batch_inputs: &batch_inputs,
-        };
+        kernels.push(match node.array.as_pipelined() {
+            None => {
+                // If the node cannot be pipelined, it must be an input node
+                assert_eq!(node.kind, NodeKind::Input);
+                assert_eq!(node.batch_inputs.len(), 1);
+                // `batch_inputs` is node-local: index by position (0), not by global batch id.
 
-        let pipelined = node
-            .array
-            .as_pipelined()
-            .vortex_expect("Array in pipeline DAG does not support pipelined execution");
+                let batch = batch_inputs[0]
+                    .take()
+                    .vortex_expect("Batch input vector has already been consumed")
+                    .into_mut();
 
-        kernels.push(pipelined.bind(&bind_context)?);
+                Box::new(InputKernel::new(batch))
+            }
+            Some(pipelined) => {
+                let bind_context = PipelineBindContext {
+                    children: &input_ids,
+                    batch_inputs: &mut batch_inputs,
+                };
+                pipelined.bind(&bind_context)?
+            }
+        });
     }
     Ok(kernels)
 }
 
 struct PipelineBindContext<'a> {
+    /// Vector ids handed out by `pipelined_input`, indexed by pipelined child position.
     children: &'a [VectorId],
-    batch_inputs: &'a [Vector],
+    /// Batch vectors handed out by `batch_input`, indexed by batch child position; a slot
+    /// becomes `None` once its vector has been taken.
+    batch_inputs: &'a mut [Option<Vector>],
 }
 
 impl BindContext for PipelineBindContext<'_> {
+    /// Returns the vector id stored in `children` at `pipelined_child_idx`.
+    ///
+    /// Panics if the index is out of bounds.
     fn pipelined_input(&self, pipelined_child_idx: usize) -> VectorId {
         self.children[pipelined_child_idx]
     }
 
-    fn batch_input(&self, batch_child_idx: usize) -> Vector {
-        self.batch_inputs[batch_child_idx].clone()
+    /// Moves the batch vector at `batch_child_idx` out of the context, leaving `None` behind.
+    ///
+    /// Each slot can therefore be taken only once: asking for the same index again trips the
+    /// `vortex_expect` below, and an out-of-bounds index panics on the slice access.
+    // NOTE(review): this takes `&mut self`, yet `bind_kernels` passes `&bind_context` to
+    // `bind` — confirm the `BindContext`/`bind` signatures (not visible here) permit the call.
+    fn batch_input(&mut self, batch_child_idx: usize) -> Vector {
+        self.batch_inputs[batch_child_idx]
+            .take()
+            .vortex_expect("Batch input vector has already been consumed")
     }
 }
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex_error::{VortexExpect, VortexResult};
+use vortex_vector::{VectorMut, VectorMutOps, VectorOps};
+
+use crate::pipeline::bit_view::BitView;
+use crate::pipeline::{Kernel, KernelCtx, N};
+
+/// A kernel that feeds a batch vector into the pipeline in chunks of at most `N` elements.
+///
+/// `step` normally splits a chunk off the owned batch and unsplits it onto the output, which is
+/// intended to be zero-copy. A final chunk shorter than `N` whose selection does not cover all
+/// remaining elements is copied into the output element-by-element instead.
+pub(super) struct InputKernel {
+    /// The part of the batch that has not been emitted yet. Left as `None` after `step` has
+    /// consumed the batch through its copying path.
+    batch: Option<VectorMut>,
+}
+
+impl InputKernel {
+    /// Wrap `batch` so that it can be emitted into the pipeline by successive `step` calls.
+    pub(super) fn new(batch: VectorMut) -> Self {
+        let batch = Some(batch);
+        Self { batch }
+    }
+}
+
+impl Kernel for InputKernel {
+    /// Emit the next chunk of the owned batch into `out`.
+    ///
+    /// Up to `N` leading elements are split off the batch and unsplit onto `out`; the rest is
+    /// kept for the next call. When fewer than `N` elements remain and `selection` has fewer set
+    /// bits than that, only the selected elements are copied into `out`, one at a time, and the
+    /// kernel is left without a batch.
+    ///
+    /// Fails via `vortex_expect` if the kernel no longer holds a batch.
+    fn step(
+        &mut self,
+        _ctx: &KernelCtx,
+        selection: &BitView,
+        out: &mut VectorMut,
+    ) -> VortexResult<()> {
+        let mut pending = self
+            .batch
+            .take()
+            .vortex_expect("Input kernel has already been exhausted");
+        let available = pending.len();
+
+        // Splitting a chunk off the batch and unsplitting it onto the output is meant to be
+        // zero-copy whether `out` is the pipeline's root output (handed back to us every step)
+        // or an intermediate vector (empty at the start of every step).
+        //
+        // That approach cannot be used for a short final chunk whose selection is not simply a
+        // dense prefix of the remaining elements, so that case copies the selected elements.
+        if available < N && selection.true_count() < available {
+            // TODO(ngates): this is slow. We should instead unsplit the vector, and then
+            //  manually run a compaction over the vector.
+            let frozen = pending.freeze();
+            selection.iter_ones(|selected| {
+                let element = frozen.slice(selected..selected + 1);
+                out.extend_from_vector(&element);
+            });
+            return Ok(());
+        }
+
+        // `split_off` keeps the first `chunk_len` elements in `pending` and returns everything
+        // after them, so `pending` is the chunk to emit and `tail` is what remains for the next
+        // step.
+        let chunk_len = N.min(available);
+        let tail = pending.split_off(chunk_len);
+
+        out.unsplit(pending);
+        self.batch = Some(tail);
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use vortex_buffer::{bitbuffer, buffer};
+    use vortex_dtype::PTypeDowncastExt;
+    use vortex_mask::Mask;
+
+    use crate::pipeline::driver::PipelineDriver;
+    use crate::{Array, ArrayOperator, IntoArray};
+
+    /// With every row selected, a non-pipelined array must come out of the driver unchanged.
+    #[test]
+    fn test_pipeline_input() {
+        let input = buffer![123u32; 8000].into_array();
+        assert!(
+            input.as_pipelined().is_none(),
+            "We're explicitly testing non-pipelined arrays"
+        );
+
+        let all_rows = Mask::new_true(input.len());
+        let executed = PipelineDriver::new(input).execute(&all_rows).unwrap();
+        let values = executed.into_primitive().downcast::<u32>();
+        assert_eq!(values.elements().as_ref(), &[123u32; 8000]);
+    }
+
+    /// With a sparse selection, only the selected rows of a non-pipelined array are produced.
+    #[test]
+    fn test_pipeline_input_with_selection() {
+        let input = buffer![0u32, 1, 2, 3, 4].into_array();
+        assert!(
+            input.as_pipelined().is_none(),
+            "We're explicitly testing non-pipelined arrays"
+        );
+
+        let every_other = Mask::from(bitbuffer![1 0 1 0 1]);
+        let executed = PipelineDriver::new(input).execute(&every_other).unwrap();
+        let values = executed.into_primitive().downcast::<u32>();
+        assert_eq!(values.elements().as_ref(), &[0u32, 2, 4]);
+    }
+}
@@ -3,6 +3,7 @@
 
 pub mod allocation;
 mod bind;
+mod input;
 mod toposort;
 
 use std::hash::{BuildHasher, Hash, Hasher};
@@ -18,7 +19,7 @@ use crate::pipeline::bit_view::{BitView, BitViewExt};
 use crate::pipeline::driver::allocation::{OutputTarget, allocate_vectors};
 use crate::pipeline::driver::bind::bind_kernels;
 use crate::pipeline::driver::toposort::topological_sort;
-use crate::pipeline::{ElementPosition, Kernel, KernelCtx, N, PipelineInputs, PipelineVector};
+use crate::pipeline::{Kernel, KernelCtx, N, PipelineInputs, PipelineVector};
 use crate::{Array, ArrayEq, ArrayHash, ArrayOperator, ArrayRef, ArrayVisitor, Precision};
 
 /// A pipeline driver takes a Vortex array and executes it into a canonical vector.
@@ -59,12 +60,10 @@ struct Node {
     batch_inputs: Vec<BatchId>,
 }
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum NodeKind {
-    /// A view node acts as a pipeline source, but is fed into the pipeline by taking zero-copy
-    /// slices of a batch vector. This occurs when a node declares its child as pipelined, but the
-    /// child itself doesn't support pipelined execution.
-    View,
+    /// An input node feeds a batch vector into the pipeline chunk-by-chunk.
+    Input,
     /// A source node provides input to the pipeline by writing into mutable output vectors one
     /// batch at a time.
     Source,
@@ -102,7 +101,7 @@ impl PipelineDriver {
 
                     Node {
                         array,
-                        kind: NodeKind::View,
+                        kind: NodeKind::Input,
                         children: vec![],
                         parents: vec![],
                         batch_inputs: vec![batch_id],
@@ -200,7 +199,7 @@ impl PipelineDriver {
         let batch_inputs: Vec<_> = self
             .batch_inputs
             .into_iter()
-            .map(|array| array.execute())
+            .map(|array| array.execute().map(Some))
             .try_collect()?;
 
         // Compute the toposort of the DAG
@@ -210,7 +209,7 @@ impl PipelineDriver {
         let allocation_plan = allocate_vectors(&self.dag, &exec_order)?;
 
         // Bind each node in the DAG to create its kernel
-        let kernels = bind_kernels(&self.dag, &allocation_plan, &batch_inputs)?;
+        let kernels = bind_kernels(&self.dag, &allocation_plan, batch_inputs)?;
 
         // Construct the kernel execution context
         let ctx = KernelCtx::new(allocation_plan.vectors);
@@ -279,47 +278,61 @@ impl Pipeline {
             // take the intermediate vector and write into that.
             match &self.output_targets[node_idx] {
                 OutputTarget::ExternalOutput => {
+                    assert!(
+                        output.capacity() >= N,
+                        "Insufficient capacity in external output vector"
+                    );
+
                     let prev_output_len = output.len();
-                    let position = kernel.step(&self.ctx, selection, output)?;
-                    if output.len() != prev_output_len + N {
-                        vortex_bail!(
-                            "Kernel produced incorrect number of output elements, expected {}, got {}",
-                            prev_output_len + N,
-                            output.len()
-                        );
-                    }
+                    kernel.step(&self.ctx, selection, output)?;
 
-                    match position {
-                        ElementPosition::Sparse => {
-                            // The output is in sparse form, we need to compact it based on the
-                            // selection mask.
-                            // TODO(ngates): we need to implement compaction here.
-                            todo!()
+                    let added_len = output.len() - prev_output_len;
+                    match added_len {
+                        N => {
+                            // If the kernel added N elements, the output is in-place.
+                            // TODO(ngates): we need to filter if the true count is not N.
                         }
-                        ElementPosition::Compact => {
-                            // The output is already compacted, we just need to adjust the length
-                            // to cover only the selected elements.
-                            output.truncate(prev_output_len + selection.true_count());
+                        _ if added_len == selection.true_count() => {
+                            // If the kernel added exactly the number of selected elements,
+                            // the output is already compacted into the start of the vector.
                         }
+                        _ => vortex_bail!(
+                            "Kernel produced incorrect number of output elements, expected to append either {} or {}, got {}",
+                            N,
+                            selection.true_count(),
+                            added_len
+                        ),
                     }
                 }
                 OutputTarget::IntermediateVector(vector_id) => {
                     let mut out_vector = VectorMut::from(self.ctx.take_output(vector_id));
                     out_vector.clear();
-
-                    let position = kernel.step(&self.ctx, selection, &mut out_vector)?;
-                    if out_vector.len() != N {
-                        vortex_bail!(
-                            "Kernel produced incorrect number of output elements, expected {}, got {}",
+                    assert!(
+                        out_vector.capacity() >= N,
+                        "Insufficient capacity in intermediate vector"
+                    );
+
+                    kernel.step(&self.ctx, selection, &mut out_vector)?;
+
+                    match out_vector.len() {
+                        N => {
+                            // If the kernel added N elements, the output is in-place.
+                            self.ctx
+                                .replace_output(vector_id, PipelineVector::InPlace(out_vector));
+                        }
+                        _ if out_vector.len() == selection.true_count() => {
+                            // If the kernel added exactly the number of selected elements,
+                            // the output is already compacted into the start of the vector.
+                            self.ctx
+                                .replace_output(vector_id, PipelineVector::Compact(out_vector));
+                        }
+                        _ => vortex_bail!(
+                            "Kernel produced incorrect number of output elements, expected to append either {} or {}, got {}",
                             N,
+                            selection.true_count(),
                             out_vector.len()
-                        );
+                        ),
                     }
-
-                    // Wrap the output vector back into a PipelineVector, indicating which position
-                    // the elements are in.
-                    let out_vector = PipelineVector::from_position(position, out_vector);
-                    self.ctx.replace_output(vector_id, out_vector)
                 }
             };
         }
Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,7 @@ pub(super) fn allocate_vectors(`
`86`	`86`	`.collect(),`
`87`	`87`	`vectors: allocation_types`
`88`	`88`	`.into_iter()`
`89`		`- .map(\|dtype\| PipelineVector::Compact(VectorMut::with_capacity(dtype, 2 * N)))`
	`89`	`+ .map(\|dtype\| PipelineVector::Compact(VectorMut::with_capacity(dtype, N)))`
`90`	`90`	`.collect(),`
`91`	`91`	`})`
`92`	`92`	`}`