ALP pipeline kernel

gatesn · gatesn · commit afc449a2a00e · 2025-11-12T08:53:57.000-05:00
Signed-off-by: Nicholas Gates <nick@nickgates.com>
diff --git a/encodings/alp/src/alp/operator.rs b/encodings/alp/src/alp/operator.rs
@@ -5,12 +5,12 @@ use crate::{match_each_alp_float_ptype, ALPArray, ALPFloat, ALPVTable, Exponents
 use std::marker::PhantomData;
 use vortex_array::pipeline::bit_view::BitView;
 use vortex_array::pipeline::{
-    BindContext, Kernel, KernelCtx, PipelineInputs, PipelinedNode, Position, VectorId, N,
+    BindContext, Kernel, KernelCtx, PipelineInputs, PipelinedNode, VectorId, N,
 };
 use vortex_array::vtable::OperatorVTable;
 use vortex_dtype::PTypeDowncastExt;
 use vortex_error::{vortex_bail, VortexResult};
-use vortex_vector::VectorMut;
+use vortex_vector::{VectorMut, VectorMutOps};
 
 impl OperatorVTable<ALPVTable> for ALPVTable {
     fn pipeline_node(array: &ALPArray) -> Option<&dyn PipelinedNode> {
@@ -69,49 +69,37 @@ impl<A: ALPFloat> Kernel for UnpatchedALPKernel<A> {
 
         let decoded = out.as_primitive_mut().downcast::<A>();
 
-        match encoded.position() {
-            Position::InPlace => {
-                // TODO(ngates): tune the threshold
-                if selection.true_count() < (N / 8) {
-                    // Operate only over the selected elements, appending `true_count` elements
-                    unsafe {
-                        decoded
-                            .validity_mut()
-                            .append_n(true, selection.true_count())
-                    };
-                    unsafe { decoded.elements_mut().set_len(selection.true_count()) };
-                    let decoded_buf = unsafe { decoded.elements_mut() };
+        // If our input is in-place, and we have only a few selected elements, then iterate only
+        // the selected elements and write them to the output.
+        if encoded_buf.len() == N && selection.true_count() < (N / 8) {
+            // `reserve` takes an additional count from the (zero) length, not a capacity shortfall.
+            decoded.reserve(selection.true_count());
 
-                    let mut out_pos = 0;
-                    selection.iter_ones(|idx| {
-                        let encoded = unsafe { encoded_buf.get_unchecked(idx) };
-                        let decoded = A::decode_single(*encoded, self.exponents);
-                        *unsafe { decoded_buf.get_unchecked_mut(out_pos) } = decoded;
-                        out_pos += 1;
-                    })
-                } else {
-                    // Operate over all N elements, appending N elements
-                    assert_eq!(encoded_buf.len(), N);
-                    decoded.extend(
-                        encoded_buf
-                            .iter()
-                            .map(|e| A::decode_single(*e, self.exponents)),
-                    );
-                }
-            }
-            Position::Compact => {
-                // Loop over the compacted input elements
-                decoded.extend(
-                    encoded
-                        .as_primitive()
-                        .downcast::<A::ALPInt>()
-                        .elements()
-                        .iter()
-                        .map(|e| A::decode_single(*e, self.exponents)),
-                )
-            }
+            // SAFETY: we set_len and append_validity ensuring elements len matches validity len.
+            unsafe { decoded.validity_mut() }.append_n(true, selection.true_count());
+            unsafe { decoded.elements_mut().set_len(selection.true_count()) };
+
+            // SAFETY: we reserved capacity above.
+            let elements = unsafe { decoded.elements_mut() };
+
+            let mut out_pos = 0;
+            selection.iter_ones(|idx| {
+                let encoded = unsafe { encoded_buf.get_unchecked(idx) };
+                let decoded_value = A::decode_single(*encoded, self.exponents);
+                unsafe { *elements.get_unchecked_mut(out_pos) = decoded_value };
+                out_pos += 1;
+            });
+
+            debug_assert_eq!(decoded.validity().len(), decoded.elements().len());
+            return Ok(());
         }
 
+        // Otherwise, iterate the entire input.
+        decoded.extend(
+            encoded_buf
+                .iter()
+                .map(|e| A::decode_single(*e, self.exponents)),
+        );
         Ok(())
     }
 }
diff --git a/vortex-array/src/pipeline/driver/allocation.rs b/vortex-array/src/pipeline/driver/allocation.rs
@@ -7,15 +7,15 @@ use vortex_error::{VortexExpect, VortexResult};
 use vortex_vector::VectorMut;
 
 use crate::pipeline::driver::{Node, NodeId};
-use crate::pipeline::{PipelineVector, VectorId, N};
+use crate::pipeline::{VectorId, N};
 use crate::Array;
 
 #[derive(Debug)]
 pub struct VectorAllocation {
     /// Where each node writes its output
     pub(crate) output_targets: Vec<OutputTarget>,
     /// The actual allocated vectors
-    pub(crate) vectors: Vec<PipelineVector>,
+    pub(crate) vectors: Vec<VectorMut>,
 }
 
 // TODO(joe): support in-place view operations
@@ -86,7 +86,7 @@ pub(super) fn allocate_vectors(
             .collect(),
         vectors: allocation_types
             .into_iter()
-            .map(|dtype| PipelineVector::new_compact(VectorMut::with_capacity(dtype, N)))
+            .map(|dtype| VectorMut::with_capacity(dtype, N))
             .collect(),
     })
 }
diff --git a/vortex-array/src/pipeline/driver/mod.rs b/vortex-array/src/pipeline/driver/mod.rs
@@ -19,7 +19,7 @@ use crate::pipeline::bit_view::{BitView, BitViewExt};
 use crate::pipeline::driver::allocation::{allocate_vectors, OutputTarget};
 use crate::pipeline::driver::bind::bind_kernels;
 use crate::pipeline::driver::toposort::topological_sort;
-use crate::pipeline::{Kernel, KernelCtx, PipelineInputs, PipelineVector, N};
+use crate::pipeline::{Kernel, KernelCtx, PipelineInputs, N};
 use crate::{Array, ArrayEq, ArrayHash, ArrayOperator, ArrayRef, ArrayVisitor, Precision};
 
 /// A pipeline driver takes a Vortex array and executes it into a canonical vector.
@@ -278,58 +278,38 @@ impl Pipeline {
             // take the intermediate vector and write into that.
             match &self.output_targets[node_idx] {
                 OutputTarget::ExternalOutput => {
-                    assert!(
-                        output.capacity() >= N,
-                        "Insufficient capacity in external output vector"
-                    );
-
-                    let prev_output_len = output.len();
-                    kernel.step(&self.ctx, selection, output)?;
-
-                    let added_len = output.len() - prev_output_len;
-                    match added_len {
-                        N => {
-                            // If the kernel added N elements, the output is in-place.
-                            // TODO(ngates): we need to filter if the true count is not N.
-                        }
-                        _ if added_len == selection.true_count() => {
-                            // If the kernel added exactly the number of selected elements,
-                            // the output is already compacted into the start of the vector.
-                        }
-                        _ => vortex_bail!(
-                            "Kernel produced incorrect number of output elements, expected to append either {} or {}, got {}",
+                    // We split off the next N elements of capacity from the external output vector.
+                    let mut tail = output.split_off(output.len());
+                    assert!(tail.is_empty());
+
+                    kernel.step(&self.ctx, selection, &mut tail)?;
+                    if tail.len() != N && tail.len() != selection.true_count() {
+                        vortex_bail!(
+                            "Kernel produced incorrect number of output elements, expected either {} or {}, got {}",
                             N,
                             selection.true_count(),
-                            added_len
-                        ),
+                            tail.len()
+                        );
                     }
+
+                    // Now we append the produced output back to the main output vector.
+                    output.unsplit(tail);
                 }
                 OutputTarget::IntermediateVector(vector_id) => {
-                    let mut out_vector = self.ctx.take_output(vector_id).into_vector();
+                    let mut out_vector = self.ctx.take_output(vector_id);
                     out_vector.clear();
-                    assert!(
-                        out_vector.capacity() >= N,
-                        "Insufficient capacity in intermediate vector"
-                    );
 
+                    assert!(out_vector.is_empty());
                     kernel.step(&self.ctx, selection, &mut out_vector)?;
 
                     match out_vector.len() {
-                        N => {
+                        // Valid cases are all N elements, or only the selected elements.
+                        n if n == N || n == selection.true_count() => {
                             // If the kernel added N elements, the output is in-place.
-                            self.ctx.replace_output(
-                                vector_id,
-                                PipelineVector::new_in_place(out_vector),
-                            );
-                        }
-                        _ if out_vector.len() == selection.true_count() => {
-                            // If the kernel added exactly the number of selected elements,
-                            // the output is already compacted into the start of the vector.
-                            self.ctx
-                                .replace_output(vector_id, PipelineVector::new_compact(out_vector));
+                            self.ctx.replace_output(vector_id, out_vector);
                         }
                         _ => vortex_bail!(
-                            "Kernel produced incorrect number of output elements, expected to append either {} or {}, got {}",
+                            "Kernel produced incorrect number of output elements, expected either {} or {}, got {}",
                             N,
                             selection.true_count(),
                             out_vector.len()
diff --git a/vortex-array/src/pipeline/mod.rs b/vortex-array/src/pipeline/mod.rs
@@ -68,19 +68,19 @@ pub trait BindContext {
 /// Each step of the kernel processes zero or more input vectors, and writes output to a
 /// pre-allocated mutable output vector.
 ///
-/// Input vectors are provided via the [`KernelCtx`] and indicate the position of their elements
-/// as either [`PipelineVector::InPlace`] or [`PipelineVector::Compact`] based on whether
-/// the selected elements are in their original positions or compacted at the start of the vector
-/// respectively.
+/// Input vectors will have either length [`N`], indicating that all elements from the step are
+/// present, or length equal to the [`BitView::true_count`] of the selection mask, in which case
+/// only the selected elements are present.
 ///
-/// The provided mutable output vector is guaranteed to have at least `N` elements of capacity.
-/// The kernel **must** append either [`BitView::true_count`] elements to the output vector (in
-/// which case the output elements are considered to be in the "Compact" position), or it must
-/// append `N` elements (in which case the output elements are considered to be in their "InPlace"
-/// positions). The pipeline driver will assert these conditions after each step.
+/// Output vectors will always be passed with length zero.
 ///
-/// Note that the output vector may not be empty at the start of the step. The kernel must append
-/// its output to the existing contents of the output vector, rather than replacing it.
+/// Kernels may choose to output either all `N` elements in their original positions, or output
+/// only the selected elements to the first `true_count` positions of the output vector. When
+/// emitting `N` elements in-place, the kernel may omit expensive computations over the unselected
+/// elements, provided that the output elements in those positions are still valid (i.e. typically
+/// zeroed, rather than undefined).
+///
+/// The pipeline driver will verify these conditions before and after each step.
 pub trait Kernel: Send {
     /// Perform a single step of the kernel.
     fn step(
@@ -93,11 +93,11 @@ pub trait Kernel: Send {
 
 /// The context provided to kernels during execution to access input vectors.
 pub struct KernelCtx {
-    vectors: Vec<Option<PipelineVector>>,
+    vectors: Vec<Option<VectorMut>>,
 }
 
 impl KernelCtx {
-    fn new(vectors: Vec<PipelineVector>) -> Self {
+    fn new(vectors: Vec<VectorMut>) -> Self {
         Self {
             vectors: vectors.into_iter().map(Some).collect(),
         }
@@ -113,21 +113,21 @@ impl KernelCtx {
     ///
     /// If the input vector at the given index is not available (typically because the vector
     /// happens to be currently borrowed as an output vector!).
-    pub fn input(&self, id: VectorId) -> &PipelineVector {
+    pub fn input(&self, id: VectorId) -> &VectorMut {
         self.vectors[id.0]
             .as_ref()
             .vortex_expect("Input vector at index is not available")
     }
 
     #[inline]
-    fn take_output(&mut self, id: &VectorId) -> PipelineVector {
+    fn take_output(&mut self, id: &VectorId) -> VectorMut {
         self.vectors[id.0]
             .take()
             .vortex_expect("Output vector at index is not available")
     }
 
     #[inline]
-    fn replace_output(&mut self, id: &VectorId, vec: PipelineVector) {
+    fn replace_output(&mut self, id: &VectorId, vec: VectorMut) {
         self.vectors[id.0] = Some(vec);
     }
 }
@@ -141,53 +141,3 @@ impl VectorId {
         VectorId(idx)
     }
 }
-
-/// A pipeline vector passed into and out of pipeline kernels.
-#[derive(Debug)]
-pub struct PipelineVector {
-    vector: VectorMut,
-    position: Position,
-}
-
-/// Describes the position of the selected elements in a pipeline vector.
-#[derive(Debug, Clone, Copy, Eq, PartialEq)]
-pub enum Position {
-    /// `InPlace` indicates that elements are in their original positions, where the selected
-    /// elements are identified by true values in the selection mask.
-    InPlace,
-    /// Compact indicates that the selected elements are compacted at the start of the vector in
-    /// positions `0..true_count`.
-    Compact,
-}
-
-impl PipelineVector {
-    pub fn new_in_place(vector: VectorMut) -> Self {
-        Self {
-            vector,
-            position: Position::InPlace,
-        }
-    }
-
-    pub fn new_compact(vector: VectorMut) -> Self {
-        Self {
-            vector,
-            position: Position::Compact,
-        }
-    }
-
-    pub fn position(&self) -> Position {
-        self.position
-    }
-
-    pub fn into_vector(self) -> VectorMut {
-        self.vector
-    }
-}
-
-impl Deref for PipelineVector {
-    type Target = VectorMut;
-
-    fn deref(&self) -> &Self::Target {
-        &self.vector
-    }
-}