vortex-data
diff --git a/vortex-cuda/benches/dynamic_dispatch_cuda.rs b/vortex-cuda/benches/dynamic_dispatch_cuda.rs
Lines changed: 6 additions & 4 deletions
diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs
Lines changed: 53 additions & 40 deletions
@@ -40,8 +40,8 @@ use vortex_cuda::CudaDeviceBuffer;
 use vortex_cuda::CudaExecutionCtx;
 use vortex_cuda::CudaSession;
 use vortex_cuda::dynamic_dispatch::CudaDispatchPlan;
+use vortex_cuda::dynamic_dispatch::DispatchPlan;
 use vortex_cuda::dynamic_dispatch::MaterializedPlan;
-use vortex_cuda::dynamic_dispatch::UnmaterializedPlan;
 use vortex_cuda_macros::cuda_available;
 use vortex_cuda_macros::cuda_not_available;
 
@@ -123,13 +123,15 @@ struct BenchRunner {
 
 impl BenchRunner {
     fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &CudaExecutionCtx) -> Self {
+        let plan = match DispatchPlan::new(array).vortex_expect("build_dyn_dispatch_plan") {
+            DispatchPlan::Fused(plan) => plan,
+            _ => panic!("encoding not fusable"),
+        };
         let MaterializedPlan {
             dispatch_plan,
             device_buffers,
             shared_mem_bytes,
-        } = UnmaterializedPlan::new(array)
-            .and_then(|p| p.materialize(cuda_ctx))
-            .vortex_expect("build_dyn_dispatch_plan");
+        } = plan.materialize(cuda_ctx).vortex_expect("materialize plan");
 
         let device_plan = Arc::new(
             cuda_ctx
 
@@ -3,17 +3,13 @@
 
 //! Host interface for dynamic CUDA kernel dispatch.
 //!
-//! An [`UnmaterializedPlan`] walks an encoding tree (e.g., `ALP(FoR(BitPacked))`)
-//! and flattens it into a linear sequence of stages. Call
-//! [`materialize`](UnmaterializedPlan::materialize) to copy source buffers to
-//! the device, producing a [`MaterializedPlan`] ready for kernel launch.
+//! [`DispatchPlan::new`] walks an encoding tree (e.g., `ALP(FoR(BitPacked))`)
+//! in a single pass and returns one of three variants:
 //!
-//! For partially-fusable trees, [`find_unfusable_nodes`] identifies nodes
-//! that need separate kernels, and [`UnmaterializedPlan::new_with_subtree_inputs`] builds a plan
-//!  that incorporates their pre-executed arrays.
-//!
-//! Shared memory is dynamically sized at launch time via
-//! [`UnmaterializedPlan::shared_mem_bytes`].
+//! - [`Fused`](DispatchPlan::Fused) — call [`FusedPlan::materialize`].
+//! - [`PartiallyFused`](DispatchPlan::PartiallyFused) — execute pending
+//!   subtrees, then call [`FusedPlan::materialize_with_subtrees`].
+//! - [`Unfused`](DispatchPlan::Unfused) — fall back to single-kernel dispatch.
 
 #![allow(non_upper_case_globals)]
 #![allow(non_camel_case_types)]
@@ -47,9 +43,9 @@ use crate::CudaDeviceBuffer;
 use crate::executor::CudaExecutionCtx;
 
 pub(crate) mod plan_builder;
+pub use plan_builder::DispatchPlan;
+pub use plan_builder::FusedPlan;
 pub use plan_builder::MaterializedPlan;
-pub use plan_builder::UnmaterializedPlan;
-pub use plan_builder::find_unfusable_nodes;
 
 include!(concat!(env!("OUT_DIR"), "/dynamic_dispatch.rs"));
 
@@ -449,17 +445,18 @@ mod tests {
     use vortex::session::VortexSession;
 
     use super::CudaDispatchPlan;
+    use super::DispatchPlan;
     use super::SMEM_TILE_SIZE;
     use super::ScalarOp;
     use super::SourceOp;
     use super::Stage;
-    use super::UnmaterializedPlan;
+    use super::*;
     use crate::CudaBufferExt;
     use crate::CudaDeviceBuffer;
     use crate::CudaExecutionCtx;
     use crate::session::CudaSession;
 
-    fn make_bitpacked_array_u32(bit_width: u8, len: usize) -> BitPackedArray {
+    fn bitpacked_array_u32(bit_width: u8, len: usize) -> BitPackedArray {
         let max_val = (1u64 << bit_width).saturating_sub(1);
         let values: Vec<u32> = (0..len)
             .map(|i| ((i as u64) % (max_val + 1)) as u32)
@@ -469,6 +466,16 @@ mod tests {
             .vortex_expect("failed to create BitPacked array")
     }
 
+    /// Builds a [`DispatchPlan`] for `array` and materializes it with `ctx`.
+    ///
+    /// Errors from planning or materialization are propagated. Any variant
+    /// other than [`DispatchPlan::Fused`] is reported as an error.
+    fn dispatch_plan(
+        array: &vortex::array::ArrayRef,
+        ctx: &CudaExecutionCtx,
+    ) -> VortexResult<MaterializedPlan> {
+        let DispatchPlan::Fused(fused) = DispatchPlan::new(array)? else {
+            vortex_bail!("array encoding not fusable");
+        };
+        fused.materialize(ctx)
+    }
+
     #[crate::test]
     fn test_max_scalar_ops() -> VortexResult<()> {
         let bit_width: u8 = 6;
@@ -481,7 +488,7 @@ mod tests {
             .map(|i| ((i as u64) % (max_val + 1)) as u32 + total_reference)
             .collect();
 
-        let bitpacked = make_bitpacked_array_u32(bit_width, len);
+        let bitpacked = bitpacked_array_u32(bit_width, len);
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
         let packed = bitpacked.packed().clone();
         let device_input = futures::executor::block_on(cuda_ctx.ensure_on_device(packed))?;
@@ -669,9 +676,9 @@ mod tests {
             .map(|i| ((i as u64) % (max_val + 1)) as u32)
             .collect();
 
-        let bp = make_bitpacked_array_u32(bit_width, len);
+        let bp = bitpacked_array_u32(bit_width, len);
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&bp.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&bp.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -692,11 +699,11 @@ mod tests {
             .collect();
         let expected: Vec<u32> = raw.iter().map(|&v| v + reference).collect();
 
-        let bp = make_bitpacked_array_u32(bit_width, len);
+        let bp = bitpacked_array_u32(bit_width, len);
         let for_arr = FoRArray::try_new(bp.into_array(), Scalar::from(reference))?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&for_arr.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&for_arr.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -722,7 +729,7 @@ mod tests {
         let re = RunEndArray::new(ends_arr, values_arr);
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&re.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&re.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -755,7 +762,7 @@ mod tests {
         let dict = DictArray::try_new(codes_bp.into_array(), dict_for.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&dict.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -787,7 +794,7 @@ mod tests {
         );
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&tree.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&tree.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dispatch_plan_f32(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -816,7 +823,7 @@ mod tests {
         let zz = ZigZagArray::try_new(bp.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&zz.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&zz.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -845,7 +852,7 @@ mod tests {
         let for_arr = FoRArray::try_new(re.into_array(), Scalar::from(reference))?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&for_arr.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&for_arr.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -874,7 +881,7 @@ mod tests {
         let for_arr = FoRArray::try_new(dict.into_array(), Scalar::from(reference))?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&for_arr.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&for_arr.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -902,7 +909,7 @@ mod tests {
         let dict = DictArray::try_new(codes_for.into_array(), values_prim.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&dict.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -927,7 +934,7 @@ mod tests {
         let dict = DictArray::try_new(codes_bp.into_array(), values_prim.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&dict.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?;
 
         let actual =
             run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?;
@@ -946,8 +953,11 @@ mod tests {
         let values_prim = PrimitiveArray::new(Buffer::from(dict_values), NonNullable);
         let dict = DictArray::try_new(codes_prim.into_array(), values_prim.into_array())?;
 
-        // UnmaterializedPlan::new should fail because u8 codes != u32 values in byte width.
-        assert!(UnmaterializedPlan::new(&dict.into_array()).is_err());
+        // DispatchPlan::new should return Unfused because u8 codes != u32 values in byte width.
+        assert!(matches!(
+            DispatchPlan::new(&dict.into_array())?,
+            DispatchPlan::Unfused
+        ));
 
         Ok(())
     }
@@ -961,8 +971,11 @@ mod tests {
         let values_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array();
         let re = RunEndArray::new(ends_arr, values_arr);
 
-        // UnmaterializedPlan::new should fail because u64 ends != i32 values in byte width.
-        assert!(UnmaterializedPlan::new(&re.into_array()).is_err());
+        // DispatchPlan::new should return Unfused because u64 ends != i32 values in byte width.
+        assert!(matches!(
+            DispatchPlan::new(&re.into_array())?,
+            DispatchPlan::Unfused
+        ));
 
         Ok(())
     }
@@ -997,7 +1010,7 @@ mod tests {
         let expected: Vec<u32> = data[slice_start..slice_end].to_vec();
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&sliced)?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&sliced, &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1048,7 +1061,7 @@ mod tests {
         let expected: Vec<u32> = all_decoded[slice_start..slice_end].to_vec();
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&sliced)?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&sliced, &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1098,7 +1111,7 @@ mod tests {
             .collect();
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&sliced)?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&sliced, &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1143,7 +1156,7 @@ mod tests {
         let expected: Vec<u32> = data[slice_start..slice_end].to_vec();
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&sliced)?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&sliced, &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1192,7 +1205,7 @@ mod tests {
         let expected: Vec<u32> = all_decoded[slice_start..slice_end].to_vec();
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&sliced)?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&sliced, &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1244,7 +1257,7 @@ mod tests {
         let expected: Vec<u32> = all_decoded[slice_start..slice_end].to_vec();
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&sliced)?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&sliced, &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1301,7 +1314,7 @@ mod tests {
         let expected: Vec<u32> = all_decoded[slice_start..slice_end].to_vec();
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&sliced)?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&sliced, &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1333,7 +1346,7 @@ mod tests {
         let seq = SequenceArray::try_new_typed(base, multiplier, Nullability::NonNullable, len)?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&seq.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&seq.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(
             &cuda_ctx,
@@ -1366,7 +1379,7 @@ mod tests {
         let seq = SequenceArray::try_new_typed(base, multiplier, Nullability::NonNullable, len)?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let plan = UnmaterializedPlan::new(&seq.into_array())?.materialize(&cuda_ctx)?;
+        let plan = dispatch_plan(&seq.into_array(), &cuda_ctx)?;
 
         let actual_u32 = run_dynamic_dispatch_plan(
             &cuda_ctx,