scans

robert3005 · robert3005 · commit 6e11e9873893 · 2025-10-27T16:59:50.000Z
Signed-off-by: Robert Kruszewski <github@robertk.io>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/vortex-gpu/src/bit_unpack.rs b/vortex-gpu/src/bit_unpack.rs
@@ -9,8 +9,7 @@ use std::time::Duration;
 
 use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
 use cudarc::driver::{
-    CudaContext, CudaFunction, CudaSlice, CudaStream, CudaViewMut, DeviceRepr, LaunchConfig,
-    PushKernelArg,
+    CudaContext, CudaFunction, CudaSlice, CudaStream, DeviceRepr, LaunchConfig, PushKernelArg,
 };
 use cudarc::nvrtc::Ptx;
 use vortex_array::Canonical;
@@ -22,6 +21,7 @@ use vortex_error::{VortexExpect, VortexResult, vortex_err};
 use vortex_fastlanes::BitPackedArray;
 
 use crate::task::GPUTask;
+use crate::{ErasedCudaSlice, GpuArray};
 
 #[derive(Hash, PartialEq, Eq, Debug)]
 struct UnpackKernelId {
@@ -189,7 +189,7 @@ impl<P: NativePType + DeviceRepr> GPUTask for BitPackingTask<P> {
             .map(|_| ())
     }
 
-    fn export_result(&mut self) -> VortexResult<Canonical> {
+    fn export_result(&mut self) -> VortexResult<GpuArray> {
         let mut buffer = BufferMut::<P>::with_capacity(self.len());
 
         unsafe { buffer.set_len(self.len()) }
@@ -204,12 +204,8 @@ impl<P: NativePType + DeviceRepr> GPUTask for BitPackingTask<P> {
         ))
     }
 
-    fn output(&mut self) -> CudaViewMut<'_, u8> {
-        unsafe {
-            self.unpacked
-                .transmute_mut(self.len() * size_of::<P>())
-                .vortex_expect("Failed to transmute")
-        }
+    fn output(&mut self) -> ErasedCudaSlice {
+        ErasedCudaSlice::new(self.unpacked)
     }
 
     fn len(&self) -> usize {
diff --git a/vortex-gpu/src/buffer/gpu_buf.rs b/vortex-gpu/src/buffer/gpu_buf.rs
@@ -10,6 +10,7 @@ use vortex_error::vortex_panic;
 
 pub struct ErasedCudaSlice {
     ptr: CUdeviceptr,
+    stream: Arc<CudaStream>,
     len: usize,
     ptype: PType,
 }
@@ -18,8 +19,10 @@ impl ErasedCudaSlice {
     pub fn new<T: NativePType>(slice: impl Into<CudaSlice<T>>) -> Self {
         let slice = slice.into();
         let len = slice.len();
+        let stream = slice.stream().clone();
         Self {
             ptr: slice.leak(),
+            stream,
             len,
             ptype: T::PTYPE,
         }
@@ -33,7 +36,7 @@ impl ErasedCudaSlice {
         self.len
     }
 
-    pub fn as_slice<T: NativePType>(&self, stream: &Arc<CudaStream>) -> CudaSlice<T> {
+    pub fn as_slice<T: NativePType>(&self) -> CudaSlice<T> {
         if T::PTYPE != self.ptype() {
             vortex_panic!(
                 "Attempted to get slice of type {} from array of type {}",
@@ -42,6 +45,6 @@ impl ErasedCudaSlice {
             )
         }
 
-        unsafe { stream.upgrade_device_ptr::<T>(self.ptr, self.len) }
+        unsafe { self.stream.upgrade_device_ptr::<T>(self.ptr, self.len) }
     }
 }
diff --git a/vortex-gpu/src/for_.rs b/vortex-gpu/src/for_.rs
@@ -6,19 +6,16 @@ use std::time::Duration;
 
 use cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT;
 use cudarc::driver::{
-    CudaContext, CudaFunction, CudaStream, CudaViewMut, DeviceRepr, LaunchConfig, PushKernelArg,
+    CudaContext, CudaFunction, CudaStream, DeviceRepr, LaunchConfig, PushKernelArg,
 };
 use cudarc::nvrtc::Ptx;
-use vortex_array::Canonical;
 use vortex_array::arrays::PrimitiveArray;
-use vortex_array::validity::Validity;
-use vortex_buffer::BufferMut;
 use vortex_dtype::{NativePType, PType, match_each_native_ptype};
 use vortex_error::{VortexExpect, VortexResult, vortex_err};
 use vortex_fastlanes::{BitPackedVTable, FoRArray};
 
-use crate::bit_unpack;
 use crate::task::GPUTask;
+use crate::{ErasedCudaSlice, GpuArray, GpuPrimitiveArray, bit_unpack};
 
 struct ForTask<P> {
     stream: Arc<CudaStream>,
@@ -71,43 +68,23 @@ fn cuda_for_kernel(ptype: PType, ctx: &Arc<CudaContext>) -> VortexResult<CudaFun
 
 impl<P: NativePType + DeviceRepr> GPUTask for ForTask<P> {
     fn launch_task(&mut self) -> VortexResult<()> {
-        let len = self.len();
         self.bp_task.launch_task()?;
         let mut launch = self.stream.launch_builder(&self.func);
-        let mut view = unsafe {
-            self.bp_task
-                .output()
-                .transmute_mut::<P>(len)
-                .vortex_expect("")
-        };
+        let mut view = self.bp_task.output().as_slice::<P>();
         launch.arg(&mut view);
         launch.arg(&self.reference);
         unsafe { launch.launch(self.launch_config) }
             .map_err(|e| vortex_err!("Failed to launch: {e}"))
             .map(|_| ())
     }
 
-    fn export_result(&mut self) -> VortexResult<Canonical> {
-        let len = self.len();
-        let mut buffer = BufferMut::<P>::with_capacity(len);
-
-        unsafe { buffer.set_len(len) }
-        self.stream
-            .memcpy_dtoh(
-                &unsafe { self.bp_task.output().transmute::<P>(len).vortex_expect("") },
-                &mut buffer,
-            )
-            .map_err(|e| vortex_err!("Failed to copy to device: {e}"))?;
-        self.stream
-            .synchronize()
-            .map_err(|e| vortex_err!("Failed to synchronize: {e}"))?;
-        Ok(Canonical::Primitive(PrimitiveArray::new(
-            buffer,
-            Validity::NonNullable,
-        )))
+    fn export_result(&mut self) -> VortexResult<GpuArray> {
+        Ok(GpuArray::Primitive(GpuPrimitiveArray {
+            values: self.bp_task.output(),
+        }))
     }
 
-    fn output(&mut self) -> CudaViewMut<'_, u8> {
+    fn output(&mut self) -> ErasedCudaSlice {
         self.bp_task.output()
     }
 
diff --git a/vortex-gpu/src/gpu_array.rs b/vortex-gpu/src/gpu_array.rs
@@ -3,12 +3,54 @@
 
 use std::sync::Arc;
 
+use cudarc::driver::CudaSlice;
+use vortex_dtype::NativePType;
+
 use crate::buffer::ErasedCudaSlice;
 
-pub type GpuArrayRef = Arc<dyn GpuArray>;
+pub enum GpuArray {
+    Primitive(GpuPrimitiveArray),
+    Bool(GpuBoolArray),
+    Struct(GpuStructArray),
+    Chunked(GpuChunkedArray),
+}
+
+pub struct GpuPrimitiveArray {
+    values: ErasedCudaSlice,
+}
+
+impl GpuPrimitiveArray {
+    fn as_slice<T: NativePType>(&self) -> CudaSlice<T> {
+        self.values.as_slice()
+    }
+}
+
+pub struct GpuBoolArray {
+    values: CudaSlice<bool>,
+}
 
-pub trait GpuArray {
-    fn child(&self, idx: usize) -> GpuArrayRef;
+impl GpuBoolArray {
+    fn values(&self) -> CudaSlice<bool> {
+        self.values.clone()
+    }
+}
+
+pub struct GpuChunkedArray {
+    gpu_arrays: Arc<[GpuArray]>,
+}
+
+impl GpuChunkedArray {
+    fn child(&self, idx: usize) -> &GpuArray {
+        &self.gpu_arrays[idx]
+    }
+}
+
+pub struct GpuStructArray {
+    gpu_arrays: Arc<[GpuArray]>,
+}
 
-    fn buffer(&self, idx: usize) -> ErasedCudaSlice;
+impl GpuStructArray {
+    fn child(&self, idx: usize) -> &GpuArray {
+        &self.gpu_arrays[idx]
+    }
 }
diff --git a/vortex-gpu/src/lib.rs b/vortex-gpu/src/lib.rs
@@ -2,18 +2,20 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 pub mod bit_unpack;
-pub mod buffer;
+mod buffer;
 pub mod for_;
 mod for_bp;
-pub mod gpu_array;
+mod gpu_array;
 mod indent;
 mod jit;
 mod rle_decompress;
 mod take;
 mod task;
 
 pub use bit_unpack::{cuda_bit_unpack, cuda_bit_unpack_timed};
+pub use buffer::*;
 pub use for_::{cuda_for_unpack, cuda_for_unpack_timed};
 pub use for_bp::{cuda_for_bp_unpack, cuda_for_bp_unpack_timed};
+pub use gpu_array::*;
 pub use jit::create_run_jit_kernel;
 pub use take::cuda_take;
diff --git a/vortex-gpu/src/task.rs b/vortex-gpu/src/task.rs
@@ -1,19 +1,19 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-use cudarc::driver::CudaViewMut;
-use vortex_array::Canonical;
 use vortex_error::VortexResult;
 
+use crate::{ErasedCudaSlice, GpuArray};
+
 pub trait GPUTask {
     // Must call `launch_task` once
     fn launch_task(&mut self) -> VortexResult<()>;
 
     // Must call this after launch_task
-    fn export_result(&mut self) -> VortexResult<Canonical>;
+    fn export_result(&mut self) -> VortexResult<GpuArray>;
 
     // Re can transmute as runtime
-    fn output(&mut self) -> CudaViewMut<'_, u8>;
+    fn output(&mut self) -> ErasedCudaSlice;
 
     fn len(&self) -> usize;
 }
diff --git a/vortex-layout/Cargo.toml b/vortex-layout/Cargo.toml
@@ -48,6 +48,7 @@ vortex-dtype = { workspace = true }
 vortex-error = { workspace = true }
 vortex-expr = { workspace = true }
 vortex-flatbuffers = { workspace = true, features = ["layout"] }
+vortex-gpu = { workspace = true, optional = true }
 vortex-io = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-metrics = { workspace = true }
@@ -68,8 +69,8 @@ vortex-io = { path = "../vortex-io", features = ["tokio"] }
 test-harness = []
 tokio = ["dep:tokio", "vortex-error/tokio"]
 zstd = ["dep:vortex-zstd"]
-gpu = ["cuda"]
-cuda = ["dep:cudarc"]
+gpu = ["cuda", "dep:vortex-gpu"]
+cuda = ["dep:cudarc", "vortex-gpu/cuda"]
 
 [lints]
 workspace = true
diff --git a/vortex-layout/src/gpu/layouts/chunked/reader.rs b/vortex-layout/src/gpu/layouts/chunked/reader.rs
@@ -18,7 +18,7 @@ use vortex_expr::ExprRef;
 use crate::gpu::children::LazyGpuReaderChildren;
 use crate::layouts::chunked::ChunkedLayout;
 use crate::segments::SegmentSource;
-use crate::{GpuLayoutReader, GpuLayoutReaderRef};
+use crate::{GpuArrayFuture, GpuLayoutReader, GpuLayoutReaderRef};
 
 pub struct GpuChunkedLayoutReader {
     layout: ChunkedLayout,
@@ -133,7 +133,7 @@ impl GpuLayoutReader for GpuChunkedLayoutReader {
         &self,
         row_range: &Range<u64>,
         expr: &ExprRef,
-    ) -> VortexResult<BoxFuture<'static, VortexResult<ArrayRef>>> {
+    ) -> VortexResult<GpuArrayFuture> {
         let dtype = expr.return_dtype(self.dtype())?;
         let mut chunk_evals = FuturesOrdered::new();
 
diff --git a/vortex-layout/src/gpu/layouts/flat/reader.rs b/vortex-layout/src/gpu/layouts/flat/reader.rs
@@ -14,10 +14,10 @@ use vortex_dtype::{DType, FieldMask};
 use vortex_error::{VortexResult, VortexUnwrap as _};
 use vortex_expr::{ExprRef, Scope, is_root};
 
-use crate::GpuLayoutReader;
 use crate::layouts::SharedArrayFuture;
 use crate::layouts::flat::FlatLayout;
 use crate::segments::SegmentSource;
+use crate::{GpuArrayFuture, GpuLayoutReader, ShareGpuArrayFuture};
 
 pub struct GpuFlatReader {
     layout: FlatLayout,
@@ -39,7 +39,7 @@ impl GpuFlatReader {
     }
 
     /// Register the segment request and return a future that would resolve into the deserialised array.
-    fn array_future(&self) -> SharedArrayFuture {
+    fn array_future(&self) -> ShareGpuArrayFuture {
         let row_count = usize::try_from(self.layout.row_count()).vortex_unwrap();
 
         // We create the segment_fut here to ensure we give the segment reader visibility into
@@ -51,9 +51,7 @@ impl GpuFlatReader {
         let dtype = self.layout.dtype().clone();
         async move {
             let segment = segment_fut.await?;
-            ArrayParts::try_from(segment)?
-                .decode(&ctx, &dtype, row_count)
-                .map_err(Arc::new)
+            ArrayParts::try_from(segment)?.decode(&ctx, &dtype, row_count)?
         }
         .boxed()
         .shared()
@@ -87,7 +85,7 @@ impl GpuLayoutReader for GpuFlatReader {
         &self,
         row_range: &Range<u64>,
         expr: &ExprRef,
-    ) -> VortexResult<BoxFuture<'static, VortexResult<ArrayRef>>> {
+    ) -> VortexResult<GpuArrayFuture> {
         assert_eq!(
             row_range.clone(),
             0..self.layout.row_count(),
@@ -100,14 +98,7 @@ impl GpuLayoutReader for GpuFlatReader {
         Ok(async move {
             log::debug!("Flat array evaluation {} - {}", name, expr);
 
-            let mut array = array.clone().await?;
-
-            // Evaluate the projection expression.
-            if !is_root(&expr) {
-                array = expr.evaluate(&Scope::new(array))?;
-            }
-
-            Ok(array)
+            array.clone().await
         }
         .boxed())
     }
diff --git a/vortex-layout/src/gpu/layouts/gpu_segments.rs b/vortex-layout/src/gpu/layouts/gpu_segments.rs
@@ -5,11 +5,12 @@ use cudarc::driver::CudaSlice;
 use futures::future::BoxFuture;
 use vortex_buffer::ByteBuffer;
 use vortex_error::VortexResult;
+use vortex_gpu::ErasedCudaSlice;
 
 use crate::segments::SegmentId;
 
 /// Static future resolving to a segment byte buffer.
-pub type GpuSegmentFuture = BoxFuture<'static, VortexResult<CudaSlice>>;
+pub type GpuSegmentFuture = BoxFuture<'static, VortexResult<ErasedCudaSlice>>;
 
 /// A trait for providing segment data to a [`crate::LayoutReader`].
 pub trait GpuSegmentSource: 'static + Send + Sync {
diff --git a/vortex-layout/src/gpu/layouts/struct_/reader.rs b/vortex-layout/src/gpu/layouts/struct_/reader.rs
@@ -22,7 +22,7 @@ use crate::gpu::children::LazyGpuReaderChildren;
 use crate::layouts::partitioned::PartitionedExprEval;
 use crate::layouts::struct_::StructLayout;
 use crate::segments::SegmentSource;
-use crate::{ArrayFuture, GpuLayoutReader, GpuLayoutReaderRef};
+use crate::{ArrayFuture, GpuArrayFuture, GpuLayoutReader, GpuLayoutReaderRef};
 
 pub struct GpuStructReader {
     layout: StructLayout,
@@ -192,7 +192,7 @@ impl GpuLayoutReader for GpuStructReader {
         &self,
         row_range: &Range<u64>,
         expr: &ExprRef,
-    ) -> VortexResult<ArrayFuture> {
+    ) -> VortexResult<GpuArrayFuture> {
         // Partition the expression into expressions that can be evaluated over individual fields
         let len = usize::try_from(row_range.end - row_range.start)
             .vortex_expect("read range len must fit into usize");
diff --git a/vortex-layout/src/gpu/mod.rs b/vortex-layout/src/gpu/mod.rs

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@ use vortex_error::vortex_panic;`
`10`	`10`
`11`	`11`	`pub struct ErasedCudaSlice {`
`12`	`12`	`ptr: CUdeviceptr,`
	`13`	`+ stream: Arc<CudaStream>,`
`13`	`14`	`len: usize,`
`14`	`15`	`ptype: PType,`
`15`	`16`	`}`
`@@ -18,8 +19,10 @@ impl ErasedCudaSlice {`
`18`	`19`	`pub fn new<T: NativePType>(slice: impl Into<CudaSlice<T>>) -> Self {`
`19`	`20`	`let slice = slice.into();`
`20`	`21`	`let len = slice.len();`
	`22`	`+ let stream = slice.stream().clone();`
`21`	`23`	`Self {`
`22`	`24`	`ptr: slice.leak(),`
	`25`	`+ stream,`
`23`	`26`	`len,`
`24`	`27`	`ptype: T::PTYPE,`
`25`	`28`	`}`
`@@ -33,7 +36,7 @@ impl ErasedCudaSlice {`
`33`	`36`	`self.len`
`34`	`37`	`}`
`35`	`38`
`36`		`- pub fn as_slice<T: NativePType>(&self, stream: &Arc<CudaStream>) -> CudaSlice<T> {`
	`39`	`+ pub fn as_slice<T: NativePType>(&self) -> CudaSlice<T> {`
`37`	`40`	`if T::PTYPE != self.ptype() {`
`38`	`41`	`vortex_panic!(`
`39`	`42`	`"Attempted to get slice of type {} from array of type {}",`
`@@ -42,6 +45,6 @@ impl ErasedCudaSlice {`
`42`	`45`	`)`
`43`	`46`	`}`
`44`	`47`
`45`		`- unsafe { stream.upgrade_device_ptr::<T>(self.ptr, self.len) }`
	`48`	`+ unsafe { self.stream.upgrade_device_ptr::<T>(self.ptr, self.len) }`
`46`	`49`	`}`
`47`	`50`	`}`