Skip to content

Commit 45326d7

Browse files
authored
chore: CUDA device buffer (#6043)
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 61f141c commit 45326d7

File tree

7 files changed

+164
-79
lines changed

7 files changed

+164
-79
lines changed

vortex-array/src/buffer.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use std::hash::Hasher;
77
use std::ops::Range;
88
use std::sync::Arc;
99

10+
use vortex_buffer::ALIGNMENT_TO_HOST_COPY;
11+
use vortex_buffer::Alignment;
1012
use vortex_buffer::ByteBuffer;
1113
use vortex_error::VortexExpect;
1214
use vortex_error::VortexResult;
@@ -53,7 +55,7 @@ pub trait DeviceBuffer: 'static + Send + Sync + Debug + DynEq + DynHash {
5355
/// # Errors
5456
///
5557
/// This operation may fail, depending on the device implementation and the underlying hardware.
56-
fn copy_to_host(&self) -> VortexResult<ByteBuffer>;
58+
fn copy_to_host(&self, alignment: Alignment) -> VortexResult<ByteBuffer>;
5759

5860
/// Create a new buffer that references a subrange of this buffer at the given
5961
/// slice indices.
@@ -89,6 +91,16 @@ impl BufferHandle {
8991
}
9092

9193
impl BufferHandle {
94+
/// Returns `true` if this buffer resides on the device (GPU).
95+
pub fn is_on_device(&self) -> bool {
96+
matches!(&self.0, Inner::Device(_))
97+
}
98+
99+
/// Returns `true` if this buffer resides on the host (CPU).
100+
pub fn is_on_host(&self) -> bool {
101+
matches!(&self.0, Inner::Host(_))
102+
}
103+
92104
/// Gets the size of the buffer, in bytes.
93105
pub fn len(&self) -> usize {
94106
match &self.0 {
@@ -226,7 +238,7 @@ impl BufferHandle {
226238
pub fn try_to_host(&self) -> VortexResult<ByteBuffer> {
227239
match &self.0 {
228240
Inner::Host(b) => Ok(b.clone()),
229-
Inner::Device(device) => device.copy_to_host(),
241+
Inner::Device(device) => device.copy_to_host(ALIGNMENT_TO_HOST_COPY),
230242
}
231243
}
232244

@@ -236,7 +248,7 @@ impl BufferHandle {
236248
pub fn try_into_host(self) -> VortexResult<ByteBuffer> {
237249
match self.0 {
238250
Inner::Host(b) => Ok(b),
239-
Inner::Device(device) => device.copy_to_host(),
251+
Inner::Device(device) => device.copy_to_host(ALIGNMENT_TO_HOST_COPY),
240252
}
241253
}
242254
}

vortex-buffer/src/alignment.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ use std::ops::Deref;
66

77
use vortex_error::VortexExpect;
88

9+
/// Default alignment for device-to-host buffer copies.
10+
pub const ALIGNMENT_TO_HOST_COPY: Alignment = Alignment::new(256);
11+
912
/// The alignment of a buffer.
1013
///
1114
/// This type is a wrapper around `usize` that ensures the alignment is a power of 2 and fits into

vortex-cuda/benches/for_cuda.rs

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ fn make_for_array_u64(len: usize) -> FoRArray {
8383
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
8484
fn launch_for_kernel_timed_u8(
8585
for_array: &FoRArray,
86-
device_data: cudarc::driver::CudaSlice<u8>,
86+
device_data: &cudarc::driver::CudaSlice<u8>,
8787
reference: u8,
8888
cuda_ctx: &mut CudaExecutionCtx,
8989
) -> vortex_error::VortexResult<Duration> {
@@ -93,7 +93,7 @@ fn launch_for_kernel_timed_u8(
9393
execution_ctx: cuda_ctx,
9494
module: "for",
9595
ptypes: &[for_array.ptype()],
96-
launch_args: [device_data, reference, array_len_u64],
96+
launch_args: [*device_data, reference, array_len_u64],
9797
event_recording: CU_EVENT_BLOCKING_SYNC,
9898
array_len: for_array.len()
9999
);
@@ -109,7 +109,7 @@ fn launch_for_kernel_timed_u8(
109109
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
110110
fn launch_for_kernel_timed_u16(
111111
for_array: &FoRArray,
112-
device_data: cudarc::driver::CudaSlice<u16>,
112+
device_data: &cudarc::driver::CudaSlice<u16>,
113113
reference: u16,
114114
cuda_ctx: &mut CudaExecutionCtx,
115115
) -> vortex_error::VortexResult<Duration> {
@@ -119,7 +119,7 @@ fn launch_for_kernel_timed_u16(
119119
execution_ctx: cuda_ctx,
120120
module: "for",
121121
ptypes: &[for_array.ptype()],
122-
launch_args: [device_data, reference, array_len_u64],
122+
launch_args: [*device_data, reference, array_len_u64],
123123
event_recording: CU_EVENT_BLOCKING_SYNC,
124124
array_len: for_array.len()
125125
);
@@ -135,7 +135,7 @@ fn launch_for_kernel_timed_u16(
135135
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
136136
fn launch_for_kernel_timed_u32(
137137
for_array: &FoRArray,
138-
device_data: cudarc::driver::CudaSlice<u32>,
138+
device_data: &cudarc::driver::CudaSlice<u32>,
139139
reference: u32,
140140
cuda_ctx: &mut CudaExecutionCtx,
141141
) -> vortex_error::VortexResult<Duration> {
@@ -145,7 +145,7 @@ fn launch_for_kernel_timed_u32(
145145
execution_ctx: cuda_ctx,
146146
module: "for",
147147
ptypes: &[for_array.ptype()],
148-
launch_args: [device_data, reference, array_len_u64],
148+
launch_args: [*device_data, reference, array_len_u64],
149149
event_recording: CU_EVENT_BLOCKING_SYNC,
150150
array_len: for_array.len()
151151
);
@@ -161,7 +161,7 @@ fn launch_for_kernel_timed_u32(
161161
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
162162
fn launch_for_kernel_timed_u64(
163163
for_array: &FoRArray,
164-
device_data: cudarc::driver::CudaSlice<u64>,
164+
device_data: &cudarc::driver::CudaSlice<u64>,
165165
reference: u64,
166166
cuda_ctx: &mut CudaExecutionCtx,
167167
) -> vortex_error::VortexResult<Duration> {
@@ -171,7 +171,7 @@ fn launch_for_kernel_timed_u64(
171171
execution_ctx: cuda_ctx,
172172
module: "for",
173173
ptypes: &[for_array.ptype()],
174-
launch_args: [device_data, reference, array_len_u64],
174+
launch_args: [*device_data, reference, array_len_u64],
175175
event_recording: CU_EVENT_BLOCKING_SYNC,
176176
array_len: for_array.len()
177177
);
@@ -210,12 +210,12 @@ fn benchmark_for_u8(c: &mut Criterion) {
210210

211211
for _ in 0..iters {
212212
let device_data = cuda_ctx
213-
.to_device(unpacked_slice)
213+
.copy_buffer_to_device(unpacked_slice)
214214
.vortex_expect("failed to copy to device");
215215

216216
let kernel_time = launch_for_kernel_timed_u8(
217217
for_array,
218-
device_data,
218+
device_data.cuda_slice(),
219219
reference,
220220
&mut cuda_ctx,
221221
)
@@ -259,12 +259,12 @@ fn benchmark_for_u16(c: &mut Criterion) {
259259

260260
for _ in 0..iters {
261261
let device_data = cuda_ctx
262-
.to_device(unpacked_slice)
262+
.copy_buffer_to_device(unpacked_slice)
263263
.vortex_expect("failed to copy to device");
264264

265265
let kernel_time = launch_for_kernel_timed_u16(
266266
for_array,
267-
device_data,
267+
device_data.cuda_slice(),
268268
reference,
269269
&mut cuda_ctx,
270270
)
@@ -308,12 +308,12 @@ fn benchmark_for_u32(c: &mut Criterion) {
308308

309309
for _ in 0..iters {
310310
let device_data = cuda_ctx
311-
.to_device(unpacked_slice)
311+
.copy_buffer_to_device(unpacked_slice)
312312
.vortex_expect("failed to copy to device");
313313

314314
let kernel_time = launch_for_kernel_timed_u32(
315315
for_array,
316-
device_data,
316+
device_data.cuda_slice(),
317317
reference,
318318
&mut cuda_ctx,
319319
)
@@ -357,12 +357,12 @@ fn benchmark_for_u64(c: &mut Criterion) {
357357

358358
for _ in 0..iters {
359359
let device_data = cuda_ctx
360-
.to_device(unpacked_slice)
360+
.copy_buffer_to_device(unpacked_slice)
361361
.vortex_expect("failed to copy to device");
362362

363363
let kernel_time = launch_for_kernel_timed_u64(
364364
for_array,
365-
device_data,
365+
device_data.cuda_slice(),
366366
reference,
367367
&mut cuda_ctx,
368368
)

vortex-cuda/src/device_buffer.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::fmt::Debug;
5+
use std::hash::Hash;
6+
use std::hash::Hasher;
7+
use std::ops::Range;
8+
use std::sync::Arc;
9+
10+
use cudarc::driver::CudaSlice;
11+
use cudarc::driver::DeviceRepr;
12+
use vortex_array::buffer::DeviceBuffer;
13+
use vortex_buffer::Alignment;
14+
use vortex_buffer::BufferMut;
15+
use vortex_buffer::ByteBuffer;
16+
use vortex_error::VortexResult;
17+
use vortex_error::vortex_err;
18+
19+
/// A CUDA device buffer wrapping a [`CudaSlice<T>`].
20+
pub struct CudaDeviceBuffer<T> {
21+
cuda_slice: CudaSlice<T>,
22+
}
23+
24+
impl<T> CudaDeviceBuffer<T> {
25+
/// Creates a new CUDA device buffer from a [`CudaSlice`].
26+
pub fn new(cuda_slice: CudaSlice<T>) -> Self {
27+
Self { cuda_slice }
28+
}
29+
30+
/// Returns a reference to the underlying [`CudaSlice<T>`].
31+
pub fn cuda_slice(&self) -> &CudaSlice<T> {
32+
&self.cuda_slice
33+
}
34+
}
35+
36+
impl<T> Debug for CudaDeviceBuffer<T> {
37+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38+
f.debug_struct("CudaDeviceBuffer")
39+
.field(
40+
"address",
41+
&(&raw const self.cuda_slice as *const _ as usize),
42+
)
43+
.field("num_bytes", &self.cuda_slice.num_bytes())
44+
.finish()
45+
}
46+
}
47+
48+
impl<T: 'static> Hash for CudaDeviceBuffer<T> {
49+
/// Hash the buffer pointer address.
50+
fn hash<H: Hasher>(&self, state: &mut H) {
51+
(&raw const self.cuda_slice).hash(state);
52+
}
53+
}
54+
55+
impl<T: 'static> PartialEq for CudaDeviceBuffer<T> {
56+
/// Compares two buffers by pointer address.
57+
fn eq(&self, other: &Self) -> bool {
58+
std::ptr::eq(&raw const self.cuda_slice, &raw const other.cuda_slice)
59+
}
60+
}
61+
62+
impl<T: DeviceRepr + Clone + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T> {
63+
/// Returns the number of elements in the CUDA device buffer of type T.
64+
fn len(&self) -> usize {
65+
self.cuda_slice.len()
66+
}
67+
68+
/// Copies the CUDA device buffer to host memory.
69+
///
70+
/// Allocates a host buffer with the specified alignment and copies the data
71+
/// from the device to the host. The operation is implicitly synchronized
72+
/// when the underlying event is dropped.
73+
///
74+
/// # Arguments
75+
///
76+
/// * `alignment` - The byte alignment for the allocated host buffer.
77+
///
78+
/// # Returns
79+
///
80+
/// A `ByteBuffer` containing the copied data, or an error if the copy fails.
81+
fn copy_to_host(&self, alignment: Alignment) -> VortexResult<ByteBuffer> {
82+
let len = self.cuda_slice.len();
83+
let mut host_buffer = BufferMut::<T>::with_capacity_aligned(len, alignment);
84+
85+
// TODO(0ax1): Make the memcpy to host async. Even though `memcpy_dtoh`
86+
// calls into `memcpy_dtoh_async`, it implicitly calls synchronize on the
87+
// stream when dropping the `SyncOnDrop` `_record_dst` event at the end
88+
// of the function.
89+
self.cuda_slice
90+
.stream()
91+
.memcpy_dtoh(&self.cuda_slice, unsafe {
92+
// SAFETY: We allocated sufficient capacity and fill the entire buffer.
93+
host_buffer.set_len(len);
94+
host_buffer.as_mut_slice()
95+
})
96+
.map_err(|e| vortex_err!("Failed to copy from device to host: {}", e))?;
97+
98+
Ok(host_buffer.freeze().into_byte_buffer())
99+
}
100+
101+
/// Slices the CUDA device buffer to a subrange.
102+
fn slice(&self, _range: Range<usize>) -> Arc<dyn DeviceBuffer> {
103+
// TODO(0ax1): impl slice on CUDA slice
104+
unimplemented!("CudaDeviceBuffer::slice is not yet implemented")
105+
}
106+
}

vortex-cuda/src/executor.rs

Lines changed: 15 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,12 @@ use vortex_array::Array;
1717
use vortex_array::ArrayRef;
1818
use vortex_array::Canonical;
1919
use vortex_array::VortexSessionExecute;
20-
use vortex_buffer::Alignment;
21-
use vortex_buffer::Buffer;
22-
use vortex_buffer::BufferMut;
2320
use vortex_dtype::PType;
2421
use vortex_error::VortexResult;
2522
use vortex_error::vortex_err;
2623
use vortex_session::VortexSession;
2724

25+
use crate::CudaDeviceBuffer;
2826
use crate::CudaSession;
2927
use crate::session::CudaSessionExt;
3028

@@ -161,53 +159,6 @@ impl CudaExecutionCtx {
161159
}
162160
}
163161

164-
/// Copies data from host to device.
165-
pub fn to_device<T: DeviceRepr>(&self, data: &[T]) -> VortexResult<CudaSlice<T>> {
166-
// TODO(0ax1): Make the memcopy to device async. Even though `memcpy_htod`
167-
// uses into `memcpy_htod_async`, it implicitly calls synchronize on the
168-
// stream when dropping the `SyncOnDrop` `_record_dst` event at the end
169-
// of the function.
170-
self.stream
171-
.clone_htod(data)
172-
.map_err(|e| vortex_err!("Failed to copy to device: {}", e))
173-
}
174-
175-
/// Copies data from device to host.
176-
///
177-
/// Returns a `Buffer<T>` with the specified alignment.
178-
pub fn to_host<T: DeviceRepr>(
179-
&self,
180-
buffer: &CudaSlice<T>,
181-
alignment: Alignment,
182-
) -> VortexResult<Buffer<T>> {
183-
let len = buffer.len();
184-
let mut host_buffer = BufferMut::<T>::with_capacity_aligned(len, alignment);
185-
186-
// TODO(0ax1): Make the memcopy to host async. Even though `memcpy_dtoh`
187-
// uses into `memcpy_dtoh_async`, it implicitly calls synchronize on the
188-
// stream when dropping the `SyncOnDrop` `_record_dst` event at the end
189-
// of the function.
190-
self.stream
191-
.memcpy_dtoh(buffer, unsafe {
192-
// SAFETY: We allocated with sufficient capacity and fill the entire buffer.
193-
host_buffer.set_len(len);
194-
host_buffer.as_mut_slice()
195-
})
196-
.map_err(|e| vortex_err!("Failed to copy from device: {}", e))?;
197-
198-
Ok(host_buffer.freeze())
199-
}
200-
201-
/// Synchronizes the stream
202-
///
203-
/// On `synchronize` the host waits for all pending operations of the stream to complete.
204-
#[cfg(test)]
205-
pub fn synchronize(&self) -> VortexResult<()> {
206-
self.stream
207-
.synchronize()
208-
.map_err(|e| vortex_err!("Failed to synchronize device: {}", e))
209-
}
210-
211162
/// Loads a CUDA kernel function by module name and ptype(s).
212163
///
213164
/// # Arguments
@@ -232,6 +183,20 @@ impl CudaExecutionCtx {
232183
pub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a> {
233184
self.stream.launch_builder(func)
234185
}
186+
187+
/// Copies host data to the device, returning a [`CudaDeviceBuffer`].
188+
///
189+
/// This is the primary way to get data onto the GPU for kernel execution.
190+
pub fn copy_buffer_to_device<T: DeviceRepr + Clone + Send + Sync + 'static>(
191+
&self,
192+
data: &[T],
193+
) -> VortexResult<CudaDeviceBuffer<T>> {
194+
let cuda_slice = self
195+
.stream
196+
.clone_htod(data)
197+
.map_err(|e| vortex_err!("Failed to copy to device: {}", e))?;
198+
Ok(CudaDeviceBuffer::new(cuda_slice))
199+
}
235200
}
236201

237202
/// Support trait for CUDA-accelerated decompression of arrays.

0 commit comments

Comments
 (0)