vortex-data · 0ax1 · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026
diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
@@ -23,7 +23,7 @@ _test-harness = []
 [dependencies]
 async-trait = { workspace = true }
 cudarc = { workspace = true }
-futures = { workspace = true }
+futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
 tracing = { workspace = true }
 vortex-alp = { workspace = true }

diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
@@ -111,6 +111,11 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
 
     /// Synchronous copy of CUDA device to host memory.
     ///
+    /// The copy does not start until other operations on the stream have completed.
+    /// This is equivalent to synchronizing the stream before the copy.
+    ///
+    /// The asynchronous `copy_to_host` function should be preferred whenever possible.
+    ///
     /// # Arguments
     ///
     /// * `alignment` - The memory alignment to use for the host buffer.
@@ -119,29 +124,7 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
     ///
     /// Returns an error if the CUDA memory copy operation fails.
     fn copy_to_host_sync(&self, alignment: Alignment) -> VortexResult<ByteBuffer> {
-        let mut host_buffer = BufferMut::<T>::with_capacity_aligned(self.len, alignment);
-
-        // Add offset to device pointer to account for any previous slicing operations.
-        let src_ptr = self.device_ptr + (self.offset * size_of::<T>()) as u64;
-
-        // SAFETY: We pass a valid pointer to a buffer with sufficient capacity.
-        // `cuMemcpyDtoHAsync_v2` fully initializes the memory.
-        unsafe {
-            sys::cuMemcpyDtoH_v2(
-                host_buffer.spare_capacity_mut().as_mut_ptr().cast(),
-                src_ptr,
-                self.len * size_of::<T>(),
-            )
-            .result()
-            .map_err(|e| vortex_err!("Failed to copy from device to host: {}", e))?;
-        }
-
-        // SAFETY: `cuMemcpyDtoHAsync_v2` fully initialized the buffer.
-        unsafe {
-            host_buffer.set_len(self.len);
-        }
-
-        Ok(host_buffer.freeze().into_byte_buffer())
+        futures::executor::block_on(self.copy_to_host(alignment)?)
     }
 
     /// Copies a device buffer to host memory asynchronously.

diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs
@@ -306,7 +306,7 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
+
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -341,7 +341,7 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
+
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -373,7 +373,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -405,7 +404,7 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
+
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -441,7 +440,6 @@ mod tests {
                 .into_primitive()
         });
 
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -479,7 +477,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -524,7 +521,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -570,7 +566,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -604,7 +599,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_primitive_to_host(cuda_result)?;
 
         // Compare CUDA result with baseline
@@ -644,7 +638,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_decimal_to_host(cuda_result)?;
 
         assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -673,7 +666,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_decimal_to_host(cuda_result)?;
 
         assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -702,7 +694,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_decimal_to_host(cuda_result)?;
 
         assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -734,7 +725,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_decimal_to_host(cuda_result)?;
 
         assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -771,7 +761,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_decimal_to_host(cuda_result)?;
 
         assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -808,7 +797,6 @@ mod tests {
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
-        cuda_ctx.synchronize_stream()?;
         let cuda_result = cuda_decimal_to_host(cuda_result)?;
 
         assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());