Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ _test-harness = []
[dependencies]
async-trait = { workspace = true }
cudarc = { workspace = true }
futures = { workspace = true }
futures = { workspace = true, features = ["executor"] }
kanal = { workspace = true }
tracing = { workspace = true }
vortex-alp = { workspace = true }
Expand Down
29 changes: 6 additions & 23 deletions vortex-cuda/src/device_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>

/// Synchronous copy of CUDA device to host memory.
///
/// The copy does not start until all prior operations on the stream have completed.
/// This is equivalent to synchronizing the stream before the copy.
///
/// The asynchronous `copy_to_host` function should be preferred whenever possible.
///
/// # Arguments
///
/// * `alignment` - The memory alignment to use for the host buffer.
Expand All @@ -119,29 +124,7 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
///
/// Returns an error if the CUDA memory copy operation fails.
fn copy_to_host_sync(&self, alignment: Alignment) -> VortexResult<ByteBuffer> {
let mut host_buffer = BufferMut::<T>::with_capacity_aligned(self.len, alignment);

// Add offset to device pointer to account for any previous slicing operations.
let src_ptr = self.device_ptr + (self.offset * size_of::<T>()) as u64;

// SAFETY: We pass a valid pointer to a buffer with sufficient capacity.
// `cuMemcpyDtoH_v2` fully initializes the memory.
unsafe {
sys::cuMemcpyDtoH_v2(
host_buffer.spare_capacity_mut().as_mut_ptr().cast(),
src_ptr,
self.len * size_of::<T>(),
)
.result()
.map_err(|e| vortex_err!("Failed to copy from device to host: {}", e))?;
}

// SAFETY: `cuMemcpyDtoH_v2` fully initialized the buffer.
unsafe {
host_buffer.set_len(self.len);
}

Ok(host_buffer.freeze().into_byte_buffer())
// NOTE(review): in this diff rendering the statements above appear to be the removed
// implementation and the line below its replacement — confirm against the merged file.
// Blocks the calling thread until the future returned by `copy_to_host` resolves.
futures::executor::block_on(self.copy_to_host(alignment)?)
}

/// Copies a device buffer to host memory asynchronously.
Expand Down
18 changes: 3 additions & 15 deletions vortex-cuda/src/kernel/arrays/dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;

let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -341,7 +341,7 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;

let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -373,7 +373,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -405,7 +404,7 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;

let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -441,7 +440,6 @@ mod tests {
.into_primitive()
});

cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -479,7 +477,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -524,7 +521,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -570,7 +566,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -604,7 +599,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_primitive();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_primitive_to_host(cuda_result)?;

// Compare CUDA result with baseline
Expand Down Expand Up @@ -644,7 +638,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_decimal();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_decimal_to_host(cuda_result)?;

assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
Expand Down Expand Up @@ -673,7 +666,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_decimal();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_decimal_to_host(cuda_result)?;

assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
Expand Down Expand Up @@ -702,7 +694,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_decimal();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_decimal_to_host(cuda_result)?;

assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
Expand Down Expand Up @@ -734,7 +725,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_decimal();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_decimal_to_host(cuda_result)?;

assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
Expand Down Expand Up @@ -771,7 +761,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_decimal();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_decimal_to_host(cuda_result)?;

assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
Expand Down Expand Up @@ -808,7 +797,6 @@ mod tests {
.await
.vortex_expect("GPU decompression failed")
.into_decimal();
cuda_ctx.synchronize_stream()?;
let cuda_result = cuda_decimal_to_host(cuda_result)?;

assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
Expand Down
Loading