Skip to content

Commit 46cbff9

Browse files
committed
fix: sync copy to host
Signed-off-by: Alexander Droste <[email protected]>
1 parent 707d188 commit 46cbff9

File tree

2 files changed

+6
-38
lines changed

2 files changed

+6
-38
lines changed

vortex-cuda/src/device_buffer.rs

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
111111

112112
/// Synchronous copy of CUDA device to host memory.
113113
///
114+
/// The copy does not start until other operations on the streams have completed.
115+
///
114116
/// # Arguments
115117
///
116118
/// * `alignment` - The memory alignment to use for the host buffer.
@@ -119,29 +121,7 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
119121
///
120122
/// Returns an error if the CUDA memory copy operation fails.
121123
fn copy_to_host_sync(&self, alignment: Alignment) -> VortexResult<ByteBuffer> {
122-
let mut host_buffer = BufferMut::<T>::with_capacity_aligned(self.len, alignment);
123-
124-
// Add offset to device pointer to account for any previous slicing operations.
125-
let src_ptr = self.device_ptr + (self.offset * size_of::<T>()) as u64;
126-
127-
// SAFETY: We pass a valid pointer to a buffer with sufficient capacity.
128-
// `cuMemcpyDtoH_v2` fully initializes the memory.
129-
unsafe {
130-
sys::cuMemcpyDtoH_v2(
131-
host_buffer.spare_capacity_mut().as_mut_ptr().cast(),
132-
src_ptr,
133-
self.len * size_of::<T>(),
134-
)
135-
.result()
136-
.map_err(|e| vortex_err!("Failed to copy from device to host: {}", e))?;
137-
}
138-
139-
// SAFETY: `cuMemcpyDtoH_v2` fully initialized the buffer.
140-
unsafe {
141-
host_buffer.set_len(self.len);
142-
}
143-
144-
Ok(host_buffer.freeze().into_byte_buffer())
124+
futures::executor::block_on(self.copy_to_host(alignment)?)
145125
}
146126

147127
/// Copies a device buffer to host memory asynchronously.

vortex-cuda/src/kernel/arrays/dict.rs

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ mod tests {
306306
.await
307307
.vortex_expect("GPU decompression failed")
308308
.into_primitive();
309-
cuda_ctx.synchronize_stream()?;
309+
310310
let cuda_result = cuda_primitive_to_host(cuda_result)?;
311311

312312
// Compare CUDA result with baseline
@@ -341,7 +341,7 @@ mod tests {
341341
.await
342342
.vortex_expect("GPU decompression failed")
343343
.into_primitive();
344-
cuda_ctx.synchronize_stream()?;
344+
345345
let cuda_result = cuda_primitive_to_host(cuda_result)?;
346346

347347
// Compare CUDA result with baseline
@@ -373,7 +373,6 @@ mod tests {
373373
.await
374374
.vortex_expect("GPU decompression failed")
375375
.into_primitive();
376-
cuda_ctx.synchronize_stream()?;
377376
let cuda_result = cuda_primitive_to_host(cuda_result)?;
378377

379378
// Compare CUDA result with baseline
@@ -405,7 +404,7 @@ mod tests {
405404
.await
406405
.vortex_expect("GPU decompression failed")
407406
.into_primitive();
408-
cuda_ctx.synchronize_stream()?;
407+
409408
let cuda_result = cuda_primitive_to_host(cuda_result)?;
410409

411410
// Compare CUDA result with baseline
@@ -441,7 +440,6 @@ mod tests {
441440
.into_primitive()
442441
});
443442

444-
cuda_ctx.synchronize_stream()?;
445443
let cuda_result = cuda_primitive_to_host(cuda_result)?;
446444

447445
// Compare CUDA result with baseline
@@ -479,7 +477,6 @@ mod tests {
479477
.await
480478
.vortex_expect("GPU decompression failed")
481479
.into_primitive();
482-
cuda_ctx.synchronize_stream()?;
483480
let cuda_result = cuda_primitive_to_host(cuda_result)?;
484481

485482
// Compare CUDA result with baseline
@@ -524,7 +521,6 @@ mod tests {
524521
.await
525522
.vortex_expect("GPU decompression failed")
526523
.into_primitive();
527-
cuda_ctx.synchronize_stream()?;
528524
let cuda_result = cuda_primitive_to_host(cuda_result)?;
529525

530526
// Compare CUDA result with baseline
@@ -570,7 +566,6 @@ mod tests {
570566
.await
571567
.vortex_expect("GPU decompression failed")
572568
.into_primitive();
573-
cuda_ctx.synchronize_stream()?;
574569
let cuda_result = cuda_primitive_to_host(cuda_result)?;
575570

576571
// Compare CUDA result with baseline
@@ -604,7 +599,6 @@ mod tests {
604599
.await
605600
.vortex_expect("GPU decompression failed")
606601
.into_primitive();
607-
cuda_ctx.synchronize_stream()?;
608602
let cuda_result = cuda_primitive_to_host(cuda_result)?;
609603

610604
// Compare CUDA result with baseline
@@ -644,7 +638,6 @@ mod tests {
644638
.await
645639
.vortex_expect("GPU decompression failed")
646640
.into_decimal();
647-
cuda_ctx.synchronize_stream()?;
648641
let cuda_result = cuda_decimal_to_host(cuda_result)?;
649642

650643
assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -673,7 +666,6 @@ mod tests {
673666
.await
674667
.vortex_expect("GPU decompression failed")
675668
.into_decimal();
676-
cuda_ctx.synchronize_stream()?;
677669
let cuda_result = cuda_decimal_to_host(cuda_result)?;
678670

679671
assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -702,7 +694,6 @@ mod tests {
702694
.await
703695
.vortex_expect("GPU decompression failed")
704696
.into_decimal();
705-
cuda_ctx.synchronize_stream()?;
706697
let cuda_result = cuda_decimal_to_host(cuda_result)?;
707698

708699
assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -734,7 +725,6 @@ mod tests {
734725
.await
735726
.vortex_expect("GPU decompression failed")
736727
.into_decimal();
737-
cuda_ctx.synchronize_stream()?;
738728
let cuda_result = cuda_decimal_to_host(cuda_result)?;
739729

740730
assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -771,7 +761,6 @@ mod tests {
771761
.await
772762
.vortex_expect("GPU decompression failed")
773763
.into_decimal();
774-
cuda_ctx.synchronize_stream()?;
775764
let cuda_result = cuda_decimal_to_host(cuda_result)?;
776765

777766
assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());
@@ -808,7 +797,6 @@ mod tests {
808797
.await
809798
.vortex_expect("GPU decompression failed")
810799
.into_decimal();
811-
cuda_ctx.synchronize_stream()?;
812800
let cuda_result = cuda_decimal_to_host(cuda_result)?;
813801

814802
assert_arrays_eq!(cuda_result.into_array(), baseline.into_array());

0 commit comments

Comments
 (0)