Skip to content

Commit 45326d7

Browse files
authored
chore: CUDA device buffer (#6043)
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 61f141c commit 45326d7

File tree

7 files changed

+164
-79
lines changed

7 files changed

+164
-79
lines changed

vortex-array/src/buffer.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use std::hash::Hasher;
77
use std::ops::Range;
88
use std::sync::Arc;
99

10+
use vortex_buffer::ALIGNMENT_TO_HOST_COPY;
11+
use vortex_buffer::Alignment;
1012
use vortex_buffer::ByteBuffer;
1113
use vortex_error::VortexExpect;
1214
use vortex_error::VortexResult;
@@ -53,7 +55,7 @@ pub trait DeviceBuffer: 'static + Send + Sync + Debug + DynEq + DynHash {
5355
/// # Errors
5456
///
5557
/// This operation may fail, depending on the device implementation and the underlying hardware.
56-
fn copy_to_host(&self) -> VortexResult<ByteBuffer>;
58+
fn copy_to_host(&self, alignment: Alignment) -> VortexResult<ByteBuffer>;
5759

5860
/// Create a new buffer that references a subrange of this buffer at the given
5961
/// slice indices.
@@ -89,6 +91,16 @@ impl BufferHandle {
8991
}
9092

9193
impl BufferHandle {
94+
/// Returns `true` if this buffer resides on the device (GPU).
95+
pub fn is_on_device(&self) -> bool {
96+
matches!(&self.0, Inner::Device(_))
97+
}
98+
99+
/// Returns `true` if this buffer resides on the host (CPU).
100+
pub fn is_on_host(&self) -> bool {
101+
matches!(&self.0, Inner::Host(_))
102+
}
103+
92104
/// Gets the size of the buffer, in bytes.
93105
pub fn len(&self) -> usize {
94106
match &self.0 {
@@ -226,7 +238,7 @@ impl BufferHandle {
226238
pub fn try_to_host(&self) -> VortexResult<ByteBuffer> {
227239
match &self.0 {
228240
Inner::Host(b) => Ok(b.clone()),
229-
Inner::Device(device) => device.copy_to_host(),
241+
Inner::Device(device) => device.copy_to_host(ALIGNMENT_TO_HOST_COPY),
230242
}
231243
}
232244

@@ -236,7 +248,7 @@ impl BufferHandle {
236248
pub fn try_into_host(self) -> VortexResult<ByteBuffer> {
237249
match self.0 {
238250
Inner::Host(b) => Ok(b),
239-
Inner::Device(device) => device.copy_to_host(),
251+
Inner::Device(device) => device.copy_to_host(ALIGNMENT_TO_HOST_COPY),
240252
}
241253
}
242254
}

vortex-buffer/src/alignment.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ use std::ops::Deref;
66

77
use vortex_error::VortexExpect;
88

9+
/// Default alignment for device-to-host buffer copies.
10+
pub const ALIGNMENT_TO_HOST_COPY: Alignment = Alignment::new(256);
11+
912
/// The alignment of a buffer.
1013
///
1114
/// This type is a wrapper around `usize` that ensures the alignment is a power of 2 and fits into

vortex-cuda/benches/for_cuda.rs

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ fn make_for_array_u64(len: usize) -> FoRArray {
8383
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
8484
fn launch_for_kernel_timed_u8(
8585
for_array: &FoRArray,
86-
device_data: cudarc::driver::CudaSlice<u8>,
86+
device_data: &cudarc::driver::CudaSlice<u8>,
8787
reference: u8,
8888
cuda_ctx: &mut CudaExecutionCtx,
8989
) -> vortex_error::VortexResult<Duration> {
@@ -93,7 +93,7 @@ fn launch_for_kernel_timed_u8(
9393
execution_ctx: cuda_ctx,
9494
module: "for",
9595
ptypes: &[for_array.ptype()],
96-
launch_args: [device_data, reference, array_len_u64],
96+
launch_args: [*device_data, reference, array_len_u64],
9797
event_recording: CU_EVENT_BLOCKING_SYNC,
9898
array_len: for_array.len()
9999
);
@@ -109,7 +109,7 @@ fn launch_for_kernel_timed_u8(
109109
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
110110
fn launch_for_kernel_timed_u16(
111111
for_array: &FoRArray,
112-
device_data: cudarc::driver::CudaSlice<u16>,
112+
device_data: &cudarc::driver::CudaSlice<u16>,
113113
reference: u16,
114114
cuda_ctx: &mut CudaExecutionCtx,
115115
) -> vortex_error::VortexResult<Duration> {
@@ -119,7 +119,7 @@ fn launch_for_kernel_timed_u16(
119119
execution_ctx: cuda_ctx,
120120
module: "for",
121121
ptypes: &[for_array.ptype()],
122-
launch_args: [device_data, reference, array_len_u64],
122+
launch_args: [*device_data, reference, array_len_u64],
123123
event_recording: CU_EVENT_BLOCKING_SYNC,
124124
array_len: for_array.len()
125125
);
@@ -135,7 +135,7 @@ fn launch_for_kernel_timed_u16(
135135
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
136136
fn launch_for_kernel_timed_u32(
137137
for_array: &FoRArray,
138-
device_data: cudarc::driver::CudaSlice<u32>,
138+
device_data: &cudarc::driver::CudaSlice<u32>,
139139
reference: u32,
140140
cuda_ctx: &mut CudaExecutionCtx,
141141
) -> vortex_error::VortexResult<Duration> {
@@ -145,7 +145,7 @@ fn launch_for_kernel_timed_u32(
145145
execution_ctx: cuda_ctx,
146146
module: "for",
147147
ptypes: &[for_array.ptype()],
148-
launch_args: [device_data, reference, array_len_u64],
148+
launch_args: [*device_data, reference, array_len_u64],
149149
event_recording: CU_EVENT_BLOCKING_SYNC,
150150
array_len: for_array.len()
151151
);
@@ -161,7 +161,7 @@ fn launch_for_kernel_timed_u32(
161161
/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
162162
fn launch_for_kernel_timed_u64(
163163
for_array: &FoRArray,
164-
device_data: cudarc::driver::CudaSlice<u64>,
164+
device_data: &cudarc::driver::CudaSlice<u64>,
165165
reference: u64,
166166
cuda_ctx: &mut CudaExecutionCtx,
167167
) -> vortex_error::VortexResult<Duration> {
@@ -171,7 +171,7 @@ fn launch_for_kernel_timed_u64(
171171
execution_ctx: cuda_ctx,
172172
module: "for",
173173
ptypes: &[for_array.ptype()],
174-
launch_args: [device_data, reference, array_len_u64],
174+
launch_args: [*device_data, reference, array_len_u64],
175175
event_recording: CU_EVENT_BLOCKING_SYNC,
176176
array_len: for_array.len()
177177
);
@@ -210,12 +210,12 @@ fn benchmark_for_u8(c: &mut Criterion) {
210210

211211
for _ in 0..iters {
212212
let device_data = cuda_ctx
213-
.to_device(unpacked_slice)
213+
.copy_buffer_to_device(unpacked_slice)
214214
.vortex_expect("failed to copy to device");
215215

216216
let kernel_time = launch_for_kernel_timed_u8(
217217
for_array,
218-
device_data,
218+
device_data.cuda_slice(),
219219
reference,
220220
&mut cuda_ctx,
221221
)
@@ -259,12 +259,12 @@ fn benchmark_for_u16(c: &mut Criterion) {
259259

260260
for _ in 0..iters {
261261
let device_data = cuda_ctx
262-
.to_device(unpacked_slice)
262+
.copy_buffer_to_device(unpacked_slice)
263263
.vortex_expect("failed to copy to device");
264264

265265
let kernel_time = launch_for_kernel_timed_u16(
266266
for_array,
267-
device_data,
267+
device_data.cuda_slice(),
268268
reference,
269269
&mut cuda_ctx,
270270
)
@@ -308,12 +308,12 @@ fn benchmark_for_u32(c: &mut Criterion) {
308308

309309
for _ in 0..iters {
310310
let device_data = cuda_ctx
311-
.to_device(unpacked_slice)
311+
.copy_buffer_to_device(unpacked_slice)
312312
.vortex_expect("failed to copy to device");
313313

314314
let kernel_time = launch_for_kernel_timed_u32(
315315
for_array,
316-
device_data,
316+
device_data.cuda_slice(),
317317
reference,
318318
&mut cuda_ctx,
319319
)
@@ -357,12 +357,12 @@ fn benchmark_for_u64(c: &mut Criterion) {
357357

358358
for _ in 0..iters {
359359
let device_data = cuda_ctx
360-
.to_device(unpacked_slice)
360+
.copy_buffer_to_device(unpacked_slice)
361361
.vortex_expect("failed to copy to device");
362362

363363
let kernel_time = launch_for_kernel_timed_u64(
364364
for_array,
365-
device_data,
365+
device_data.cuda_slice(),
366366
reference,
367367
&mut cuda_ctx,
368368
)

vortex-cuda/src/device_buffer.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::fmt::Debug;
5+
use std::hash::Hash;
6+
use std::hash::Hasher;
7+
use std::ops::Range;
8+
use std::sync::Arc;
9+
10+
use cudarc::driver::CudaSlice;
11+
use cudarc::driver::DeviceRepr;
12+
use vortex_array::buffer::DeviceBuffer;
13+
use vortex_buffer::Alignment;
14+
use vortex_buffer::BufferMut;
15+
use vortex_buffer::ByteBuffer;
16+
use vortex_error::VortexResult;
17+
use vortex_error::vortex_err;
18+
19+
/// A CUDA device buffer wrapping a [`CudaSlice<T>`].
20+
pub struct CudaDeviceBuffer<T> {
21+
cuda_slice: CudaSlice<T>,
22+
}
23+
24+
impl<T> CudaDeviceBuffer<T> {
25+
/// Creates a new CUDA device buffer from a [`CudaSlice`].
26+
pub fn new(cuda_slice: CudaSlice<T>) -> Self {
27+
Self { cuda_slice }
28+
}
29+
30+
/// Returns a reference to the underlying [`CudaSlice<T>`].
31+
pub fn cuda_slice(&self) -> &CudaSlice<T> {
32+
&self.cuda_slice
33+
}
34+
}
35+
36+
impl<T> Debug for CudaDeviceBuffer<T> {
37+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38+
f.debug_struct("CudaDeviceBuffer")
39+
.field(
40+
"address",
41+
&(&raw const self.cuda_slice as *const _ as usize),
42+
)
43+
.field("num_bytes", &self.cuda_slice.num_bytes())
44+
.finish()
45+
}
46+
}
47+
48+
impl<T: 'static> Hash for CudaDeviceBuffer<T> {
49+
/// Hash the buffer pointer address.
50+
fn hash<H: Hasher>(&self, state: &mut H) {
51+
(&raw const self.cuda_slice).hash(state);
52+
}
53+
}
54+
55+
impl<T: 'static> PartialEq for CudaDeviceBuffer<T> {
56+
/// Compares two buffers by pointer address.
57+
fn eq(&self, other: &Self) -> bool {
58+
std::ptr::eq(&raw const self.cuda_slice, &raw const other.cuda_slice)
59+
}
60+
}
61+
62+
impl<T: DeviceRepr + Clone + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T> {
63+
/// Returns the number of elements in the CUDA device buffer of type T.
64+
fn len(&self) -> usize {
65+
self.cuda_slice.len()
66+
}
67+
68+
/// Copies the CUDA device buffer to host memory.
69+
///
70+
/// Allocates a host buffer with the specified alignment and copies the data
71+
/// from the device to the host. The operation is implicitly synchronized
72+
/// when the underlying event is dropped.
73+
///
74+
/// # Arguments
75+
///
76+
/// * `alignment` - The byte alignment for the allocated host buffer.
77+
///
78+
/// # Returns
79+
///
80+
/// A `ByteBuffer` containing the copied data, or an error if the copy fails.
81+
fn copy_to_host(&self, alignment: Alignment) -> VortexResult<ByteBuffer> {
82+
let len = self.cuda_slice.len();
83+
let mut host_buffer = BufferMut::<T>::with_capacity_aligned(len, alignment);
84+
85+
// TODO(0ax1): Make the memcpy to host async. Even though `memcpy_dtoh`
86+
// calls into `memcpy_dtoh_async`, it implicitly calls synchronize on the
87+
// stream when dropping the `SyncOnDrop` `_record_dst` event at the end
88+
// of the function.
89+
self.cuda_slice
90+
.stream()
91+
.memcpy_dtoh(&self.cuda_slice, unsafe {
92+
// SAFETY: We allocated sufficient capacity and fill the entire buffer.
93+
host_buffer.set_len(len);
94+
host_buffer.as_mut_slice()
95+
})
96+
.map_err(|e| vortex_err!("Failed to copy from device to host: {}", e))?;
97+
98+
Ok(host_buffer.freeze().into_byte_buffer())
99+
}
100+
101+
/// Slices the CUDA device buffer to a subrange.
102+
fn slice(&self, _range: Range<usize>) -> Arc<dyn DeviceBuffer> {
103+
// TODO(0ax1): impl slice on CUDA slice
104+
unimplemented!("CudaDeviceBuffer::slice is not yet implemented")
105+
}
106+
}

vortex-cuda/src/executor.rs

Lines changed: 15 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,12 @@ use vortex_array::Array;
1717
use vortex_array::ArrayRef;
1818
use vortex_array::Canonical;
1919
use vortex_array::VortexSessionExecute;
20-
use vortex_buffer::Alignment;
21-
use vortex_buffer::Buffer;
22-
use vortex_buffer::BufferMut;
2320
use vortex_dtype::PType;
2421
use vortex_error::VortexResult;
2522
use vortex_error::vortex_err;
2623
use vortex_session::VortexSession;
2724

25+
use crate::CudaDeviceBuffer;
2826
use crate::CudaSession;
2927
use crate::session::CudaSessionExt;
3028

@@ -161,53 +159,6 @@ impl CudaExecutionCtx {
161159
}
162160
}
163161

164-
/// Copies data from host to device.
165-
pub fn to_device<T: DeviceRepr>(&self, data: &[T]) -> VortexResult<CudaSlice<T>> {
166-
// TODO(0ax1): Make the memcopy to device async. Even though `memcpy_htod`
167-
// uses into `memcpy_htod_async`, it implicitly calls synchronize on the
168-
// stream when dropping the `SyncOnDrop` `_record_dst` event at the end
169-
// of the function.
170-
self.stream
171-
.clone_htod(data)
172-
.map_err(|e| vortex_err!("Failed to copy to device: {}", e))
173-
}
174-
175-
/// Copies data from device to host.
176-
///
177-
/// Returns a `Buffer<T>` with the specified alignment.
178-
pub fn to_host<T: DeviceRepr>(
179-
&self,
180-
buffer: &CudaSlice<T>,
181-
alignment: Alignment,
182-
) -> VortexResult<Buffer<T>> {
183-
let len = buffer.len();
184-
let mut host_buffer = BufferMut::<T>::with_capacity_aligned(len, alignment);
185-
186-
// TODO(0ax1): Make the memcopy to host async. Even though `memcpy_dtoh`
187-
// uses into `memcpy_dtoh_async`, it implicitly calls synchronize on the
188-
// stream when dropping the `SyncOnDrop` `_record_dst` event at the end
189-
// of the function.
190-
self.stream
191-
.memcpy_dtoh(buffer, unsafe {
192-
// SAFETY: We allocated with sufficient capacity and fill the entire buffer.
193-
host_buffer.set_len(len);
194-
host_buffer.as_mut_slice()
195-
})
196-
.map_err(|e| vortex_err!("Failed to copy from device: {}", e))?;
197-
198-
Ok(host_buffer.freeze())
199-
}
200-
201-
/// Synchronizes the stream
202-
///
203-
/// On `synchronize` the host waits for all pending operations of the stream to complete.
204-
#[cfg(test)]
205-
pub fn synchronize(&self) -> VortexResult<()> {
206-
self.stream
207-
.synchronize()
208-
.map_err(|e| vortex_err!("Failed to synchronize device: {}", e))
209-
}
210-
211162
/// Loads a CUDA kernel function by module name and ptype(s).
212163
///
213164
/// # Arguments
@@ -232,6 +183,20 @@ impl CudaExecutionCtx {
232183
pub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a> {
233184
self.stream.launch_builder(func)
234185
}
186+
187+
/// Copies host data to the device, returning a [`CudaDeviceBuffer`].
188+
///
189+
/// This is the primary way to get data onto the GPU for kernel execution.
190+
pub fn copy_buffer_to_device<T: DeviceRepr + Clone + Send + Sync + 'static>(
191+
&self,
192+
data: &[T],
193+
) -> VortexResult<CudaDeviceBuffer<T>> {
194+
let cuda_slice = self
195+
.stream
196+
.clone_htod(data)
197+
.map_err(|e| vortex_err!("Failed to copy to device: {}", e))?;
198+
Ok(CudaDeviceBuffer::new(cuda_slice))
199+
}
235200
}
236201

237202
/// Support trait for CUDA-accelerated decompression of arrays.

0 commit comments

Comments
 (0)