Skip to content

Commit 605dcb0

Browse files
committed
Feat: start work on async allocation/copying/dropping
1 parent f8627da commit 605dcb0

File tree

9 files changed

+721
-373
lines changed

9 files changed

+721
-373
lines changed

crates/cust/CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@ overall simplifying the context handling APIs. This does mean that the API chang
1717
The old context handling is fully present in `cust::context::legacy` for anyone who needs it for specific reasons. If you use `quick_init` you don't need to worry about
1818
any breaking changes, the API is the same.
1919

20+
- Added `cust::memory::LockedBox`, same as `LockedBuffer` except for single elements.
21+
- Added `cust::memory::cuda_malloc_async`.
22+
- Added `cust::memory::cuda_free_async`.
23+
- Added `impl AsyncCopyDestination<LockedBox<T>> for DeviceBox<T>` for async HtoD memcpy.
24+
- Added the `bytemuck` feature which is enabled by default.
25+
- `zeroed` functions on `DeviceBox` and others are no longer unsafe and instead now require `T: Zeroable`. The functions are only available with the `bytemuck` feature.
26+
- Added `zeroed_async` to `DeviceBox`.
27+
- Added `drop_async` to `DeviceBox`.
28+
- Added `new_async` to `DeviceBox`.
29+
2030
## 0.2.2 - 12/5/21
2131

2232
- Update find_cuda_helper to 0.2

crates/cust/Cargo.toml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,32 @@
22
name = "cust"
33
version = "0.2.2"
44
# Big thanks to the original author of rustacuda <3
5-
authors = ["Riccardo D'Ambrosio <[email protected]>", "Brook Heisler <[email protected]>"]
5+
authors = [
6+
"Riccardo D'Ambrosio <[email protected]>",
7+
"Brook Heisler <[email protected]>",
8+
]
69
edition = "2018"
710
license = "MIT OR Apache-2.0"
811
description = "High level bindings to the CUDA Driver API"
912
repository = "https://github.com/Rust-GPU/Rust-CUDA"
1013
readme = "../../README.md"
1114

1215
[dependencies]
13-
cust_raw = { path = "../cust_raw", version = "0.11.2"}
16+
cust_raw = { path = "../cust_raw", version = "0.11.2" }
1417
bitflags = "1.2"
1518
cust_derive = { path = "../cust_derive", version = "0.1" }
1619
num-complex = { version = "0.4", optional = true }
1720
vek = { version = "0.15.1", optional = true, default-features = false }
21+
bytemuck = { version = "1.7.3", optional = true }
22+
23+
[features]
24+
default = ["bytemuck"]
1825

1926
[build-dependencies]
2027
find_cuda_helper = { path = "../find_cuda_helper", version = "0.2" }
2128

2229
[dev-dependencies]
2330
image = "0.23.14"
31+
32+
[package.metadata.docs.rs]
33+
rustdoc-args = ["--cfg", "docsrs"]

crates/cust/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454
//! Cust will try to find the CUDA libraries automatically, if it is unable to find it, you can set
5555
//! `CUDA_LIBRARY_PATH` to some path manually.
5656
57+
#![cfg_attr(docsrs, feature(doc_cfg))]
58+
5759
pub mod device;
5860
pub mod error;
5961
pub mod event;

crates/cust/src/memory/device/device_box.rs

Lines changed: 199 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ use crate::error::{CudaResult, DropResult, ToResult};
22
use crate::memory::device::AsyncCopyDestination;
33
use crate::memory::device::CopyDestination;
44
use crate::memory::malloc::{cuda_free, cuda_malloc};
5-
use crate::memory::DeviceCopy;
65
use crate::memory::DevicePointer;
6+
use crate::memory::{cuda_free_async, cuda_malloc_async, DeviceCopy};
77
use crate::stream::Stream;
88
use crate::sys as cuda;
99
use std::fmt::{self, Pointer};
10-
use std::mem;
10+
use std::mem::{self, ManuallyDrop};
1111

1212
use std::os::raw::c_void;
1313

@@ -39,6 +39,89 @@ impl<T: DeviceCopy> DeviceBox<T> {
3939
dev_box.copy_from(val)?;
4040
Ok(dev_box)
4141
}
42+
43+
/// Allocates device memory asynchronously and asynchronously copies `val` into it.
44+
///
45+
/// This doesn't actually allocate if `T` is zero-sized.
46+
///
47+
/// If the memory behind `val` is not page-locked (pinned), a staging buffer
48+
/// will be allocated using a worker thread. If you are going to be making
49+
/// many asynchronous copies, it is generally a good idea to keep the data as a [`cust::memory::LockedBuffer`]
50+
/// or [`cust::memory::LockedBox`]. This will ensure the driver does not have to allocate a staging buffer
51+
/// on its own.
52+
///
53+
/// However, don't keep all of your data as page-locked, doing so might slow down
54+
/// the OS because it is unable to page out that memory to disk.
55+
///
56+
/// # Safety
57+
///
58+
/// This method enqueues two operations on the stream: An async allocation
59+
/// and an async memcpy. Because of this, you must ensure that:
60+
/// - The memory is not used in any way before it is actually allocated on the stream. You
61+
/// can ensure this happens by synchronizing the stream explicitly or using events.
62+
/// - `val` is still valid when the memory copy actually takes place.
63+
///
64+
/// # Examples
65+
///
66+
/// ```
67+
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
68+
/// # let _context = cust::quick_init().unwrap();
69+
/// use cust::{memory::*, stream::*};
70+
/// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
71+
/// let mut host_val = 0;
72+
/// unsafe {
73+
/// let mut allocated = DeviceBox::new_async(&5u8, &stream)?;
74+
/// allocated.async_copy_to(&mut host_val, &stream)?;
75+
/// allocated.drop_async(&stream)?;
76+
/// }
77+
/// // ensure all async ops are done before trying to access the value
78+
/// stream.synchronize()?;
79+
/// assert_eq!(host_val, 5);
80+
/// # Ok(())
81+
/// # }
82+
pub unsafe fn new_async(val: &T, stream: &Stream) -> CudaResult<Self> {
83+
let mut dev_box = DeviceBox::uninitialized()?;
84+
dev_box.async_copy_from(val, stream)?;
85+
Ok(dev_box)
86+
}
87+
88+
/// Enqueues an operation to free the memory backed by this [`DeviceBox`] on a
89+
/// particular stream. The stream will free the allocation as soon as it reaches
90+
/// the operation in the stream. You can ensure the memory is freed by synchronizing
91+
/// the stream.
92+
///
93+
/// This function uses internal memory pool semantics. Async allocations will reserve memory
94+
/// in the default memory pool in the stream, and async frees will release the memory back to the pool
95+
/// for further use by async allocations.
96+
///
97+
/// The memory inside of the pool is all freed back to the OS once the stream is synchronized unless
98+
/// a custom pool is configured to not do so.
99+
///
100+
/// # Examples
101+
///
102+
/// ```
103+
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
104+
/// # let _context = cust::quick_init().unwrap();
105+
/// use cust::{memory::*, stream::*};
106+
/// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
107+
/// let mut host_val = 0;
108+
/// unsafe {
109+
/// let mut allocated = DeviceBox::new_async(&5u8, &stream)?;
110+
/// allocated.async_copy_to(&mut host_val, &stream)?;
111+
/// allocated.drop_async(&stream)?;
112+
/// }
113+
/// // ensure all async ops are done before trying to access the value
114+
/// stream.synchronize()?;
115+
/// assert_eq!(host_val, 5);
116+
/// # Ok(())
117+
/// # }
118+
pub fn drop_async(self, stream: &Stream) -> CudaResult<()> {
119+
// make sure we dont run the normal destructor, otherwise a double drop will happen
120+
let me = ManuallyDrop::new(self);
121+
// SAFETY: we consume the box so its not possible to use the box past its drop point unless
122+
// you keep around a pointer, but in that case, we cannot guarantee safety.
123+
unsafe { cuda_free_async(stream, me.ptr) }
124+
}
42125
}
43126

44127
impl<T: DeviceCopy + Default> DeviceBox<T> {
@@ -50,6 +133,76 @@ impl<T: DeviceCopy + Default> DeviceBox<T> {
50133
}
51134
}
52135

136+
#[cfg(feature = "bytemuck")]
impl<T: DeviceCopy + bytemuck::Zeroable> DeviceBox<T> {
    /// Allocates device memory and fills it with zeroes (`0u8`).
    ///
    /// This doesn't actually allocate if `T` is zero-sized.
    ///
    /// # Errors
    ///
    /// Returns an error if the allocation or the memset fails.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// let mut zero = DeviceBox::zeroed().unwrap();
    /// let mut value = 5u64;
    /// zero.copy_to(&mut value).unwrap();
    /// assert_eq!(0, value);
    /// ```
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub fn zeroed() -> CudaResult<Self> {
        // SAFETY: `T: Zeroable` means the all-zero bit pattern is a valid `T`, and the
        // memset writes exactly `size_of::<T>()` bytes into the allocation this box owns
        // before the box is handed to the caller.
        unsafe {
            let mut new_box = DeviceBox::uninitialized()?;
            if mem::size_of::<T>() != 0 {
                cuda::cuMemsetD8_v2(
                    new_box.as_device_ptr().as_raw_mut() as u64,
                    0,
                    mem::size_of::<T>(),
                )
                .to_result()?;
            }
            Ok(new_box)
        }
    }

    /// Allocates device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
    ///
    /// This doesn't actually allocate if `T` is zero-sized.
    ///
    /// # Safety
    ///
    /// This method enqueues an async allocation followed by an async memset on `stream`.
    /// The memory must not be used in any way before those operations have actually run
    /// on the stream; see [`DeviceBox::uninitialized_async`] for the ordering requirements.
    /// You can synchronize the stream to ensure both operations are complete.
    ///
    /// # Errors
    ///
    /// Returns an error if enqueuing the allocation or the memset fails.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::{memory::*, stream::*};
    /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
    /// let mut value = 5u64;
    /// unsafe {
    ///     let mut zero = DeviceBox::zeroed_async(&stream)?;
    ///     zero.async_copy_to(&mut value, &stream)?;
    ///     zero.drop_async(&stream)?;
    /// }
    /// stream.synchronize()?;
    /// assert_eq!(value, 0);
    /// # Ok(())
    /// # }
    /// ```
    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
    pub unsafe fn zeroed_async(stream: &Stream) -> CudaResult<Self> {
        let mut new_box = DeviceBox::uninitialized_async(stream)?;
        if mem::size_of::<T>() != 0 {
            // Enqueued on the same stream as the allocation above, so the memset is
            // ordered after it.
            cuda::cuMemsetD8Async(
                new_box.as_device_ptr().as_raw_mut() as u64,
                0,
                mem::size_of::<T>(),
                stream.as_inner(),
            )
            .to_result()?;
        }
        Ok(new_box)
    }
}
205+
53206
impl<T> DeviceBox<T> {
54207
/// Allocate device memory, but do not initialize it.
55208
///
@@ -80,37 +233,27 @@ impl<T> DeviceBox<T> {
80233
}
81234
}
82235

83-
/// Allocate device memory and fill it with zeroes (`0u8`).
236+
/// Allocates device memory asynchronously on a stream, without initializing it.
84237
///
85-
/// This doesn't actually allocate if `T` is zero-sized.
238+
/// This doesn't actually allocate if `T` is zero-sized.
86239
///
87240
/// # Safety
88241
///
89-
/// The backing memory is zeroed, which may not be a valid bit-pattern for type `T`. The caller
90-
/// must ensure either that all-zeroes is a valid bit-pattern for type `T` or that the backing
91-
/// memory is set to a valid value before it is read.
92-
///
93-
/// # Examples
242+
/// The allocated memory retains all of the unsafety of [`DeviceBox::uninitialized`], with
243+
/// the additional consideration that the memory cannot be used until it is actually allocated
244+
/// on the stream. This means proper stream ordering semantics must be followed, such as
245+
/// only enqueuing kernel launches that use the memory AFTER the allocation call.
94246
///
95-
/// ```
96-
/// # let _context = cust::quick_init().unwrap();
97-
/// use cust::memory::*;
98-
/// let mut zero = unsafe { DeviceBox::zeroed().unwrap() };
99-
/// let mut value = 5u64;
100-
/// zero.copy_to(&mut value).unwrap();
101-
/// assert_eq!(0, value);
102-
/// ```
103-
pub unsafe fn zeroed() -> CudaResult<Self> {
104-
let mut new_box = DeviceBox::uninitialized()?;
105-
if mem::size_of::<T>() != 0 {
106-
cuda::cuMemsetD8_v2(
107-
new_box.as_device_ptr().as_raw_mut() as u64,
108-
0,
109-
mem::size_of::<T>(),
110-
)
111-
.to_result()?;
247+
/// You can synchronize the stream to ensure the memory allocation operation is complete.
248+
pub unsafe fn uninitialized_async(stream: &Stream) -> CudaResult<Self> {
249+
if mem::size_of::<T>() == 0 {
250+
Ok(DeviceBox {
251+
ptr: DevicePointer::null(),
252+
})
253+
} else {
254+
let ptr = cuda_malloc_async(stream, 1)?;
255+
Ok(DeviceBox { ptr })
112256
}
113-
Ok(new_box)
114257
}
115258

116259
/// Constructs a DeviceBox from a raw pointer.
@@ -318,6 +461,35 @@ impl<T: DeviceCopy> CopyDestination<DeviceBox<T>> for DeviceBox<T> {
318461
Ok(())
319462
}
320463
}
464+
impl<T: DeviceCopy> AsyncCopyDestination<T> for DeviceBox<T> {
465+
unsafe fn async_copy_from(&mut self, val: &T, stream: &Stream) -> CudaResult<()> {
466+
let size = mem::size_of::<T>();
467+
if size != 0 {
468+
cuda::cuMemcpyHtoDAsync_v2(
469+
self.ptr.as_raw_mut() as u64,
470+
val as *const _ as *const c_void,
471+
size,
472+
stream.as_inner(),
473+
)
474+
.to_result()?
475+
}
476+
Ok(())
477+
}
478+
479+
unsafe fn async_copy_to(&self, val: &mut T, stream: &Stream) -> CudaResult<()> {
480+
let size = mem::size_of::<T>();
481+
if size != 0 {
482+
cuda::cuMemcpyDtoHAsync_v2(
483+
val as *mut _ as *mut c_void,
484+
self.ptr.as_raw() as u64,
485+
size,
486+
stream.as_inner(),
487+
)
488+
.to_result()?
489+
}
490+
Ok(())
491+
}
492+
}
321493
impl<T: DeviceCopy> AsyncCopyDestination<DeviceBox<T>> for DeviceBox<T> {
322494
unsafe fn async_copy_from(&mut self, val: &DeviceBox<T>, stream: &Stream) -> CudaResult<()> {
323495
let size = mem::size_of::<T>();

crates/cust/src/memory/device/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ pub trait CopyDestination<O: ?Sized>: crate::private::Sealed {
3838
/// buffer resulting in a data race and undefined behavior.
3939
///
4040
/// Thus to enforce safety, the following invariants must be upheld:
41-
/// * The source and destination are not deallocated
42-
/// * The source is not modified
43-
/// * The destination is not written or read by any other operation
41+
/// - The source and destination are not deallocated
42+
/// - The source is not modified
43+
/// - The destination is not written or read by any other operation
4444
///
4545
/// These invariants must be preserved until the stream is synchronized or an event queued after
4646
/// the copy is triggered.

0 commit comments

Comments
 (0)