@@ -2,12 +2,12 @@ use crate::error::{CudaResult, DropResult, ToResult};
 use crate::memory::device::AsyncCopyDestination;
 use crate::memory::device::CopyDestination;
 use crate::memory::malloc::{cuda_free, cuda_malloc};
-use crate::memory::DeviceCopy;
 use crate::memory::DevicePointer;
+use crate::memory::{cuda_free_async, cuda_malloc_async, DeviceCopy};
 use crate::stream::Stream;
 use crate::sys as cuda;
 use std::fmt::{self, Pointer};
-use std::mem;
+use std::mem::{self, ManuallyDrop};
 use std::os::raw::c_void;
@@ -39,6 +39,89 @@ impl<T: DeviceCopy> DeviceBox<T> {
         dev_box.copy_from(val)?;
         Ok(dev_box)
     }
+
+    /// Allocates device memory asynchronously and asynchronously copies `val` into it.
+    ///
+    /// This doesn't actually allocate if `T` is zero-sized.
+    ///
+    /// If the memory behind `val` is not page-locked (pinned), a staging buffer
+    /// will be allocated using a worker thread. If you are going to be making
+    /// many asynchronous copies, it is generally a good idea to keep the data as a [`cust::memory::LockedBuffer`]
+    /// or [`cust::memory::LockedBox`]. This will ensure the driver does not have to allocate a staging buffer
+    /// on its own.
+    ///
+    /// However, don't keep all of your data page-locked; doing so might slow down
+    /// the OS because it is unable to page out that memory to disk.
+    ///
+    /// # Safety
+    ///
+    /// This method enqueues two operations on the stream: an async allocation
+    /// and an async memcpy. Because of this, you must ensure that:
+    /// - The memory is not used in any way before it is actually allocated on the stream. You
+    ///   can ensure this happens by synchronizing the stream explicitly or using events.
+    /// - `val` is still valid when the memory copy actually takes place.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// # let _context = cust::quick_init().unwrap();
+    /// use cust::{memory::*, stream::*};
+    /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
+    /// let mut host_val = 0;
+    /// unsafe {
+    ///     let mut allocated = DeviceBox::new_async(&5u8, &stream)?;
+    ///     allocated.async_copy_to(&mut host_val, &stream)?;
+    ///     allocated.drop_async(&stream)?;
+    /// }
+    /// // ensure all async ops are done before trying to access the value
+    /// stream.synchronize()?;
+    /// assert_eq!(host_val, 5);
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub unsafe fn new_async(val: &T, stream: &Stream) -> CudaResult<Self> {
+        let mut dev_box = DeviceBox::uninitialized()?;
+        dev_box.async_copy_from(val, stream)?;
+        Ok(dev_box)
+    }
+
+    /// Enqueues an operation to free the memory backed by this [`DeviceBox`] on a
+    /// particular stream. The stream will free the allocation as soon as it reaches
+    /// the operation in the stream. You can ensure the memory is freed by synchronizing
+    /// the stream.
+    ///
+    /// This function uses internal memory pool semantics. Async allocations will reserve memory
+    /// in the default memory pool in the stream, and async frees will release the memory back to the pool
+    /// for further use by async allocations.
+    ///
+    /// The memory inside of the pool is all freed back to the OS once the stream is synchronized, unless
+    /// a custom pool is configured to not do so.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// # let _context = cust::quick_init().unwrap();
+    /// use cust::{memory::*, stream::*};
+    /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
+    /// let mut host_val = 0;
+    /// unsafe {
+    ///     let mut allocated = DeviceBox::new_async(&5u8, &stream)?;
+    ///     allocated.async_copy_to(&mut host_val, &stream)?;
+    ///     allocated.drop_async(&stream)?;
+    /// }
+    /// // ensure all async ops are done before trying to access the value
+    /// stream.synchronize()?;
+    /// assert_eq!(host_val, 5);
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn drop_async(self, stream: &Stream) -> CudaResult<()> {
+        // make sure we don't run the normal destructor, otherwise a double free will happen
+        let me = ManuallyDrop::new(self);
+        // SAFETY: we consume the box, so it's not possible to use the box past its drop point unless
+        // you keep around a pointer, but in that case, we cannot guarantee safety.
+        unsafe { cuda_free_async(stream, me.ptr) }
+    }
 }

 impl<T: DeviceCopy + Default> DeviceBox<T> {
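
The `new_async` docs above recommend keeping host data page-locked so the driver can skip its internal staging buffer. Below is a minimal illustrative sketch of that pattern, not taken from this diff; it assumes cust keeps RustaCUDA's `LockedBuffer::new(&init, len)` constructor and its slice-style indexing.

```rust
// Hypothetical usage sketch: pairing the new stream-ordered allocation with
// page-locked host memory, as suggested by the `new_async` docs.
// Assumes a RustaCUDA-style `LockedBuffer::new(&init, len)` API in cust.
use cust::{memory::*, stream::*};

fn pinned_round_trip() -> Result<(), Box<dyn std::error::Error>> {
    let _ctx = cust::quick_init()?;
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;

    // Page-locked (pinned) host storage, so no staging buffer is needed
    // for the async host-to-device copy.
    let mut host = LockedBuffer::new(&42u32, 1)?;

    unsafe {
        // Async alloc + async H2D copy, both enqueued on `stream`.
        let boxed = DeviceBox::new_async(&host[0], &stream)?;
        // Async D2H copy back into the same pinned buffer.
        boxed.async_copy_to(&mut host[0], &stream)?;
        // Stream-ordered free: the memory returns to the stream's pool.
        boxed.drop_async(&stream)?;
    }

    // None of the work above is guaranteed to have happened yet.
    stream.synchronize()?;
    assert_eq!(host[0], 42);
    Ok(())
}
```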
@@ -50,6 +133,76 @@ impl<T: DeviceCopy + Default> DeviceBox<T> {
     }
 }

+#[cfg(feature = "bytemuck")]
+impl<T: DeviceCopy + bytemuck::Zeroable> DeviceBox<T> {
+    /// Allocate device memory and fill it with zeroes (`0u8`).
+    ///
+    /// This doesn't actually allocate if `T` is zero-sized.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # let _context = cust::quick_init().unwrap();
+    /// use cust::memory::*;
+    /// let mut zero = DeviceBox::zeroed().unwrap();
+    /// let mut value = 5u64;
+    /// zero.copy_to(&mut value).unwrap();
+    /// assert_eq!(0, value);
+    /// ```
+    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
+    pub fn zeroed() -> CudaResult<Self> {
+        unsafe {
+            let mut new_box = DeviceBox::uninitialized()?;
+            if mem::size_of::<T>() != 0 {
+                cuda::cuMemsetD8_v2(
+                    new_box.as_device_ptr().as_raw_mut() as u64,
+                    0,
+                    mem::size_of::<T>(),
+                )
+                .to_result()?;
+            }
+            Ok(new_box)
+        }
+    }
+
+    /// Allocates device memory asynchronously and asynchronously fills it with zeroes (`0u8`).
+    ///
+    /// This doesn't actually allocate if `T` is zero-sized.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// # let _context = cust::quick_init().unwrap();
+    /// use cust::{memory::*, stream::*};
+    /// let stream = Stream::new(StreamFlags::DEFAULT, None)?;
+    /// let mut value = 5u64;
+    /// unsafe {
+    ///     let mut zero = DeviceBox::zeroed_async(&stream)?;
+    ///     zero.async_copy_to(&mut value, &stream)?;
+    ///     zero.drop_async(&stream)?;
+    /// }
+    /// stream.synchronize()?;
+    /// assert_eq!(value, 0);
+    /// # Ok(())
+    /// # }
+    /// ```
+    #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
+    pub unsafe fn zeroed_async(stream: &Stream) -> CudaResult<Self> {
+        let mut new_box = DeviceBox::uninitialized_async(stream)?;
+        if mem::size_of::<T>() != 0 {
+            cuda::cuMemsetD8Async(
+                new_box.as_device_ptr().as_raw_mut() as u64,
+                0,
+                mem::size_of::<T>(),
+                stream.as_inner(),
+            )
+            .to_result()?;
+        }
+        Ok(new_box)
+    }
+}
+
 impl<T> DeviceBox<T> {
     /// Allocate device memory, but do not initialize it.
     ///
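
The `bytemuck`-gated constructors above only ask for `T: DeviceCopy + bytemuck::Zeroable`. Here is a rough sketch of opting a plain-old-data type into `DeviceBox::zeroed`; it is illustrative only, assumes `bytemuck` is a direct dependency, and spells out the unsafe impls instead of relying on derive macros.

```rust
// Hypothetical sketch: making a user-defined POD type usable with the new
// `DeviceBox::zeroed` constructor. The unsafe impls are written out by hand
// here; in practice derives may be preferable.
use cust::memory::*;

#[repr(C)]
#[derive(Clone, Copy, Debug, Default, PartialEq)]
struct Vec3 {
    x: f32,
    y: f32,
    z: f32,
}

// SAFETY: Vec3 is plain old data (no pointers, references, or drop glue),
// so bitwise copies to the device are sound and all-zero bytes are a valid value.
unsafe impl DeviceCopy for Vec3 {}
unsafe impl bytemuck::Zeroable for Vec3 {}

fn zeroed_vec3() -> Result<(), Box<dyn std::error::Error>> {
    let _ctx = cust::quick_init()?;
    // The device memory is memset to 0u8, which `Zeroable` promises is a valid Vec3.
    let zeroed = DeviceBox::<Vec3>::zeroed()?;
    let mut host = Vec3 { x: 1.0, y: 2.0, z: 3.0 };
    zeroed.copy_to(&mut host)?;
    assert_eq!(host, Vec3::default());
    Ok(())
}
```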
@@ -80,37 +233,27 @@ impl<T> DeviceBox<T> {
         }
     }

-    /// Allocate device memory and fill it with zeroes (`0u8`).
+    /// Allocates device memory asynchronously on a stream, without initializing it.
     ///
-    /// This doesn't actually allocate if `T` is zero-sized.
+    /// This doesn't actually allocate if `T` is zero-sized.
     ///
     /// # Safety
     ///
-    /// The backing memory is zeroed, which may not be a valid bit-pattern for type `T`. The caller
-    /// must ensure either that all-zeroes is a valid bit-pattern for type `T` or that the backing
-    /// memory is set to a valid value before it is read.
-    ///
-    /// # Examples
+    /// The allocated memory retains all of the unsafety of [`DeviceBox::uninitialized`], with
+    /// the additional consideration that the memory cannot be used until it is actually allocated
+    /// on the stream. This means proper stream ordering semantics must be followed, such as
+    /// only enqueueing kernel launches that use the memory AFTER the allocation call.
     ///
-    /// ```
-    /// # let _context = cust::quick_init().unwrap();
-    /// use cust::memory::*;
-    /// let mut zero = unsafe { DeviceBox::zeroed().unwrap() };
-    /// let mut value = 5u64;
-    /// zero.copy_to(&mut value).unwrap();
-    /// assert_eq!(0, value);
-    /// ```
-    pub unsafe fn zeroed() -> CudaResult<Self> {
-        let mut new_box = DeviceBox::uninitialized()?;
-        if mem::size_of::<T>() != 0 {
-            cuda::cuMemsetD8_v2(
-                new_box.as_device_ptr().as_raw_mut() as u64,
-                0,
-                mem::size_of::<T>(),
-            )
-            .to_result()?;
+    /// You can synchronize the stream to ensure the memory allocation operation is complete.
+    pub unsafe fn uninitialized_async(stream: &Stream) -> CudaResult<Self> {
+        if mem::size_of::<T>() == 0 {
+            Ok(DeviceBox {
+                ptr: DevicePointer::null(),
+            })
+        } else {
+            let ptr = cuda_malloc_async(stream, 1)?;
+            Ok(DeviceBox { ptr })
         }
-        Ok(new_box)
     }

     /// Constructs a DeviceBox from a raw pointer.
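
Below is a minimal sketch of the stream-ordering contract described in the Safety section above: every operation that touches the allocation is enqueued on the same stream after `uninitialized_async`, and the host only inspects the result after synchronizing. It uses only APIs introduced in this diff plus `quick_init` and `Stream`; treat it as illustrative rather than as a doctest from the crate.

```rust
// Illustrative sketch: stream-ordered allocate -> initialize -> read back -> free.
use cust::{memory::*, stream::*};

fn ordered_uninitialized() -> Result<(), Box<dyn std::error::Error>> {
    let _ctx = cust::quick_init()?;
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
    let seven = 7u64;
    let mut readback = 0u64;

    unsafe {
        // 1. Enqueue the allocation; the memory only exists once the stream
        //    actually reaches this operation.
        let mut boxed = DeviceBox::<u64>::uninitialized_async(&stream)?;
        // 2. Initialize it with an async host-to-device copy on the same stream.
        boxed.async_copy_from(&seven, &stream)?;
        // 3. Read it back, again in stream order.
        boxed.async_copy_to(&mut readback, &stream)?;
        // 4. Return the allocation to the stream's memory pool.
        boxed.drop_async(&stream)?;
    }

    // `readback` is only safe to inspect after the stream has caught up.
    stream.synchronize()?;
    assert_eq!(readback, 7);
    Ok(())
}
```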
@@ -318,6 +461,35 @@ impl<T: DeviceCopy> CopyDestination<DeviceBox<T>> for DeviceBox<T> {
         Ok(())
     }
 }
+impl<T: DeviceCopy> AsyncCopyDestination<T> for DeviceBox<T> {
+    unsafe fn async_copy_from(&mut self, val: &T, stream: &Stream) -> CudaResult<()> {
+        let size = mem::size_of::<T>();
+        if size != 0 {
+            cuda::cuMemcpyHtoDAsync_v2(
+                self.ptr.as_raw_mut() as u64,
+                val as *const _ as *const c_void,
+                size,
+                stream.as_inner(),
+            )
+            .to_result()?
+        }
+        Ok(())
+    }
+
+    unsafe fn async_copy_to(&self, val: &mut T, stream: &Stream) -> CudaResult<()> {
+        let size = mem::size_of::<T>();
+        if size != 0 {
+            cuda::cuMemcpyDtoHAsync_v2(
+                val as *mut _ as *mut c_void,
+                self.ptr.as_raw() as u64,
+                size,
+                stream.as_inner(),
+            )
+            .to_result()?
+        }
+        Ok(())
+    }
+}
 impl<T: DeviceCopy> AsyncCopyDestination<DeviceBox<T>> for DeviceBox<T> {
     unsafe fn async_copy_from(&mut self, val: &DeviceBox<T>, stream: &Stream) -> CudaResult<()> {
         let size = mem::size_of::<T>();