diff --git a/crates/cudart/src/memory.rs b/crates/cudart/src/memory.rs index 6e49257..83f4357 100644 --- a/crates/cudart/src/memory.rs +++ b/crates/cudart/src/memory.rs @@ -1,14 +1,12 @@ // memory management // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html +use bitflags::bitflags; +use era_cudart_sys::*; use std::alloc::Layout; use std::mem::{self, MaybeUninit}; use std::ops::{Deref, DerefMut}; -use std::os::raw::c_void; - -use bitflags::bitflags; - -use era_cudart_sys::*; +use std::ptr::NonNull; use crate::result::{CudaResult, CudaResultWrap}; use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice}; @@ -21,22 +19,17 @@ pub struct DeviceAllocation(AllocationData); impl DeviceAllocation { pub fn alloc(length: usize) -> CudaResult { let layout = Layout::array::(length).unwrap(); - let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit(); + let mut dev_ptr = MaybeUninit::uninit(); unsafe { cudaMalloc(dev_ptr.as_mut_ptr(), layout.size()) .wrap_maybe_uninit(dev_ptr) - .map(|ptr| { - Self(AllocationData { - ptr: ptr as *mut T, - len: length, - }) - }) + .map(|ptr| Self(AllocationData::new_unchecked(ptr as _, length))) } } pub fn free(self) -> CudaResult<()> { unsafe { - let ptr = self.0.ptr as *mut c_void; + let ptr = self.0.ptr.as_ptr() as _; mem::forget(self); cudaFree(ptr).wrap() } @@ -45,11 +38,11 @@ impl DeviceAllocation { /// # Safety /// /// The caller must ensure that the inputs are valid. - pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self { - Self(AllocationData { ptr, len }) + pub unsafe fn from_raw_parts(ptr: NonNull, len: usize) -> Self { + Self(AllocationData::new(ptr, len)) } - pub fn into_raw_parts(self) -> (*mut T, usize) { + pub fn into_raw_parts(self) -> (NonNull, usize) { let result = (self.0.ptr, self.0.len); mem::forget(self); result @@ -58,7 +51,7 @@ impl DeviceAllocation { impl Drop for DeviceAllocation { fn drop(&mut self) { - unsafe { cudaFree(self.as_mut_c_void_ptr()).eprint_error_and_backtrace() }; + unsafe { cudaFree(self.0.ptr.as_ptr() as _).eprint_error_and_backtrace() }; } } @@ -103,10 +96,10 @@ impl CudaSliceMut for DeviceAllocation { bitflags! { #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct CudaHostAllocFlags: u32 { - const DEFAULT = era_cudart_sys::cudaHostAllocDefault; - const PORTABLE = era_cudart_sys::cudaHostAllocPortable; - const MAPPED = era_cudart_sys::cudaHostAllocMapped; - const WRITE_COMBINED = era_cudart_sys::cudaHostAllocWriteCombined; + const DEFAULT = cudaHostAllocDefault; + const PORTABLE = cudaHostAllocPortable; + const MAPPED = cudaHostAllocMapped; + const WRITE_COMBINED = cudaHostAllocWriteCombined; } } @@ -123,22 +116,17 @@ pub struct HostAllocation(AllocationData); impl HostAllocation { pub fn alloc(length: usize, flags: CudaHostAllocFlags) -> CudaResult { let layout = Layout::array::(length).unwrap(); - let mut ptr = MaybeUninit::<*mut c_void>::uninit(); + let mut ptr = MaybeUninit::uninit(); unsafe { cudaHostAlloc(ptr.as_mut_ptr(), layout.size(), flags.bits()) .wrap_maybe_uninit(ptr) - .map(|ptr| { - Self(AllocationData { - ptr: ptr as *mut T, - len: length, - }) - }) + .map(|ptr| Self(AllocationData::new_unchecked(ptr as _, length))) } } pub fn free(self) -> CudaResult<()> { unsafe { - let ptr = self.0.ptr as *mut c_void; + let ptr = self.0.ptr.as_ptr() as _; mem::forget(self); cudaFreeHost(ptr).wrap() } @@ -147,11 +135,11 @@ impl HostAllocation { /// # Safety /// /// The caller must ensure that the inputs are valid. - pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self { - Self(AllocationData { ptr, len }) + pub unsafe fn from_raw_parts(ptr: NonNull, len: usize) -> Self { + Self(AllocationData::new(ptr, len)) } - pub fn into_raw_parts(self) -> (*mut T, usize) { + pub fn into_raw_parts(self) -> (NonNull, usize) { let result = (self.0.ptr, self.0.len); mem::forget(self); result @@ -160,7 +148,12 @@ impl HostAllocation { impl Drop for HostAllocation { fn drop(&mut self) { - unsafe { cudaFreeHost(self.0.ptr as *mut c_void).eprint_error_and_backtrace() }; + let ptr = self.0.ptr.as_ptr(); + let len = self.0.len; + unsafe { + std::ptr::drop_in_place(std::slice::from_raw_parts_mut(ptr, len)); + cudaFreeHost(ptr as _).eprint_error_and_backtrace() + }; } } @@ -192,11 +185,11 @@ impl AsMut<[T]> for HostAllocation { bitflags! { #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct CudaHostRegisterFlags: u32 { - const DEFAULT = era_cudart_sys::cudaHostRegisterDefault; - const PORTABLE = era_cudart_sys::cudaHostRegisterPortable; - const MAPPED = era_cudart_sys::cudaHostRegisterMapped; - const IO_MEMORY = era_cudart_sys::cudaHostRegisterIoMemory; - const READ_ONLY = era_cudart_sys::cudaHostRegisterReadOnly; + const DEFAULT = cudaHostRegisterDefault; + const PORTABLE = cudaHostRegisterPortable; + const MAPPED = cudaHostRegisterMapped; + const IO_MEMORY = cudaHostRegisterIoMemory; + const READ_ONLY = cudaHostRegisterReadOnly; } } @@ -215,25 +208,19 @@ impl<'a, T> HostRegistration<'a, T> { let length = slice.len(); let layout = Layout::array::(length).unwrap(); unsafe { - cudaHostRegister( - slice.as_c_void_ptr() as *mut c_void, - layout.size(), - flags.bits(), - ) - .wrap_value(Self(slice)) + cudaHostRegister(slice.as_c_void_ptr() as _, layout.size(), flags.bits()) + .wrap_value(Self(slice)) } } pub fn unregister(self) -> CudaResult<()> { - unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).wrap() } + unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as _).wrap() } } } impl Drop for HostRegistration<'_, T> { fn drop(&mut self) { - unsafe { - cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).eprint_error_and_backtrace() - }; + unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as _).eprint_error_and_backtrace() }; } } @@ -388,8 +375,8 @@ pub fn memory_set_async( } pub fn memory_get_info() -> CudaResult<(usize, usize)> { - let mut free = MaybeUninit::::uninit(); - let mut total = MaybeUninit::::uninit(); + let mut free = MaybeUninit::uninit(); + let mut total = MaybeUninit::uninit(); unsafe { let error = cudaMemGetInfo(free.as_mut_ptr(), total.as_mut_ptr()); if error == CudaError::Success { @@ -400,17 +387,6 @@ pub fn memory_get_info() -> CudaResult<(usize, usize)> { } } -#[derive(Copy, Clone, Default, Debug, PartialEq, Eq)] -pub struct HostAllocator { - flags: CudaHostAllocFlags, -} - -impl HostAllocator { - pub fn new(flags: CudaHostAllocFlags) -> Self { - Self { flags } - } -} - #[cfg(test)] mod tests { use serial_test::serial; diff --git a/crates/cudart/src/memory_pools.rs b/crates/cudart/src/memory_pools.rs index 3bd207b..52b4c37 100644 --- a/crates/cudart/src/memory_pools.rs +++ b/crates/cudart/src/memory_pools.rs @@ -1,13 +1,12 @@ // Stream Ordered Memory Allocator // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html +use era_cudart_sys::*; use std::alloc::Layout; use std::mem; use std::mem::MaybeUninit; use std::ops::{Deref, DerefMut}; -use std::os::raw::c_void; - -use era_cudart_sys::*; +use std::ptr::NonNull; use crate::result::{CudaResult, CudaResultWrap}; use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice}; @@ -55,7 +54,7 @@ impl CudaMemPool { } pub fn get_access(&self, location: CudaMemLocation) -> CudaResult { - let mut result = MaybeUninit::::uninit(); + let mut result = MaybeUninit::uninit(); unsafe { cudaMemPoolGetAccess( result.as_mut_ptr(), @@ -67,12 +66,12 @@ impl CudaMemPool { } pub fn get_attribute_value, U>(&self, attribute: T) -> CudaResult { - let mut value = MaybeUninit::::uninit(); + let mut value = MaybeUninit::uninit(); unsafe { cudaMemPoolGetAttribute( self.handle, mem::transmute::(attribute.into()), - value.as_mut_ptr() as *mut c_void, + value.as_mut_ptr() as _, ) .wrap_maybe_uninit(value) } @@ -87,7 +86,7 @@ impl CudaMemPool { cudaMemPoolSetAttribute( self.handle, mem::transmute::(attribute.into()), - &value as *const _ as *mut c_void, + &value as *const _ as _, ) .wrap() } @@ -156,7 +155,7 @@ impl CudaOwnedMemPool { } pub fn create(properties: &CudaMemPoolProperties) -> CudaResult { - let mut handle = MaybeUninit::::uninit(); + let mut handle = MaybeUninit::uninit(); unsafe { cudaMemPoolCreate(handle.as_mut_ptr(), properties) .wrap_maybe_uninit(handle) @@ -207,15 +206,12 @@ pub struct DevicePoolAllocation<'a, T> { impl<'a, T> DevicePoolAllocation<'a, T> { pub fn alloc_async(length: usize, stream: &'a CudaStream) -> CudaResult { let layout = Layout::array::(length).unwrap(); - let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit(); + let mut dev_ptr = MaybeUninit::uninit(); unsafe { cudaMallocAsync(dev_ptr.as_mut_ptr(), layout.size(), stream.into()) .wrap_maybe_uninit(dev_ptr) .map(|ptr| Self { - data: AllocationData { - ptr: ptr as *mut T, - len: length, - }, + data: AllocationData::new_unchecked(ptr as _, length), stream, }) } @@ -227,7 +223,7 @@ impl<'a, T> DevicePoolAllocation<'a, T> { stream: &'a CudaStream, ) -> CudaResult { let layout = Layout::array::(length).unwrap(); - let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit(); + let mut dev_ptr = MaybeUninit::uninit(); unsafe { cudaMallocFromPoolAsync( dev_ptr.as_mut_ptr(), @@ -237,10 +233,7 @@ impl<'a, T> DevicePoolAllocation<'a, T> { ) .wrap_maybe_uninit(dev_ptr) .map(|ptr| Self { - data: AllocationData { - ptr: ptr as *mut T, - len: length, - }, + data: AllocationData::new_unchecked(ptr as _, length), stream, }) } @@ -248,17 +241,14 @@ impl<'a, T> DevicePoolAllocation<'a, T> { pub fn free_async(self, stream: &CudaStream) -> CudaResult<()> { unsafe { - let ptr = self.as_c_void_ptr() as *mut c_void; + let ptr = self.as_c_void_ptr() as _; mem::forget(self); cudaFreeAsync(ptr, stream.into()).wrap() } } pub fn swap_stream(self, stream: &CudaStream) -> DevicePoolAllocation { - let data = AllocationData { - ptr: self.data.ptr, - len: self.data.len, - }; + let data = AllocationData::new(self.data.ptr, self.data.len); mem::forget(self); DevicePoolAllocation { data, stream } } @@ -266,14 +256,14 @@ impl<'a, T> DevicePoolAllocation<'a, T> { /// # Safety /// /// The caller must ensure that the inputs are valid. - pub unsafe fn from_raw_parts(ptr: *mut T, len: usize, stream: &'a CudaStream) -> Self { + pub unsafe fn from_raw_parts(ptr: NonNull, len: usize, stream: &'a CudaStream) -> Self { Self { - data: AllocationData { ptr, len }, + data: AllocationData::new(ptr, len), stream, } } - pub fn into_raw_parts(self) -> (*mut T, usize, &'a CudaStream) { + pub fn into_raw_parts(self) -> (NonNull, usize, &'a CudaStream) { let result = (self.data.ptr, self.data.len, self.stream); mem::forget(self); result @@ -509,7 +499,7 @@ mod tests { let stream = CudaStream::create().unwrap(); let allocation = DevicePoolAllocation::::alloc_from_pool_async(LENGTH, &pool, &stream).unwrap(); - let size = mem::size_of::() * LENGTH; + let size = size_of::() * LENGTH; let used = pool .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent) .unwrap() as usize; diff --git a/crates/cudart/src/slice/allocation_data.rs b/crates/cudart/src/slice/allocation_data.rs index 4b11763..18a8eab 100644 --- a/crates/cudart/src/slice/allocation_data.rs +++ b/crates/cudart/src/slice/allocation_data.rs @@ -1,11 +1,32 @@ +use std::marker::PhantomData; +use std::ptr::NonNull; use std::slice; use crate::slice::{CudaSlice, CudaSliceMut}; #[derive(Debug)] pub(crate) struct AllocationData { - pub ptr: *mut T, + pub ptr: NonNull, pub len: usize, + _owns_t: PhantomData, +} + +impl AllocationData { + pub fn new(ptr: NonNull, len: usize) -> Self { + Self { + ptr, + len, + _owns_t: PhantomData, + } + } + + pub unsafe fn new_unchecked(ptr: *mut T, len: usize) -> Self { + Self { + ptr: NonNull::new_unchecked(ptr), + len, + _owns_t: PhantomData, + } + } } unsafe impl Send for AllocationData where Vec: Send {} @@ -14,12 +35,12 @@ unsafe impl Sync for AllocationData where Vec: Sync {} impl CudaSlice for AllocationData { unsafe fn as_slice(&self) -> &[T] { - slice::from_raw_parts(self.ptr, self.len) + slice::from_raw_parts(self.ptr.as_ptr(), self.len) } } impl CudaSliceMut for AllocationData { unsafe fn as_mut_slice(&mut self) -> &mut [T] { - slice::from_raw_parts_mut(self.ptr, self.len) + slice::from_raw_parts_mut(self.ptr.as_ptr(), self.len) } } diff --git a/crates/cudart/src/stream.rs b/crates/cudart/src/stream.rs index be8f176..5beb7b2 100644 --- a/crates/cudart/src/stream.rs +++ b/crates/cudart/src/stream.rs @@ -21,8 +21,8 @@ pub struct CudaStream { bitflags! { #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct CudaStreamCreateFlags: u32 { - const DEFAULT = era_cudart_sys::cudaStreamDefault; - const NON_BLOCKING = era_cudart_sys::cudaStreamNonBlocking; + const DEFAULT = cudaStreamDefault; + const NON_BLOCKING = cudaStreamNonBlocking; } } @@ -35,8 +35,8 @@ impl Default for CudaStreamCreateFlags { bitflags! { #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct CudaStreamWaitEventFlags: u32 { - const DEFAULT = era_cudart_sys::cudaEventWaitDefault; - const WAIT_EXTERNAL = era_cudart_sys::cudaEventWaitExternal; + const DEFAULT = cudaEventWaitDefault; + const WAIT_EXTERNAL = cudaEventWaitExternal; } } @@ -47,7 +47,9 @@ impl Default for CudaStreamWaitEventFlags { } impl CudaStream { - fn from_handle(handle: cudaStream_t) -> Self { + pub const DEFAULT: CudaStream = Self::from_handle(null_mut()); + + const fn from_handle(handle: cudaStream_t) -> Self { Self { handle } } @@ -113,7 +115,7 @@ impl CudaStream { impl Default for CudaStream { fn default() -> Self { - Self { handle: null_mut() } + Self::DEFAULT } } @@ -127,6 +129,8 @@ impl Drop for CudaStream { } } +unsafe impl Sync for CudaStream {} + impl From<&CudaStream> for cudaStream_t { fn from(stream: &CudaStream) -> Self { stream.handle