Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 37 additions & 61 deletions crates/cudart/src/memory.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
// memory management
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html

use bitflags::bitflags;
use era_cudart_sys::*;
use std::alloc::Layout;
use std::mem::{self, MaybeUninit};
use std::ops::{Deref, DerefMut};
use std::os::raw::c_void;

use bitflags::bitflags;

use era_cudart_sys::*;
use std::ptr::NonNull;

use crate::result::{CudaResult, CudaResultWrap};
use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice};
Expand All @@ -21,22 +19,17 @@ pub struct DeviceAllocation<T>(AllocationData<T>);
impl<T> DeviceAllocation<T> {
pub fn alloc(length: usize) -> CudaResult<Self> {
let layout = Layout::array::<T>(length).unwrap();
let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
let mut dev_ptr = MaybeUninit::uninit();
unsafe {
cudaMalloc(dev_ptr.as_mut_ptr(), layout.size())
.wrap_maybe_uninit(dev_ptr)
.map(|ptr| {
Self(AllocationData {
ptr: ptr as *mut T,
len: length,
})
})
.map(|ptr| Self(AllocationData::new_unchecked(ptr as _, length)))
}
}

pub fn free(self) -> CudaResult<()> {
unsafe {
let ptr = self.0.ptr as *mut c_void;
let ptr = self.0.ptr.as_ptr() as _;
mem::forget(self);
cudaFree(ptr).wrap()
}
Expand All @@ -45,11 +38,11 @@ impl<T> DeviceAllocation<T> {
/// # Safety
///
/// The caller must ensure that the inputs are valid.
pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self {
Self(AllocationData { ptr, len })
pub unsafe fn from_raw_parts(ptr: NonNull<T>, len: usize) -> Self {
Self(AllocationData::new(ptr, len))
}

pub fn into_raw_parts(self) -> (*mut T, usize) {
pub fn into_raw_parts(self) -> (NonNull<T>, usize) {
let result = (self.0.ptr, self.0.len);
mem::forget(self);
result
Expand All @@ -58,7 +51,7 @@ impl<T> DeviceAllocation<T> {

impl<T> Drop for DeviceAllocation<T> {
fn drop(&mut self) {
unsafe { cudaFree(self.as_mut_c_void_ptr()).eprint_error_and_backtrace() };
unsafe { cudaFree(self.0.ptr.as_ptr() as _).eprint_error_and_backtrace() };
}
}

Expand Down Expand Up @@ -103,10 +96,10 @@ impl<T> CudaSliceMut<T> for DeviceAllocation<T> {
bitflags! {
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct CudaHostAllocFlags: u32 {
const DEFAULT = era_cudart_sys::cudaHostAllocDefault;
const PORTABLE = era_cudart_sys::cudaHostAllocPortable;
const MAPPED = era_cudart_sys::cudaHostAllocMapped;
const WRITE_COMBINED = era_cudart_sys::cudaHostAllocWriteCombined;
const DEFAULT = cudaHostAllocDefault;
const PORTABLE = cudaHostAllocPortable;
const MAPPED = cudaHostAllocMapped;
const WRITE_COMBINED = cudaHostAllocWriteCombined;
}
}

Expand All @@ -123,22 +116,17 @@ pub struct HostAllocation<T>(AllocationData<T>);
impl<T> HostAllocation<T> {
pub fn alloc(length: usize, flags: CudaHostAllocFlags) -> CudaResult<Self> {
let layout = Layout::array::<T>(length).unwrap();
let mut ptr = MaybeUninit::<*mut c_void>::uninit();
let mut ptr = MaybeUninit::uninit();
unsafe {
cudaHostAlloc(ptr.as_mut_ptr(), layout.size(), flags.bits())
.wrap_maybe_uninit(ptr)
.map(|ptr| {
Self(AllocationData {
ptr: ptr as *mut T,
len: length,
})
})
.map(|ptr| Self(AllocationData::new_unchecked(ptr as _, length)))
}
}

pub fn free(self) -> CudaResult<()> {
unsafe {
let ptr = self.0.ptr as *mut c_void;
let ptr = self.0.ptr.as_ptr() as _;
mem::forget(self);
cudaFreeHost(ptr).wrap()
}
Expand All @@ -147,11 +135,11 @@ impl<T> HostAllocation<T> {
/// # Safety
///
/// The caller must ensure that the inputs are valid.
pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self {
Self(AllocationData { ptr, len })
pub unsafe fn from_raw_parts(ptr: NonNull<T>, len: usize) -> Self {
Self(AllocationData::new(ptr, len))
}

pub fn into_raw_parts(self) -> (*mut T, usize) {
pub fn into_raw_parts(self) -> (NonNull<T>, usize) {
let result = (self.0.ptr, self.0.len);
mem::forget(self);
result
Expand All @@ -160,7 +148,12 @@ impl<T> HostAllocation<T> {

impl<T> Drop for HostAllocation<T> {
fn drop(&mut self) {
unsafe { cudaFreeHost(self.0.ptr as *mut c_void).eprint_error_and_backtrace() };
let ptr = self.0.ptr.as_ptr();
let len = self.0.len;
unsafe {
std::ptr::drop_in_place(std::slice::from_raw_parts_mut(ptr, len));
cudaFreeHost(ptr as _).eprint_error_and_backtrace()
};
}
}

Expand Down Expand Up @@ -192,11 +185,11 @@ impl<T> AsMut<[T]> for HostAllocation<T> {
bitflags! {
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct CudaHostRegisterFlags: u32 {
const DEFAULT = era_cudart_sys::cudaHostRegisterDefault;
const PORTABLE = era_cudart_sys::cudaHostRegisterPortable;
const MAPPED = era_cudart_sys::cudaHostRegisterMapped;
const IO_MEMORY = era_cudart_sys::cudaHostRegisterIoMemory;
const READ_ONLY = era_cudart_sys::cudaHostRegisterReadOnly;
const DEFAULT = cudaHostRegisterDefault;
const PORTABLE = cudaHostRegisterPortable;
const MAPPED = cudaHostRegisterMapped;
const IO_MEMORY = cudaHostRegisterIoMemory;
const READ_ONLY = cudaHostRegisterReadOnly;
}
}

Expand All @@ -215,25 +208,19 @@ impl<'a, T> HostRegistration<'a, T> {
let length = slice.len();
let layout = Layout::array::<T>(length).unwrap();
unsafe {
cudaHostRegister(
slice.as_c_void_ptr() as *mut c_void,
layout.size(),
flags.bits(),
)
.wrap_value(Self(slice))
cudaHostRegister(slice.as_c_void_ptr() as _, layout.size(), flags.bits())
.wrap_value(Self(slice))
}
}

pub fn unregister(self) -> CudaResult<()> {
unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).wrap() }
unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as _).wrap() }
}
}

impl<T> Drop for HostRegistration<'_, T> {
fn drop(&mut self) {
unsafe {
cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).eprint_error_and_backtrace()
};
unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as _).eprint_error_and_backtrace() };
}
}

Expand Down Expand Up @@ -388,8 +375,8 @@ pub fn memory_set_async(
}

pub fn memory_get_info() -> CudaResult<(usize, usize)> {
let mut free = MaybeUninit::<usize>::uninit();
let mut total = MaybeUninit::<usize>::uninit();
let mut free = MaybeUninit::uninit();
let mut total = MaybeUninit::uninit();
unsafe {
let error = cudaMemGetInfo(free.as_mut_ptr(), total.as_mut_ptr());
if error == CudaError::Success {
Expand All @@ -400,17 +387,6 @@ pub fn memory_get_info() -> CudaResult<(usize, usize)> {
}
}

#[derive(Copy, Clone, Default, Debug, PartialEq, Eq)]
pub struct HostAllocator {
flags: CudaHostAllocFlags,
}

impl HostAllocator {
pub fn new(flags: CudaHostAllocFlags) -> Self {
Self { flags }
}
}

#[cfg(test)]
mod tests {
use serial_test::serial;
Expand Down
44 changes: 17 additions & 27 deletions crates/cudart/src/memory_pools.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
// Stream Ordered Memory Allocator
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html

use era_cudart_sys::*;
use std::alloc::Layout;
use std::mem;
use std::mem::MaybeUninit;
use std::ops::{Deref, DerefMut};
use std::os::raw::c_void;

use era_cudart_sys::*;
use std::ptr::NonNull;

use crate::result::{CudaResult, CudaResultWrap};
use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice};
Expand Down Expand Up @@ -55,7 +54,7 @@ impl CudaMemPool {
}

pub fn get_access(&self, location: CudaMemLocation) -> CudaResult<CudaMemAccessFlags> {
let mut result = MaybeUninit::<CudaMemAccessFlags>::uninit();
let mut result = MaybeUninit::uninit();
unsafe {
cudaMemPoolGetAccess(
result.as_mut_ptr(),
Expand All @@ -67,12 +66,12 @@ impl CudaMemPool {
}

pub fn get_attribute_value<T: Into<i32>, U>(&self, attribute: T) -> CudaResult<U> {
let mut value = MaybeUninit::<U>::uninit();
let mut value = MaybeUninit::uninit();
unsafe {
cudaMemPoolGetAttribute(
self.handle,
mem::transmute::<i32, CudaMemPoolAttribute>(attribute.into()),
value.as_mut_ptr() as *mut c_void,
value.as_mut_ptr() as _,
)
.wrap_maybe_uninit(value)
}
Expand All @@ -87,7 +86,7 @@ impl CudaMemPool {
cudaMemPoolSetAttribute(
self.handle,
mem::transmute::<i32, CudaMemPoolAttribute>(attribute.into()),
&value as *const _ as *mut c_void,
&value as *const _ as _,
)
.wrap()
}
Expand Down Expand Up @@ -156,7 +155,7 @@ impl CudaOwnedMemPool {
}

pub fn create(properties: &CudaMemPoolProperties) -> CudaResult<Self> {
let mut handle = MaybeUninit::<cudaMemPool_t>::uninit();
let mut handle = MaybeUninit::uninit();
unsafe {
cudaMemPoolCreate(handle.as_mut_ptr(), properties)
.wrap_maybe_uninit(handle)
Expand Down Expand Up @@ -207,15 +206,12 @@ pub struct DevicePoolAllocation<'a, T> {
impl<'a, T> DevicePoolAllocation<'a, T> {
pub fn alloc_async(length: usize, stream: &'a CudaStream) -> CudaResult<Self> {
let layout = Layout::array::<T>(length).unwrap();
let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
let mut dev_ptr = MaybeUninit::uninit();
unsafe {
cudaMallocAsync(dev_ptr.as_mut_ptr(), layout.size(), stream.into())
.wrap_maybe_uninit(dev_ptr)
.map(|ptr| Self {
data: AllocationData {
ptr: ptr as *mut T,
len: length,
},
data: AllocationData::new_unchecked(ptr as _, length),
stream,
})
}
Expand All @@ -227,7 +223,7 @@ impl<'a, T> DevicePoolAllocation<'a, T> {
stream: &'a CudaStream,
) -> CudaResult<Self> {
let layout = Layout::array::<T>(length).unwrap();
let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
let mut dev_ptr = MaybeUninit::uninit();
unsafe {
cudaMallocFromPoolAsync(
dev_ptr.as_mut_ptr(),
Expand All @@ -237,43 +233,37 @@ impl<'a, T> DevicePoolAllocation<'a, T> {
)
.wrap_maybe_uninit(dev_ptr)
.map(|ptr| Self {
data: AllocationData {
ptr: ptr as *mut T,
len: length,
},
data: AllocationData::new_unchecked(ptr as _, length),
stream,
})
}
}

pub fn free_async(self, stream: &CudaStream) -> CudaResult<()> {
unsafe {
let ptr = self.as_c_void_ptr() as *mut c_void;
let ptr = self.as_c_void_ptr() as _;
mem::forget(self);
cudaFreeAsync(ptr, stream.into()).wrap()
}
}

pub fn swap_stream(self, stream: &CudaStream) -> DevicePoolAllocation<T> {
let data = AllocationData {
ptr: self.data.ptr,
len: self.data.len,
};
let data = AllocationData::new(self.data.ptr, self.data.len);
mem::forget(self);
DevicePoolAllocation { data, stream }
}

/// # Safety
///
/// The caller must ensure that the inputs are valid.
pub unsafe fn from_raw_parts(ptr: *mut T, len: usize, stream: &'a CudaStream) -> Self {
pub unsafe fn from_raw_parts(ptr: NonNull<T>, len: usize, stream: &'a CudaStream) -> Self {
Self {
data: AllocationData { ptr, len },
data: AllocationData::new(ptr, len),
stream,
}
}

pub fn into_raw_parts(self) -> (*mut T, usize, &'a CudaStream) {
pub fn into_raw_parts(self) -> (NonNull<T>, usize, &'a CudaStream) {
let result = (self.data.ptr, self.data.len, self.stream);
mem::forget(self);
result
Expand Down Expand Up @@ -509,7 +499,7 @@ mod tests {
let stream = CudaStream::create().unwrap();
let allocation =
DevicePoolAllocation::<u32>::alloc_from_pool_async(LENGTH, &pool, &stream).unwrap();
let size = mem::size_of::<u32>() * LENGTH;
let size = size_of::<u32>() * LENGTH;
let used = pool
.get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent)
.unwrap() as usize;
Expand Down
Loading