diff --git a/crates/cudart/src/memory.rs b/crates/cudart/src/memory.rs
index 6e49257..83f4357 100644
--- a/crates/cudart/src/memory.rs
+++ b/crates/cudart/src/memory.rs
@@ -1,14 +1,12 @@
 // memory management
 // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
 
+use bitflags::bitflags;
+use era_cudart_sys::*;
 use std::alloc::Layout;
 use std::mem::{self, MaybeUninit};
 use std::ops::{Deref, DerefMut};
-use std::os::raw::c_void;
-
-use bitflags::bitflags;
-
-use era_cudart_sys::*;
+use std::ptr::NonNull;
 
 use crate::result::{CudaResult, CudaResultWrap};
 use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice};
@@ -21,22 +19,17 @@ pub struct DeviceAllocation<T>(AllocationData<T>);
 impl<T> DeviceAllocation<T> {
     pub fn alloc(length: usize) -> CudaResult<Self> {
         let layout = Layout::array::<T>(length).unwrap();
-        let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
+        let mut dev_ptr = MaybeUninit::uninit();
         unsafe {
             cudaMalloc(dev_ptr.as_mut_ptr(), layout.size())
                 .wrap_maybe_uninit(dev_ptr)
-                .map(|ptr| {
-                    Self(AllocationData {
-                        ptr: ptr as *mut T,
-                        len: length,
-                    })
-                })
+                .map(|ptr| Self(AllocationData::new_unchecked(ptr as _, length)))
         }
     }
 
     pub fn free(self) -> CudaResult<()> {
         unsafe {
-            let ptr = self.0.ptr as *mut c_void;
+            let ptr = self.0.ptr.as_ptr() as _;
             mem::forget(self);
             cudaFree(ptr).wrap()
         }
@@ -45,11 +38,11 @@ impl<T> DeviceAllocation<T> {
     /// # Safety
     ///
     /// The caller must ensure that the inputs are valid.
-    pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self {
-        Self(AllocationData { ptr, len })
+    pub unsafe fn from_raw_parts(ptr: NonNull<T>, len: usize) -> Self {
+        Self(AllocationData::new(ptr, len))
     }
 
-    pub fn into_raw_parts(self) -> (*mut T, usize) {
+    pub fn into_raw_parts(self) -> (NonNull<T>, usize) {
         let result = (self.0.ptr, self.0.len);
         mem::forget(self);
         result
@@ -58,7 +51,7 @@ impl<T> DeviceAllocation<T> {
 
 impl<T> Drop for DeviceAllocation<T> {
     fn drop(&mut self) {
-        unsafe { cudaFree(self.as_mut_c_void_ptr()).eprint_error_and_backtrace() };
+        unsafe { cudaFree(self.0.ptr.as_ptr() as _).eprint_error_and_backtrace() };
     }
 }
 
@@ -103,10 +96,10 @@ impl<T> CudaSliceMut<T> for DeviceAllocation<T> {
 bitflags! {
     #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
     pub struct CudaHostAllocFlags: u32 {
-        const DEFAULT = era_cudart_sys::cudaHostAllocDefault;
-        const PORTABLE = era_cudart_sys::cudaHostAllocPortable;
-        const MAPPED = era_cudart_sys::cudaHostAllocMapped;
-        const WRITE_COMBINED = era_cudart_sys::cudaHostAllocWriteCombined;
+        const DEFAULT = cudaHostAllocDefault;
+        const PORTABLE = cudaHostAllocPortable;
+        const MAPPED = cudaHostAllocMapped;
+        const WRITE_COMBINED = cudaHostAllocWriteCombined;
     }
 }
 
@@ -123,22 +116,17 @@ pub struct HostAllocation<T>(AllocationData<T>);
 impl<T> HostAllocation<T> {
     pub fn alloc(length: usize, flags: CudaHostAllocFlags) -> CudaResult<Self> {
         let layout = Layout::array::<T>(length).unwrap();
-        let mut ptr = MaybeUninit::<*mut c_void>::uninit();
+        let mut ptr = MaybeUninit::uninit();
         unsafe {
             cudaHostAlloc(ptr.as_mut_ptr(), layout.size(), flags.bits())
                 .wrap_maybe_uninit(ptr)
-                .map(|ptr| {
-                    Self(AllocationData {
-                        ptr: ptr as *mut T,
-                        len: length,
-                    })
-                })
+                .map(|ptr| Self(AllocationData::new_unchecked(ptr as _, length)))
         }
     }
 
     pub fn free(self) -> CudaResult<()> {
         unsafe {
-            let ptr = self.0.ptr as *mut c_void;
+            let ptr = self.0.ptr.as_ptr() as _;
             mem::forget(self);
             cudaFreeHost(ptr).wrap()
         }
@@ -147,11 +135,11 @@ impl<T> HostAllocation<T> {
     /// # Safety
     ///
     /// The caller must ensure that the inputs are valid.
-    pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self {
-        Self(AllocationData { ptr, len })
+    pub unsafe fn from_raw_parts(ptr: NonNull<T>, len: usize) -> Self {
+        Self(AllocationData::new(ptr, len))
     }
 
-    pub fn into_raw_parts(self) -> (*mut T, usize) {
+    pub fn into_raw_parts(self) -> (NonNull<T>, usize) {
         let result = (self.0.ptr, self.0.len);
         mem::forget(self);
         result
@@ -160,7 +148,12 @@ impl<T> HostAllocation<T> {
 
 impl<T> Drop for HostAllocation<T> {
     fn drop(&mut self) {
-        unsafe { cudaFreeHost(self.0.ptr as *mut c_void).eprint_error_and_backtrace() };
+        let ptr = self.0.ptr.as_ptr();
+        let len = self.0.len;
+        unsafe {
+            std::ptr::drop_in_place(std::slice::from_raw_parts_mut(ptr, len));
+            cudaFreeHost(ptr as _).eprint_error_and_backtrace()
+        };
     }
 }
 
@@ -192,11 +185,11 @@ impl<T> AsMut<[T]> for HostAllocation<T> {
 bitflags! {
     #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
     pub struct CudaHostRegisterFlags: u32 {
-        const DEFAULT = era_cudart_sys::cudaHostRegisterDefault;
-        const PORTABLE = era_cudart_sys::cudaHostRegisterPortable;
-        const MAPPED = era_cudart_sys::cudaHostRegisterMapped;
-        const IO_MEMORY = era_cudart_sys::cudaHostRegisterIoMemory;
-        const READ_ONLY = era_cudart_sys::cudaHostRegisterReadOnly;
+        const DEFAULT = cudaHostRegisterDefault;
+        const PORTABLE = cudaHostRegisterPortable;
+        const MAPPED = cudaHostRegisterMapped;
+        const IO_MEMORY = cudaHostRegisterIoMemory;
+        const READ_ONLY = cudaHostRegisterReadOnly;
     }
 }
 
@@ -215,25 +208,19 @@ impl<'a, T> HostRegistration<'a, T> {
         let length = slice.len();
         let layout = Layout::array::<T>(length).unwrap();
         unsafe {
-            cudaHostRegister(
-                slice.as_c_void_ptr() as *mut c_void,
-                layout.size(),
-                flags.bits(),
-            )
-            .wrap_value(Self(slice))
+            cudaHostRegister(slice.as_c_void_ptr() as _, layout.size(), flags.bits())
+                .wrap_value(Self(slice))
         }
     }
 
     pub fn unregister(self) -> CudaResult<()> {
-        unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).wrap() }
+        unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as _).wrap() }
     }
 }
 
 impl<T> Drop for HostRegistration<'_, T> {
     fn drop(&mut self) {
-        unsafe {
-            cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).eprint_error_and_backtrace()
-        };
+        unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as _).eprint_error_and_backtrace() };
     }
 }
 
@@ -388,8 +375,8 @@ pub fn memory_set_async(
 }
 
 pub fn memory_get_info() -> CudaResult<(usize, usize)> {
-    let mut free = MaybeUninit::<usize>::uninit();
-    let mut total = MaybeUninit::<usize>::uninit();
+    let mut free = MaybeUninit::uninit();
+    let mut total = MaybeUninit::uninit();
     unsafe {
         let error = cudaMemGetInfo(free.as_mut_ptr(), total.as_mut_ptr());
         if error == CudaError::Success {
@@ -400,17 +387,6 @@ pub fn memory_get_info() -> CudaResult<(usize, usize)> {
     }
 }
 
-#[derive(Copy, Clone, Default, Debug, PartialEq, Eq)]
-pub struct HostAllocator {
-    flags: CudaHostAllocFlags,
-}
-
-impl HostAllocator {
-    pub fn new(flags: CudaHostAllocFlags) -> Self {
-        Self { flags }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use serial_test::serial;
diff --git a/crates/cudart/src/memory_pools.rs b/crates/cudart/src/memory_pools.rs
index 3bd207b..52b4c37 100644
--- a/crates/cudart/src/memory_pools.rs
+++ b/crates/cudart/src/memory_pools.rs
@@ -1,13 +1,12 @@
 // Stream Ordered Memory Allocator
 // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
 
+use era_cudart_sys::*;
 use std::alloc::Layout;
 use std::mem;
 use std::mem::MaybeUninit;
 use std::ops::{Deref, DerefMut};
-use std::os::raw::c_void;
-
-use era_cudart_sys::*;
+use std::ptr::NonNull;
 
 use crate::result::{CudaResult, CudaResultWrap};
 use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice};
@@ -55,7 +54,7 @@ impl CudaMemPool {
     }
 
     pub fn get_access(&self, location: CudaMemLocation) -> CudaResult<CudaMemAccessFlags> {
-        let mut result = MaybeUninit::<CudaMemAccessFlags>::uninit();
+        let mut result = MaybeUninit::uninit();
         unsafe {
             cudaMemPoolGetAccess(
                 result.as_mut_ptr(),
@@ -67,12 +66,12 @@ impl CudaMemPool {
     }
 
     pub fn get_attribute_value<T: Into<i32>, U>(&self, attribute: T) -> CudaResult<U> {
-        let mut value = MaybeUninit::<U>::uninit();
+        let mut value = MaybeUninit::uninit();
         unsafe {
             cudaMemPoolGetAttribute(
                 self.handle,
                 mem::transmute::<i32, CudaMemPoolAttribute>(attribute.into()),
-                value.as_mut_ptr() as *mut c_void,
+                value.as_mut_ptr() as _,
             )
             .wrap_maybe_uninit(value)
         }
@@ -87,7 +86,7 @@ impl CudaMemPool {
             cudaMemPoolSetAttribute(
                 self.handle,
                 mem::transmute::<i32, CudaMemPoolAttribute>(attribute.into()),
-                &value as *const _ as *mut c_void,
+                &value as *const _ as _,
             )
             .wrap()
         }
@@ -156,7 +155,7 @@ impl CudaOwnedMemPool {
     }
 
     pub fn create(properties: &CudaMemPoolProperties) -> CudaResult<Self> {
-        let mut handle = MaybeUninit::<cudaMemPool_t>::uninit();
+        let mut handle = MaybeUninit::uninit();
         unsafe {
             cudaMemPoolCreate(handle.as_mut_ptr(), properties)
                 .wrap_maybe_uninit(handle)
@@ -207,15 +206,12 @@ pub struct DevicePoolAllocation<'a, T> {
 impl<'a, T> DevicePoolAllocation<'a, T> {
     pub fn alloc_async(length: usize, stream: &'a CudaStream) -> CudaResult<Self> {
         let layout = Layout::array::<T>(length).unwrap();
-        let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
+        let mut dev_ptr = MaybeUninit::uninit();
         unsafe {
             cudaMallocAsync(dev_ptr.as_mut_ptr(), layout.size(), stream.into())
                 .wrap_maybe_uninit(dev_ptr)
                 .map(|ptr| Self {
-                    data: AllocationData {
-                        ptr: ptr as *mut T,
-                        len: length,
-                    },
+                    data: AllocationData::new_unchecked(ptr as _, length),
                     stream,
                 })
         }
@@ -227,7 +223,7 @@ impl<'a, T> DevicePoolAllocation<'a, T> {
         stream: &'a CudaStream,
     ) -> CudaResult<Self> {
         let layout = Layout::array::<T>(length).unwrap();
-        let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
+        let mut dev_ptr = MaybeUninit::uninit();
         unsafe {
             cudaMallocFromPoolAsync(
                 dev_ptr.as_mut_ptr(),
@@ -237,10 +233,7 @@ impl<'a, T> DevicePoolAllocation<'a, T> {
             )
             .wrap_maybe_uninit(dev_ptr)
             .map(|ptr| Self {
-                data: AllocationData {
-                    ptr: ptr as *mut T,
-                    len: length,
-                },
+                data: AllocationData::new_unchecked(ptr as _, length),
                 stream,
             })
         }
@@ -248,17 +241,14 @@ impl<'a, T> DevicePoolAllocation<'a, T> {
 
     pub fn free_async(self, stream: &CudaStream) -> CudaResult<()> {
         unsafe {
-            let ptr = self.as_c_void_ptr() as *mut c_void;
+            let ptr = self.as_c_void_ptr() as _;
             mem::forget(self);
             cudaFreeAsync(ptr, stream.into()).wrap()
         }
     }
 
     pub fn swap_stream(self, stream: &CudaStream) -> DevicePoolAllocation<T> {
-        let data = AllocationData {
-            ptr: self.data.ptr,
-            len: self.data.len,
-        };
+        let data = AllocationData::new(self.data.ptr, self.data.len);
         mem::forget(self);
         DevicePoolAllocation { data, stream }
     }
@@ -266,14 +256,14 @@ impl<'a, T> DevicePoolAllocation<'a, T> {
     /// # Safety
     ///
     /// The caller must ensure that the inputs are valid.
-    pub unsafe fn from_raw_parts(ptr: *mut T, len: usize, stream: &'a CudaStream) -> Self {
+    pub unsafe fn from_raw_parts(ptr: NonNull<T>, len: usize, stream: &'a CudaStream) -> Self {
         Self {
-            data: AllocationData { ptr, len },
+            data: AllocationData::new(ptr, len),
             stream,
         }
     }
 
-    pub fn into_raw_parts(self) -> (*mut T, usize, &'a CudaStream) {
+    pub fn into_raw_parts(self) -> (NonNull<T>, usize, &'a CudaStream) {
         let result = (self.data.ptr, self.data.len, self.stream);
         mem::forget(self);
         result
@@ -509,7 +499,7 @@ mod tests {
         let stream = CudaStream::create().unwrap();
         let allocation =
             DevicePoolAllocation::<u32>::alloc_from_pool_async(LENGTH, &pool, &stream).unwrap();
-        let size = mem::size_of::<u32>() * LENGTH;
+        let size = size_of::<u32>() * LENGTH;
         let used = pool
             .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent)
             .unwrap() as usize;
diff --git a/crates/cudart/src/slice/allocation_data.rs b/crates/cudart/src/slice/allocation_data.rs
index 4b11763..18a8eab 100644
--- a/crates/cudart/src/slice/allocation_data.rs
+++ b/crates/cudart/src/slice/allocation_data.rs
@@ -1,11 +1,32 @@
+use std::marker::PhantomData;
+use std::ptr::NonNull;
 use std::slice;
 
 use crate::slice::{CudaSlice, CudaSliceMut};
 
 #[derive(Debug)]
 pub(crate) struct AllocationData<T> {
-    pub ptr: *mut T,
+    pub ptr: NonNull<T>,
     pub len: usize,
+    _owns_t: PhantomData<T>,
+}
+
+impl<T> AllocationData<T> {
+    pub fn new(ptr: NonNull<T>, len: usize) -> Self {
+        Self {
+            ptr,
+            len,
+            _owns_t: PhantomData,
+        }
+    }
+
+    pub unsafe fn new_unchecked(ptr: *mut T, len: usize) -> Self {
+        Self {
+            ptr: NonNull::new_unchecked(ptr),
+            len,
+            _owns_t: PhantomData,
+        }
+    }
 }
 
 unsafe impl<T> Send for AllocationData<T> where Vec<T>: Send {}
@@ -14,12 +35,12 @@ unsafe impl<T> Sync for AllocationData<T> where Vec<T>: Sync {}
 
 impl<T> CudaSlice<T> for AllocationData<T> {
     unsafe fn as_slice(&self) -> &[T] {
-        slice::from_raw_parts(self.ptr, self.len)
+        slice::from_raw_parts(self.ptr.as_ptr(), self.len)
     }
 }
 
 impl<T> CudaSliceMut<T> for AllocationData<T> {
     unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        slice::from_raw_parts_mut(self.ptr, self.len)
+        slice::from_raw_parts_mut(self.ptr.as_ptr(), self.len)
     }
 }
diff --git a/crates/cudart/src/stream.rs b/crates/cudart/src/stream.rs
index be8f176..5beb7b2 100644
--- a/crates/cudart/src/stream.rs
+++ b/crates/cudart/src/stream.rs
@@ -21,8 +21,8 @@ pub struct CudaStream {
 bitflags! {
     #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
     pub struct CudaStreamCreateFlags: u32 {
-        const DEFAULT = era_cudart_sys::cudaStreamDefault;
-        const NON_BLOCKING = era_cudart_sys::cudaStreamNonBlocking;
+        const DEFAULT = cudaStreamDefault;
+        const NON_BLOCKING = cudaStreamNonBlocking;
     }
 }
 
@@ -35,8 +35,8 @@ impl Default for CudaStreamCreateFlags {
 bitflags! {
     #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
     pub struct CudaStreamWaitEventFlags: u32 {
-        const DEFAULT = era_cudart_sys::cudaEventWaitDefault;
-        const WAIT_EXTERNAL = era_cudart_sys::cudaEventWaitExternal;
+        const DEFAULT = cudaEventWaitDefault;
+        const WAIT_EXTERNAL = cudaEventWaitExternal;
     }
 }
 
@@ -47,7 +47,9 @@ impl Default for CudaStreamWaitEventFlags {
 }
 
 impl CudaStream {
-    fn from_handle(handle: cudaStream_t) -> Self {
+    pub const DEFAULT: CudaStream = Self::from_handle(null_mut());
+
+    const fn from_handle(handle: cudaStream_t) -> Self {
         Self { handle }
     }
 
@@ -113,7 +115,7 @@ impl CudaStream {
 
 impl Default for CudaStream {
     fn default() -> Self {
-        Self { handle: null_mut() }
+        Self::DEFAULT
     }
 }
 
@@ -127,6 +129,8 @@ impl Drop for CudaStream {
     }
 }
 
+unsafe impl Sync for CudaStream {}
+
 impl From<&CudaStream> for cudaStream_t {
     fn from(stream: &CudaStream) -> Self {
         stream.handle