
Commit 72abda7

Feat: add unified memory advising APIs.
1 parent 39edde4 commit 72abda7

4 files changed: 209 additions & 12 deletions

crates/cust/CHANGELOG.md

Lines changed: 11 additions & 2 deletions

```diff
@@ -6,12 +6,21 @@ Notable changes to this project will be documented in this file.
 
 ## 0.2.0 - 11/26/21
 
+- Added `Device::as_raw`.
+- Added `MemoryAdvise` for unified memory advising.
+- Added `MemoryAdvise::prefetch_to_host` and `MemoryAdvise::prefetch_to_device` for telling CUDA to explicitly fetch unified memory somewhere.
+- Added `MemoryAdvise::advise_read_mostly`.
+- Added `MemoryAdvise::preferred_location` and `MemoryAdvise::unset_preferred_location`.
+  Note that the advising APIs are only present on high-end GPUs such as V100s.
+
+- Changed `GpuBox::as_device_ptr` and `GpuBuffer::as_device_ptr` to take `&self` instead of `&mut self`.
 - Renamed `DBuffer` -> `DeviceBuffer`. This is how it was in rustacuda; it was changed
   at some point, but we now think that may have been the wrong choice.
 - Renamed `DBox` -> `DeviceBox`.
 - Renamed `DSlice` -> `DeviceSlice`.
-- Fixed some doctests that were using old APIs.
+
 - Removed `GpuBox::as_device_ptr_mut` and `GpuBuffer::as_device_ptr_mut`.
-- Changed `GpuBox::as_device_ptr` and `GpuBuffer::as_device_ptr` to take `&self` instead of `&mut self`.
 - Removed accidentally added `vek` default feature.
 - The `vek` feature now uses `default-features = false`; this also means `Rgb` and `Rgba` no longer implement `DeviceCopy`.
+
+- Fixed some doctests that were using old APIs.
```
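To make the renames and the new `&self` receivers concrete, here is a minimal sketch (not part of the commit) of the 0.2.0 surface described above; it assumes `DeviceBuffer`, `DeviceBox`, and the `GpuBox`/`GpuBuffer` traits are all reachable through `cust::memory`:

```rust
use cust::memory::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let _context = cust::quick_init()?;
    // `DBuffer` and `DBox` are `DeviceBuffer` and `DeviceBox` again.
    let buf = DeviceBuffer::from_slice(&[1u32, 2, 3])?;
    let boxed = DeviceBox::new(&4u32)?;
    // `as_device_ptr` now takes `&self`, so immutable bindings suffice and
    // the removed `as_device_ptr_mut` variants are no longer needed.
    let _buf_ptr = buf.as_device_ptr();
    let _box_ptr = boxed.as_device_ptr();
    Ok(())
}
```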

crates/cust/src/context.rs

Lines changed: 2 additions & 6 deletions

```diff
@@ -256,12 +256,8 @@ impl Context {
         // lifetime guarantees so we create-and-push, then pop, then the programmer has to
         // push again.
         let mut ctx: CUcontext = ptr::null_mut();
-        cuda::cuCtxCreate_v2(
-            &mut ctx as *mut CUcontext,
-            flags.bits(),
-            device.into_inner(),
-        )
-        .to_result()?;
+        cuda::cuCtxCreate_v2(&mut ctx as *mut CUcontext, flags.bits(), device.as_raw())
+            .to_result()?;
         Ok(Context { inner: ctx })
     }
 }
```
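User code never reaches `cuCtxCreate_v2` directly; a minimal sketch of the usual entry point, using the `quick_init` helper that this commit's own doctests rely on:

```rust
// Sketch only: `quick_init` initializes CUDA, grabs a device, and creates and
// pushes a context for it, which is where the call shown above is made.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let _context = cust::quick_init()?;
    // Keeping `_context` alive keeps the context current for later CUDA calls.
    Ok(())
}
```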

crates/cust/src/device.rs

Lines changed: 3 additions & 1 deletion

```diff
@@ -356,7 +356,9 @@ impl Device {
         }
     }
 
-    pub(crate) fn into_inner(self) -> CUdevice {
+    /// Returns the raw handle to this device without transferring ownership;
+    /// dropping this `Device` will still try to drop the underlying device.
+    pub fn as_raw(&self) -> CUdevice {
         self.device
     }
 }
```
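Because `as_raw` is now public, callers can mix the safe wrappers with raw FFI. A hedged sketch (not in the commit), assuming cust keeps rustacuda's `Device::get_attribute` and exports `CudaResult` from `cust::error`; it checks the `ConcurrentManagedAccess` attribute that the new advising docs call out as a requirement:

```rust
use cust::device::{Device, DeviceAttribute};
use cust::error::CudaResult;

/// Returns true if `device` can service the unified-memory advising APIs.
fn supports_memory_advising(device: &Device) -> CudaResult<bool> {
    // Non-zero means the device supports concurrent managed access.
    let concurrent = device.get_attribute(DeviceAttribute::ConcurrentManagedAccess)?;
    // `as_raw` hands out the raw `CUdevice` for FFI calls without consuming
    // the `Device`, unlike the old crate-private `into_inner`.
    let _raw = device.as_raw();
    Ok(concurrent != 0)
}
```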

crates/cust/src/memory/unified.rs

Lines changed: 193 additions & 3 deletions

```diff
@@ -1,7 +1,12 @@
 use super::DeviceCopy;
+use crate::device::Device;
+#[allow(unused_imports)]
+use crate::device::DeviceAttribute;
 use crate::error::*;
 use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified};
 use crate::memory::UnifiedPointer;
+use crate::prelude::Stream;
+use crate::sys as cuda;
 use std::borrow::{Borrow, BorrowMut};
 use std::cmp::Ordering;
 use std::convert::{AsMut, AsRef};
@@ -175,7 +180,7 @@ impl<T: DeviceCopy> UnifiedBox<T> {
     /// let ptr = x.as_unified_ptr();
     /// println!("{:p}", ptr);
     /// ```
-    pub fn as_unified_ptr(&mut self) -> UnifiedPointer<T> {
+    pub fn as_unified_ptr(&self) -> UnifiedPointer<T> {
         self.ptr
     }
 
@@ -591,6 +596,191 @@ impl<T: DeviceCopy> Drop for UnifiedBuffer<T> {
     }
 }
 
+/// Functions for advising the driver about certain uses of unified memory, such as advising the driver
+/// to prefetch memory or to treat memory as read-mostly.
+///
+/// Note that none of the following APIs are required for correctness and/or safety; any use of the memory
+/// will be valid no matter the use of the following functions. However, such uses may be very inefficient and/or
+/// have increased memory consumption.
+pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed {
+    fn as_slice(&self) -> &[T];
+
+    // prefetch is documented as only being able to return Success, InvalidValue, or InvalidDevice,
+    // none of which should ever happen here because Streams, Devices, and unified buffers are
+    // always valid, but we still surface the CUDA result rather than swallowing it.
+
+    /// Advises the driver to enqueue an operation on the stream to prefetch the memory to the CPU.
+    /// This will cause the driver to fetch the data back to the CPU as soon as the operation is reached
+    /// on the stream.
+    ///
+    /// The CPU must have the attribute [`DeviceAttribute::ConcurrentManagedAccess`].
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// # let _context = cust::quick_init().unwrap();
+    /// # use cust::prelude::*;
+    /// use cust::memory::*;
+    /// let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
+    /// let x = UnifiedBuffer::from_slice(&[10u32, 20, 30])?;
+    /// x.prefetch_to_host(&stream)?;
+    /// stream.synchronize()?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    fn prefetch_to_host(&self, stream: &Stream) -> CudaResult<()> {
+        let slice = self.as_slice();
+        let mem_size = std::mem::size_of_val(slice);
+
+        unsafe {
+            cuda::cuMemPrefetchAsync(
+                slice.as_ptr() as cuda::CUdeviceptr,
+                mem_size,
+                -1, // the CU_DEVICE_CPU #define
+                stream.as_inner(),
+            )
+            .to_result()?;
+        }
+        Ok(())
+    }
+
+    /// Advises the driver to enqueue an operation on the stream to prefetch the memory to a certain GPU.
+    /// This will cause the driver to fetch the data to the specified device as soon as the operation
+    /// is reached on the stream.
+    ///
+    /// The device must have the attribute [`DeviceAttribute::ConcurrentManagedAccess`].
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// # let _context = cust::quick_init().unwrap();
+    /// # use cust::prelude::*;
+    /// use cust::memory::*;
+    /// let device = Device::get_device(0)?;
+    /// let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
+    /// let x = UnifiedBuffer::from_slice(&[10u32, 20, 30])?;
+    /// x.prefetch_to_device(&stream, &device)?;
+    /// stream.synchronize()?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    fn prefetch_to_device(&self, stream: &Stream, device: &Device) -> CudaResult<()> {
+        let slice = self.as_slice();
+        let mem_size = std::mem::size_of_val(slice);
+
+        unsafe {
+            cuda::cuMemPrefetchAsync(
+                slice.as_ptr() as cuda::CUdeviceptr,
+                mem_size,
+                device.as_raw(),
+                stream.as_inner(),
+            )
+            .to_result()?;
+        }
+        Ok(())
+    }
+
+    /// Advises the driver that this memory range is mostly going to be read from, and only occasionally written to.
+    ///
+    /// Any read accesses from any processor will create a read-only copy of at least the accessed pages in that processor's memory.
+    ///
+    /// Additionally, when prefetching, a read-only copy of the data will be created on the destination processor. If any processor
+    /// attempts to write to this data, all copies of the corresponding page will be invalidated except for the one where the write occurred.
+    ///
+    /// For a page to be read-duplicated, the accessing processor must have a non-zero value for [`DeviceAttribute::ConcurrentManagedAccess`].
+    /// Additionally, if a context is created on a device that does not have [`DeviceAttribute::ConcurrentManagedAccess`], then read-duplication
+    /// will not occur until all such contexts are destroyed.
+    fn advise_read_mostly(&self, read_mostly: bool) -> CudaResult<()> {
+        let slice = self.as_slice();
+        let mem_size = std::mem::size_of_val(slice);
+
+        let advice = if read_mostly {
+            cuda::CUmem_advise::CU_MEM_ADVISE_SET_READ_MOSTLY
+        } else {
+            cuda::CUmem_advise::CU_MEM_ADVISE_UNSET_READ_MOSTLY
+        };
+
+        unsafe {
+            cuda::cuMemAdvise(slice.as_ptr() as cuda::CUdeviceptr, mem_size, advice, 0)
+                .to_result()?;
+        }
+        Ok(())
+    }
+
+    /// Advises the driver as to the preferred location for this memory range: either
+    /// a device with `Some(device)` or the CPU with `None`. If the location is a GPU,
+    /// it must have [`DeviceAttribute::ConcurrentManagedAccess`].
+    ///
+    /// Setting the preferred location does not cause the data to be migrated to that location immediately.
+    /// It instead guides the migration policy when a fault occurs on the memory region. If the data is already in
+    /// its preferred location and the faulting processor can establish a mapping without requiring the data to be migrated,
+    /// then data migration will be avoided. On the other hand, if the data is not there or a mapping cannot be established,
+    /// then it will be migrated to the accessing processor.
+    ///
+    /// Having a preferred location can override the page thrash detection and resolution logic in the unified memory driver.
+    /// Normally, if a page is detected to be constantly thrashing between processors, the page may eventually be pinned to
+    /// host memory by the driver. But if the preferred location is set as device memory, then the page will continue
+    /// to thrash indefinitely.
+    ///
+    /// If [`advise_read_mostly`](Self::advise_read_mostly) is set on this memory region or a subset of it, then the policies
+    /// associated with that advice will override the policies of this advice.
+    ///
+    /// This advice does not prevent the use of [`prefetch_to_host`](Self::prefetch_to_host) or [`prefetch_to_device`](Self::prefetch_to_device).
+    fn preferred_location(&self, preferred_location: Option<Device>) -> CudaResult<()> {
+        let slice = self.as_slice();
+        let mem_size = std::mem::size_of_val(slice);
+
+        unsafe {
+            cuda::cuMemAdvise(
+                slice.as_ptr() as cuda::CUdeviceptr,
+                mem_size,
+                cuda::CUmem_advise::CU_MEM_ADVISE_SET_PREFERRED_LOCATION,
+                preferred_location.map(|d| d.as_raw()).unwrap_or(-1),
+            )
+            .to_result()?;
+        }
+        Ok(())
+    }
+
+    /// Undoes the most recent changes made by [`preferred_location`](Self::preferred_location).
+    fn unset_preferred_location(&self) -> CudaResult<()> {
+        let slice = self.as_slice();
+        let mem_size = std::mem::size_of_val(slice);
+
+        unsafe {
+            cuda::cuMemAdvise(
+                slice.as_ptr() as cuda::CUdeviceptr,
+                mem_size,
+                cuda::CUmem_advise::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
+                0,
+            )
+            .to_result()?;
+        }
+        Ok(())
+    }
+}
+
+impl<T: DeviceCopy> MemoryAdvise<T> for UnifiedBox<T> {
+    fn as_slice(&self) -> &[T] {
+        // SAFETY: unified pointers are valid on the CPU
+        unsafe { std::slice::from_raw_parts(self.as_unified_ptr().as_raw(), 1) }
+    }
+}
+
+impl<T: DeviceCopy> MemoryAdvise<T> for UnifiedBuffer<T> {
+    fn as_slice(&self) -> &[T] {
+        self
+    }
+}
+
+mod private {
+    pub trait Sealed {}
+    impl<T: super::DeviceCopy> Sealed for super::UnifiedBox<T> {}
+    impl<T: super::DeviceCopy> Sealed for super::UnifiedBuffer<T> {}
+}
+
 #[cfg(test)]
 mod test_unified_box {
     use super::*;
@@ -718,8 +908,8 @@ mod test_unified_buffer {
     #[test]
     fn test_unified_pointer_implements_traits_safely() {
         let _context = crate::quick_init().unwrap();
-        let mut x = UnifiedBox::new(5u64).unwrap();
-        let mut y = UnifiedBox::new(0u64).unwrap();
+        let x = UnifiedBox::new(5u64).unwrap();
+        let y = UnifiedBox::new(0u64).unwrap();
 
         // If the impls dereference the pointer, this should segfault.
         let _ = Ord::cmp(&x.as_unified_ptr(), &y.as_unified_ptr());
```
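The prefetch methods carry doctests above, but `advise_read_mostly` and `preferred_location` do not, so here is a sketch (not from the commit) composing the new advising calls on a read-heavy lookup table; it uses only names introduced in this diff:

```rust
use cust::device::Device;
use cust::memory::*;
use cust::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let _context = cust::quick_init()?;
    let device = Device::get_device(0)?;
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;

    // A table that kernels read constantly but (almost) never write.
    let lookup = UnifiedBuffer::from_slice(&[0.5f32; 1024])?;

    // Ask the driver to read-duplicate pages rather than migrate them back
    // and forth between processors.
    lookup.advise_read_mostly(true)?;
    // Stage a copy on the GPU before the first kernel touches it.
    lookup.prefetch_to_device(&stream, &device)?;
    stream.synchronize()?;
    // Keep the canonical copy resident on the GPU; `None` would prefer the CPU.
    lookup.preferred_location(Some(device))?;
    Ok(())
}
```

As the trait docs note, none of these calls affect correctness; they only steer the driver's migration policy, and they require a GPU with concurrent managed access support.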
