|
1 | 1 | use super::DeviceCopy;
|
| 2 | +use crate::device::Device; |
| 3 | +#[allow(unused_imports)] |
| 4 | +use crate::device::DeviceAttribute; |
2 | 5 | use crate::error::*;
|
3 | 6 | use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified};
|
4 | 7 | use crate::memory::UnifiedPointer;
|
| 8 | +use crate::prelude::Stream; |
| 9 | +use crate::sys as cuda; |
5 | 10 | use std::borrow::{Borrow, BorrowMut};
|
6 | 11 | use std::cmp::Ordering;
|
7 | 12 | use std::convert::{AsMut, AsRef};
|
@@ -175,7 +180,7 @@ impl<T: DeviceCopy> UnifiedBox<T> {
|
175 | 180 | /// let ptr = x.as_unified_ptr();
|
176 | 181 | /// println!("{:p}", ptr);
|
177 | 182 | /// ```
|
178 | | - pub fn as_unified_ptr(&mut self) -> UnifiedPointer<T> { |
| 183 | + pub fn as_unified_ptr(&self) -> UnifiedPointer<T> { |
179 | 184 | self.ptr
|
180 | 185 | }
|
181 | 186 |
|
@@ -591,6 +596,191 @@ impl<T: DeviceCopy> Drop for UnifiedBuffer<T> {
|
591 | 596 | }
|
592 | 597 | }
|
593 | 598 |
|
| 599 | +/// Functions for advising the driver about certain uses of unified memory, such as advising the driver |
| 600 | +/// to prefetch memory or to treat memory as read-mostly. |
| 601 | +/// |
| 602 | +/// Note that none of the following APIs are required for correctness or safety; any use of the memory |
| 603 | +/// is valid regardless of whether these functions are called. However, without them such uses may be |
| 604 | +/// very inefficient and/or consume more memory than necessary. |
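| | +/// |
| | +/// # Example |
| | +/// |
| | +/// A minimal sketch combining a couple of the advice calls below, reusing the same setup as the |
| | +/// method-level examples (a small `UnifiedBuffer` and a non-blocking stream): |
| | +/// |
| | +/// ```no_run |
| | +/// # fn main() -> Result<(), Box<dyn std::error::Error>> { |
| | +/// # let _context = cust::quick_init().unwrap(); |
| | +/// # use cust::prelude::*; |
| | +/// use cust::memory::*; |
| | +/// let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; |
| | +/// let x = UnifiedBuffer::from_slice(&[10u32, 20, 30])?; |
| | +/// // hint that the buffer will mostly be read, then prefetch it back to the CPU |
| | +/// x.advise_read_mostly(true)?; |
| | +/// x.prefetch_to_host(&stream)?; |
| | +/// stream.synchronize()?; |
| | +/// # Ok(()) |
| | +/// # } |
| | +/// ``` |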
| 605 | +pub trait MemoryAdvise<T: DeviceCopy>: private::Sealed { |
| 606 | + fn as_slice(&self) -> &[T]; |
| 607 | + |
| 608 | + // prefetch is documented as only being able to return Success, InvalidValue, or InvalidDevice, |
| 609 | + // none of which should ever happen in practice because Streams, Devices, and unified buffers are always valid. |
| 610 | + // We still surface any failure as a CudaResult rather than panicking. |
| 611 | + |
| 612 | + /// Advises the driver to enqueue an operation on the stream to prefetch the memory to the CPU. |
| 613 | + /// This will cause the driver to fetch the data back to the CPU as soon as the operation is reached |
| 614 | + /// on the stream. |
| 615 | + /// |
| 616 | + /// The CPU must have the attribute [`DeviceAttribute::ConcurrentManagedAccess`]. |
| 617 | + /// |
| 618 | + /// # Example |
| 619 | + /// |
| 620 | + /// ```no_run |
| 621 | + /// # fn main() -> Result<(), Box<dyn std::error::Error>> { |
| 622 | + /// # let _context = cust::quick_init().unwrap(); |
| 623 | + /// # use cust::prelude::*; |
| 624 | + /// use cust::memory::*; |
| 625 | + /// let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; |
| 626 | + /// let x = UnifiedBuffer::from_slice(&[10u32, 20, 30])?; |
| 627 | + /// x.prefetch_to_host(&stream)?; |
| 628 | + /// stream.synchronize()?; |
| 629 | + /// # Ok(()) |
| 630 | + /// # } |
| 631 | + /// ``` |
| 632 | + fn prefetch_to_host(&self, stream: &Stream) -> CudaResult<()> { |
| 633 | + let slice = self.as_slice(); |
| 634 | + let mem_size = std::mem::size_of_val(slice); |
| 635 | + |
| 636 | + unsafe { |
| 637 | + cuda::cuMemPrefetchAsync( |
| 638 | + slice.as_ptr() as cuda::CUdeviceptr, |
| 639 | + mem_size, |
| 640 | + -1, // CU_DEVICE_CPU is #defined as -1 in the CUDA headers |
| 641 | + stream.as_inner(), |
| 642 | + ) |
| 643 | + .to_result()?; |
| 644 | + } |
| 645 | + Ok(()) |
| 646 | + } |
| 647 | + |
| 648 | + /// Advises the driver to enqueue an operation on the stream to prefetch the memory to a certain GPU. |
| 649 | + /// This will cause the driver to fetch the data to the specified device as soon as the operation |
| 650 | + /// is reached on the stream. |
| 651 | + /// |
| 652 | + /// The device must have the attribute [`DeviceAttribute::ConcurrentManagedAccess`]. |
| 653 | + /// |
| 654 | + /// # Example |
| 655 | + /// |
| 656 | + /// ```no_run |
| 657 | + /// # fn main() -> Result<(), Box<dyn std::error::Error>> { |
| 658 | + /// # let _context = cust::quick_init().unwrap(); |
| 659 | + /// # use cust::prelude::*; |
| 660 | + /// use cust::memory::*; |
| 661 | + /// let device = Device::get_device(0)?; |
| 662 | + /// let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; |
| 663 | + /// let x = UnifiedBuffer::from_slice(&[10u32, 20, 30])?; |
| 664 | + /// x.prefetch_to_device(&stream, &device)?; |
| 665 | + /// stream.synchronize()?; |
| 666 | + /// # Ok(()) |
| 667 | + /// # } |
| 668 | + /// ``` |
| 669 | + fn prefetch_to_device(&self, stream: &Stream, device: &Device) -> CudaResult<()> { |
| 670 | + let slice = self.as_slice(); |
| 671 | + let mem_size = std::mem::size_of_val(slice); |
| 672 | + |
| 673 | + unsafe { |
| 674 | + cuda::cuMemPrefetchAsync( |
| 675 | + slice.as_ptr() as cuda::CUdeviceptr, |
| 676 | + mem_size, |
| 677 | + device.as_raw(), |
| 678 | + stream.as_inner(), |
| 679 | + ) |
| 680 | + .to_result()?; |
| 681 | + } |
| 682 | + Ok(()) |
| 683 | + } |
| 684 | + |
| 685 | + /// Advises the driver that this memory range is mostly going to be read from, and only occasionally written to. |
| 686 | + /// |
| 687 | + /// Any read accesses from any processor will create a read-only copy of at least the accessed pages in that processor's memory. |
| 688 | + /// |
| 689 | + /// Additionally, when prefetching, a read-only copy of the data will be created on the destination processor. If any processor |
| 690 | + /// attempts to write to this data, all copies of the corresponding page will be invalidated except for the one where the write occurred. |
| 691 | + /// |
| 692 | + /// For a page to be read-duplicated, the accessing processor must have a non-zero value for [`DeviceAttribute::ConcurrentManagedAccess`]. |
| 693 | + /// Additionally, if a context is created on a device that does not have [`DeviceAttribute::ConcurrentManagedAccess`], then read-duplication |
| 694 | + /// will not occur until all such contexts are destroyed. |
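| | + /// |
| | + /// # Example |
| | + /// |
| | + /// A minimal sketch, reusing the buffer setup from the prefetch examples above: |
| | + /// |
| | + /// ```no_run |
| | + /// # fn main() -> Result<(), Box<dyn std::error::Error>> { |
| | + /// # let _context = cust::quick_init().unwrap(); |
| | + /// # use cust::prelude::*; |
| | + /// use cust::memory::*; |
| | + /// let x = UnifiedBuffer::from_slice(&[10u32, 20, 30])?; |
| | + /// // mark the buffer as read-mostly; accessing processors keep read-only copies |
| | + /// x.advise_read_mostly(true)?; |
| | + /// # Ok(()) |
| | + /// # } |
| | + /// ``` |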
| 695 | + fn advise_read_mostly(&self, read_mostly: bool) -> CudaResult<()> { |
| 696 | + let slice = self.as_slice(); |
| 697 | + let mem_size = std::mem::size_of_val(slice); |
| 698 | + |
| 699 | + let advice = if read_mostly { |
| 700 | + cuda::CUmem_advise::CU_MEM_ADVISE_SET_READ_MOSTLY |
| 701 | + } else { |
| 702 | + cuda::CUmem_advise::CU_MEM_ADVISE_UNSET_READ_MOSTLY |
| 703 | + }; |
| 704 | + |
| 705 | + unsafe { |
| 706 | + cuda::cuMemAdvise(slice.as_ptr() as cuda::CUdeviceptr, mem_size, advice, 0) |
| 707 | + .to_result()?; |
| 708 | + } |
| 709 | + Ok(()) |
| 710 | + } |
| 711 | + |
| 712 | + /// Advises the driver as to the preferred device for this memory range. Either |
| 713 | + /// a device with `Some(device)` or the CPU with `None`. If the device is a GPU, |
| 714 | + /// it must have [`DeviceAttribute::ConcurrentManagedAccess`]. |
| 715 | + /// |
| 716 | + /// Setting the preferred location does not cause the data to be migrated to that location immediately. |
| 717 | + /// It instead guides the migration policy when a fault occurs on the memory region. If the data is already in |
| 718 | + /// its preferred location and the faulting processor can establish a mapping without requiring the data to be migrated, |
| 719 | + /// then data migration will be avoided. On the other hand, if the data is not there or a mapping cannot be established, |
| 720 | + /// then it will be migrated to the accessing processor. |
| 721 | + /// |
| 722 | + /// Having a preferred location can override the page thrash detection and resolution logic in the unified memory driver. |
| 723 | + /// Normally if a page is detected to be constantly thrashing between processors, the page may eventually be pinned to |
| 724 | + /// host memory by the driver. But if the preferred location is set as device memory, then the page will continue |
| 725 | + /// to thrash indefinitely. |
| 726 | + /// |
| 727 | + /// If [`advise_read_mostly`](Self::advise_read_mostly) is set on this memory region or a subset of it, then the policies |
| 728 | + /// associated with that advice will override the policies of this advice. |
| 729 | + /// |
| 730 | + /// This advice does not prevent the use of [`prefetch_to_host`](Self::prefetch_to_host) or [`prefetch_to_device`](Self::prefetch_to_device). |
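| | + /// |
| | + /// # Example |
| | + /// |
| | + /// A minimal sketch, mirroring the device setup from the prefetch example above: |
| | + /// |
| | + /// ```no_run |
| | + /// # fn main() -> Result<(), Box<dyn std::error::Error>> { |
| | + /// # let _context = cust::quick_init().unwrap(); |
| | + /// # use cust::prelude::*; |
| | + /// use cust::memory::*; |
| | + /// let device = Device::get_device(0)?; |
| | + /// let x = UnifiedBuffer::from_slice(&[10u32, 20, 30])?; |
| | + /// // prefer to keep this data resident on the first GPU |
| | + /// x.preferred_location(Some(device))?; |
| | + /// # Ok(()) |
| | + /// # } |
| | + /// ``` |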
| 731 | + fn preferred_location(&self, preferred_location: Option<Device>) -> CudaResult<()> { |
| 732 | + let slice = self.as_slice(); |
| 733 | + let mem_size = std::mem::size_of_val(slice); |
| 734 | + |
| 735 | + unsafe { |
| 736 | + cuda::cuMemAdvise( |
| 737 | + slice.as_ptr() as cuda::CUdeviceptr, |
| 738 | + mem_size, |
| 739 | + cuda::CUmem_advise::CU_MEM_ADVISE_SET_PREFERRED_LOCATION, |
| 740 | + preferred_location.map(|d| d.as_raw()).unwrap_or(-1), |
| 741 | + ) |
| 742 | + .to_result()?; |
| 743 | + } |
| 744 | + Ok(()) |
| 745 | + } |
| 746 | + |
| 747 | + /// Undoes the effect of the most recent call to [`preferred_location`](Self::preferred_location). |
| 748 | + fn unset_preferred_location(&self) -> CudaResult<()> { |
| 749 | + let slice = self.as_slice(); |
| 750 | + let mem_size = std::mem::size_of_val(slice); |
| 751 | + |
| 752 | + unsafe { |
| 753 | + cuda::cuMemAdvise( |
| 754 | + slice.as_ptr() as cuda::CUdeviceptr, |
| 755 | + mem_size, |
| 756 | + cuda::CUmem_advise::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, |
| 757 | + 0, |
| 758 | + ) |
| 759 | + .to_result()?; |
| 760 | + } |
| 761 | + Ok(()) |
| 762 | + } |
| 763 | +} |
| 764 | + |
| 765 | +impl<T: DeviceCopy> MemoryAdvise<T> for UnifiedBox<T> { |
| 766 | + fn as_slice(&self) -> &[T] { |
| 767 | + // SAFETY: unified pointers are valid on the CPU |
| 768 | + unsafe { std::slice::from_raw_parts(self.as_unified_ptr().as_raw(), 1) } |
| 769 | + } |
| 770 | +} |
| 771 | + |
| 772 | +impl<T: DeviceCopy> MemoryAdvise<T> for UnifiedBuffer<T> { |
| 773 | + fn as_slice(&self) -> &[T] { |
| 774 | + self |
| 775 | + } |
| 776 | +} |
| 777 | + |
| 778 | +mod private { |
| 779 | + pub trait Sealed {} |
| 780 | + impl<T: super::DeviceCopy> Sealed for super::UnifiedBox<T> {} |
| 781 | + impl<T: super::DeviceCopy> Sealed for super::UnifiedBuffer<T> {} |
| 782 | +} |
| 783 | + |
594 | 784 | #[cfg(test)]
|
595 | 785 | mod test_unified_box {
|
596 | 786 | use super::*;
|
@@ -718,8 +908,8 @@ mod test_unified_buffer {
|
718 | 908 | #[test]
|
719 | 909 | fn test_unified_pointer_implements_traits_safely() {
|
720 | 910 | let _context = crate::quick_init().unwrap();
|
721 | | - let mut x = UnifiedBox::new(5u64).unwrap(); |
722 | | - let mut y = UnifiedBox::new(0u64).unwrap(); |
| 911 | + let x = UnifiedBox::new(5u64).unwrap(); |
| 912 | + let y = UnifiedBox::new(0u64).unwrap(); |
723 | 913 |
|
724 | 914 | // If the impls dereference the pointer, this should segfault.
|
725 | 915 | let _ = Ord::cmp(&x.as_unified_ptr(), &y.as_unified_ptr());
|
|