11/*
2+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
23 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
34 * University Research and Technology
45 * Corporation. All rights reserved.
@@ -106,6 +107,13 @@ struct cudaFunctionTable {
106107 int (* cuStreamDestroy )(CUstream );
107108#if OPAL_CUDA_GET_ATTRIBUTES
108109 int (* cuPointerGetAttributes )(unsigned int , CUpointer_attribute * , void * * , CUdeviceptr );
110+ #if OPAL_CUDA_VMM_SUPPORT
111+ int (* cuDeviceGetCount )(int * );
112+ int (* cuMemRelease )(CUmemGenericAllocationHandle );
113+ int (* cuMemRetainAllocationHandle )(CUmemGenericAllocationHandle * , void * );
114+ int (* cuMemGetAllocationPropertiesFromHandle )(CUmemAllocationProp * , CUmemGenericAllocationHandle );
115+ int (* cuMemGetAccess )(unsigned long long * , const CUmemLocation * , CUdeviceptr );
116+ #endif
109117#endif /* OPAL_CUDA_GET_ATTRIBUTES */
110118};
111119typedef struct cudaFunctionTable cudaFunctionTable_t ;
@@ -479,6 +487,13 @@ int mca_common_cuda_stage_one_init(void)
479487#if OPAL_CUDA_GET_ATTRIBUTES
480488 OPAL_CUDA_DLSYM (libcuda_handle , cuPointerGetAttributes );
481489#endif /* OPAL_CUDA_GET_ATTRIBUTES */
490+ #if OPAL_CUDA_VMM_SUPPORT
491+ OPAL_CUDA_DLSYM (libcuda_handle , cuDeviceGetCount );
492+ OPAL_CUDA_DLSYM (libcuda_handle , cuMemRelease );
493+ OPAL_CUDA_DLSYM (libcuda_handle , cuMemRetainAllocationHandle );
494+ OPAL_CUDA_DLSYM (libcuda_handle , cuMemGetAllocationPropertiesFromHandle );
495+ OPAL_CUDA_DLSYM (libcuda_handle , cuMemGetAccess );
496+ #endif
482497 return 0 ;
483498}
484499
@@ -1730,13 +1745,77 @@ static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) {
17301745}
17311746#endif /* OPAL_ENABLE_DEBUG */
17321747
1748+ static int mca_common_cuda_check_vmm (CUdeviceptr dbuf , CUmemorytype * mem_type )
1749+ {
1750+ #if OPAL_CUDA_VMM_SUPPORT
1751+ static int device_count = -1 ;
1752+ CUmemAllocationProp prop ;
1753+ CUmemLocation location ;
1754+ CUresult result ;
1755+ unsigned long long flags ;
1756+ CUmemGenericAllocationHandle alloc_handle ;
1757+
1758+ if (device_count == -1 ) {
1759+ result = cuFunc .cuDeviceGetCount (& device_count );
1760+ if (result != CUDA_SUCCESS ) {
1761+ return 0 ;
1762+ }
1763+ }
1764+
1765+ result = cuFunc .cuMemRetainAllocationHandle (& alloc_handle , (void * )dbuf );
1766+ if (result != CUDA_SUCCESS ) {
1767+ return 0 ;
1768+ }
1769+
1770+ result = cuFunc .cuMemGetAllocationPropertiesFromHandle (& prop , alloc_handle );
1771+ if (result != CUDA_SUCCESS ) {
1772+ cuFunc .cuMemRelease (alloc_handle );
1773+ return 0 ;
1774+ }
1775+
1776+ if (prop .location .type == CU_MEM_LOCATION_TYPE_DEVICE ) {
1777+ * mem_type = CU_MEMORYTYPE_DEVICE ;
1778+ cuFunc .cuMemRelease (alloc_handle );
1779+ return 1 ;
1780+ }
1781+
1782+ if (prop .location .type == CU_MEM_LOCATION_TYPE_HOST_NUMA ) {
1783+ /* check if device has access */
1784+ for (int i = 0 ; i < device_count ; i ++ ) {
1785+ location .type = CU_MEM_LOCATION_TYPE_DEVICE ;
1786+ location .id = i ;
1787+ result = cuFunc .cuMemGetAccess (& flags , & location , dbuf );
1788+ if ((CUDA_SUCCESS == result ) &&
1789+ (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
1790+ * mem_type = CU_MEMORYTYPE_DEVICE ;
1791+ cuFunc .cuMemRelease (alloc_handle );
1792+ return 1 ;
1793+ }
1794+ }
1795+ }
1796+
1797+ /* host must have access as device access possibility is exhausted */
1798+ * mem_type = CU_MEMORYTYPE_HOST ;
1799+ cuFunc .cuMemRelease (alloc_handle );
1800+ return 1 ;
1801+
1802+ #endif
1803+
1804+ return 0 ;
1805+ }
1806+
17331807/* Routines that get plugged into the opal datatype code */
17341808static int mca_common_cuda_is_gpu_buffer (const void * pUserBuf , opal_convertor_t * convertor )
17351809{
17361810 int res ;
1811+ int is_vmm = 0 ;
1812+ CUmemorytype vmm_mem_type = 0 ;
17371813 CUmemorytype memType = 0 ;
17381814 CUdeviceptr dbuf = (CUdeviceptr )pUserBuf ;
17391815 CUcontext ctx = NULL , memCtx = NULL ;
1816+
1817+ is_vmm = mca_common_cuda_check_vmm (dbuf , & vmm_mem_type );
1818+
17401819#if OPAL_CUDA_GET_ATTRIBUTES
17411820 uint32_t isManaged = 0 ;
17421821 /* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -1763,8 +1842,12 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
17631842 * just assume it is not. */
17641843 return 0 ;
17651844 } else if (memType == CU_MEMORYTYPE_HOST ) {
1766- /* Host memory, nothing to do here */
1767- return 0 ;
1845+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
1846+ memType = CU_MEMORYTYPE_DEVICE ;
1847+ } else {
1848+ /* Host memory, nothing to do here */
1849+ return 0 ;
1850+ }
17681851 } else if (memType == 0 ) {
17691852 /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
17701853 return 0 ;
@@ -1779,8 +1862,12 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
17791862 * just assume it is not. */
17801863 return 0 ;
17811864 } else if (memType == CU_MEMORYTYPE_HOST ) {
1782- /* Host memory, nothing to do here */
1783- return 0 ;
1865+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
1866+ memType = CU_MEMORYTYPE_DEVICE ;
1867+ } else {
1868+ /* Host memory, nothing to do here */
1869+ return 0 ;
1870+ }
17841871 }
17851872 /* Must be a device pointer */
17861873 assert (memType == CU_MEMORYTYPE_DEVICE );
@@ -1806,6 +1893,16 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
18061893 return OPAL_ERROR ;
18071894 }
18081895#endif /* OPAL_CUDA_GET_ATTRIBUTES */
1896+ if (is_vmm ) {
1897+ /* This function is expected to set context if pointer is device
1898+ * accessible but VMM allocations have NULL context associated
1899+ * which cannot be set against the calling thread */
1900+ opal_output (0 ,
1901+ "CUDA: unable to set context with the given pointer"
1902+ "ptr=%p aborting..." , dbuf );
1903+ return OPAL_ERROR ;
1904+ }
1905+
18091906 res = cuFunc .cuCtxSetCurrent (memCtx );
18101907 if (OPAL_UNLIKELY (res != CUDA_SUCCESS )) {
18111908 opal_output (0 , "CUDA: error calling cuCtxSetCurrent: "
0 commit comments