11/*
2+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
23 * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
34 * Copyright (c) 2014 Research Organization for Information Science
45 * and Technology (RIST). All rights reserved.
@@ -77,9 +78,93 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
7778 accelerator_cuda_get_buffer_id
7879};
7980
81+ static int accelerator_cuda_check_vmm (CUdeviceptr dbuf , CUmemorytype * mem_type ,
82+ int * dev_id )
83+ {
84+ #if OPAL_CUDA_VMM_SUPPORT
85+ static int device_count = -1 ;
86+ CUmemAllocationProp prop ;
87+ CUmemLocation location ;
88+ CUresult result ;
89+ unsigned long long flags ;
90+ CUmemGenericAllocationHandle alloc_handle ;
91+
92+ if (device_count == -1 ) {
93+ result = cuDeviceGetCount (& device_count );
94+ if (result != CUDA_SUCCESS ) {
95+ return 0 ;
96+ }
97+ }
98+
99+ result = cuMemRetainAllocationHandle (& alloc_handle , (void * )dbuf );
100+ if (result != CUDA_SUCCESS ) {
101+ return 0 ;
102+ }
103+
104+ result = cuMemGetAllocationPropertiesFromHandle (& prop , alloc_handle );
105+ if (result != CUDA_SUCCESS ) {
106+ cuMemRelease (alloc_handle );
107+ return 0 ;
108+ }
109+
110+ if (prop .location .type == CU_MEM_LOCATION_TYPE_DEVICE ) {
111+ * mem_type = CU_MEMORYTYPE_DEVICE ;
112+ * dev_id = prop .location .id ;
113+ cuMemRelease (alloc_handle );
114+ return 1 ;
115+ }
116+
117+ if (prop .location .type == CU_MEM_LOCATION_TYPE_HOST_NUMA ) {
118+ /* check if device has access */
119+ for (int i = 0 ; i < device_count ; i ++ ) {
120+ location .type = CU_MEM_LOCATION_TYPE_DEVICE ;
121+ location .id = i ;
122+ result = cuMemGetAccess (& flags , & location , dbuf );
123+ if ((CUDA_SUCCESS == result ) &&
124+ (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
125+ * mem_type = CU_MEMORYTYPE_DEVICE ;
126+ * dev_id = i ;
127+ cuMemRelease (alloc_handle );
128+ return 1 ;
129+ }
130+ }
131+ }
132+
133+ /* host must have access as device access possibility is exhausted */
134+ * mem_type = CU_MEMORYTYPE_HOST ;
135+ * dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
136+ cuMemRelease (alloc_handle );
137+ return 1 ;
138+
139+ #endif
140+
141+ return 0 ;
142+ }
143+
144+ static int accelerator_cuda_get_device_id (CUcontext mem_ctx ) {
145+ /* query the device from the context */
146+ int dev_id = -1 ;
147+ CUdevice ptr_dev ;
148+ cuCtxPushCurrent (mem_ctx );
149+ cuCtxGetDevice (& ptr_dev );
150+ for (int i = 0 ; i < opal_accelerator_cuda_num_devices ; ++ i ) {
151+ CUdevice dev ;
152+ cuDeviceGet (& dev , i );
153+ if (dev == ptr_dev ) {
154+ dev_id = i ;
155+ break ;
156+ }
157+ }
158+ cuCtxPopCurrent (& mem_ctx );
159+ return dev_id ;
160+ }
161+
80162static int accelerator_cuda_check_addr (const void * addr , int * dev_id , uint64_t * flags )
81163{
82164 CUresult result ;
165+ int is_vmm = 0 ;
166+ int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
167+ CUmemorytype vmm_mem_type = 0 ;
83168 CUmemorytype mem_type = 0 ;
84169 CUdeviceptr dbuf = (CUdeviceptr ) addr ;
85170 CUcontext ctx = NULL , mem_ctx = NULL ;
@@ -91,6 +176,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
91176
92177 * flags = 0 ;
93178
179+ is_vmm = accelerator_cuda_check_vmm (dbuf , & vmm_mem_type , & vmm_dev_id );
180+
94181#if OPAL_CUDA_GET_ATTRIBUTES
95182 uint32_t is_managed = 0 ;
96183 /* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -120,14 +207,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
120207 return OPAL_ERROR ;
121208 }
122209 } else if (CU_MEMORYTYPE_HOST == mem_type ) {
123- /* Host memory, nothing to do here */
124- return 0 ;
210+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
211+ mem_type = CU_MEMORYTYPE_DEVICE ;
212+ * dev_id = vmm_dev_id ;
213+ } else {
214+ /* Host memory, nothing to do here */
215+ return 0 ;
216+ }
125217 } else if (0 == mem_type ) {
126218 /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
127219 return 0 ;
220+ } else {
221+ if (is_vmm ) {
222+ * dev_id = vmm_dev_id ;
223+ } else {
224+ /* query the device from the context */
225+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
226+ }
128227 }
129- /* Must be a device pointer */
130- assert (CU_MEMORYTYPE_DEVICE == mem_type );
131228#else /* OPAL_CUDA_GET_ATTRIBUTES */
132229 result = cuPointerGetAttribute (& mem_type , CU_POINTER_ATTRIBUTE_MEMORY_TYPE , dbuf );
133230 if (CUDA_SUCCESS != result ) {
@@ -138,12 +235,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
138235 return OPAL_ERROR ;
139236 }
140237 } else if (CU_MEMORYTYPE_HOST == mem_type ) {
141- /* Host memory, nothing to do here */
142- return 0 ;
238+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
239+ mem_type = CU_MEMORYTYPE_DEVICE ;
240+ * dev_id = vmm_dev_id ;
241+ } else {
242+ /* Host memory, nothing to do here */
243+ return 0 ;
244+ }
245+ } else {
246+ if (is_vmm ) {
247+ * dev_id = vmm_dev_id ;
248+ } else {
249+ result = cuPointerGetAttribute (& mem_ctx ,
250+ CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
251+ /* query the device from the context */
252+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
253+ }
143254 }
255+ #endif /* OPAL_CUDA_GET_ATTRIBUTES */
256+
144257 /* Must be a device pointer */
145258 assert (CU_MEMORYTYPE_DEVICE == mem_type );
146- #endif /* OPAL_CUDA_GET_ATTRIBUTES */
147259
148260 /* This piece of code was added in to handle in a case involving
149261 * OMP threads. The user had initialized CUDA and then spawned
@@ -166,6 +278,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
166278 return OPAL_ERROR ;
167279 }
168280#endif /* OPAL_CUDA_GET_ATTRIBUTES */
281+ if (is_vmm ) {
282+ /* This function is expected to set context if pointer is device
283+ * accessible but VMM allocations have NULL context associated
284+ * which cannot be set against the calling thread */
285+ opal_output (0 ,
286+ "CUDA: unable to set context with the given pointer"
287+ "ptr=%p aborting..." , addr );
288+ return OPAL_ERROR ;
289+ }
290+
169291 result = cuCtxSetCurrent (mem_ctx );
170292 if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
171293 opal_output (0 ,
0 commit comments