11/*
2+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
23 * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
34 * Copyright (c) 2014 Research Organization for Information Science
45 * and Technology (RIST). All rights reserved.
56 * Copyright (c) 2014 Mellanox Technologies, Inc.
67 * All rights reserved.
78 * Copyright (c) Amazon.com, Inc. or its affiliates.
89 * All Rights reserved.
9- * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
1010 * Copyright (c) 2024 The University of Tennessee and The University
1111 * of Tennessee Research Foundation. All rights
1212 * reserved.
@@ -154,9 +154,75 @@ static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
154154 return dev_id ;
155155}
156156
157+ static int accelerator_cuda_check_vmm (CUdeviceptr dbuf , CUmemorytype * mem_type ,
158+ int * dev_id )
159+ {
160+ #if OPAL_CUDA_VMM_SUPPORT
161+ static int device_count = -1 ;
162+ CUmemAllocationProp prop ;
163+ CUmemLocation location ;
164+ CUresult result ;
165+ unsigned long long flags ;
166+ CUmemGenericAllocationHandle alloc_handle ;
167+
168+ if (device_count == -1 ) {
169+ result = cuDeviceGetCount (& device_count );
170+ if (result != CUDA_SUCCESS ) {
171+ return 0 ;
172+ }
173+ }
174+
175+ result = cuMemRetainAllocationHandle (& alloc_handle , (void * )dbuf );
176+ if (result != CUDA_SUCCESS ) {
177+ return 0 ;
178+ }
179+
180+ result = cuMemGetAllocationPropertiesFromHandle (& prop , alloc_handle );
181+ if (result != CUDA_SUCCESS ) {
182+ cuMemRelease (alloc_handle );
183+ return 0 ;
184+ }
185+
186+ if (prop .location .type == CU_MEM_LOCATION_TYPE_DEVICE ) {
187+ * mem_type = CU_MEMORYTYPE_DEVICE ;
188+ * dev_id = prop .location .id ;
189+ cuMemRelease (alloc_handle );
190+ return 1 ;
191+ }
192+
193+ if (prop .location .type == CU_MEM_LOCATION_TYPE_HOST_NUMA ) {
194+ /* check if device has access */
195+ for (int i = 0 ; i < device_count ; i ++ ) {
196+ location .type = CU_MEM_LOCATION_TYPE_DEVICE ;
197+ location .id = i ;
198+ result = cuMemGetAccess (& flags , & location , dbuf );
199+ if ((CUDA_SUCCESS == result ) &&
200+ (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
201+ * mem_type = CU_MEMORYTYPE_DEVICE ;
202+ * dev_id = i ;
203+ cuMemRelease (alloc_handle );
204+ return 1 ;
205+ }
206+ }
207+ }
208+
209+ /* host must have access as device access possibility is exhausted */
210+ * mem_type = CU_MEMORYTYPE_HOST ;
211+ * dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
212+ cuMemRelease (alloc_handle );
213+ return 1 ;
214+
215+ #endif
216+
217+ return 0 ;
218+ }
219+
157220static int accelerator_cuda_check_addr (const void * addr , int * dev_id , uint64_t * flags )
158221{
159222 CUresult result ;
223+ int is_vmm = 0 ;
224+ int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
225+ CUmemorytype vmm_mem_type = 0 ;
160226 CUmemorytype mem_type = 0 ;
161227 CUdeviceptr dbuf = (CUdeviceptr ) addr ;
162228 CUcontext ctx = NULL , mem_ctx = NULL ;
@@ -168,6 +234,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
168234
169235 * flags = 0 ;
170236
237+ is_vmm = accelerator_cuda_check_vmm (dbuf , & vmm_mem_type , & vmm_dev_id );
238+
171239#if OPAL_CUDA_GET_ATTRIBUTES
172240 uint32_t is_managed = 0 ;
173241 /* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -197,17 +265,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
197265 return OPAL_ERROR ;
198266 }
199267 } else if (CU_MEMORYTYPE_HOST == mem_type ) {
200- /* Host memory, nothing to do here */
201- return 0 ;
268+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
269+ mem_type = CU_MEMORYTYPE_DEVICE ;
270+ * dev_id = vmm_dev_id ;
271+ } else {
272+ /* Host memory, nothing to do here */
273+ return 0 ;
274+ }
202275 } else if (0 == mem_type ) {
203276 /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
204277 return 0 ;
205278 } else {
206- /* query the device from the context */
207- * dev_id = accelerator_cuda_get_device_id (mem_ctx );
279+ if (is_vmm ) {
280+ * dev_id = vmm_dev_id ;
281+ } else {
282+ /* query the device from the context */
283+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
284+ }
208285 }
209- /* Must be a device pointer */
210- assert (CU_MEMORYTYPE_DEVICE == mem_type );
211286#else /* OPAL_CUDA_GET_ATTRIBUTES */
212287 result = cuPointerGetAttribute (& mem_type , CU_POINTER_ATTRIBUTE_MEMORY_TYPE , dbuf );
213288 if (CUDA_SUCCESS != result ) {
@@ -218,16 +293,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
218293 return OPAL_ERROR ;
219294 }
220295 } else if (CU_MEMORYTYPE_HOST == mem_type ) {
221- /* Host memory, nothing to do here */
222- return 0 ;
296+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
297+ mem_type = CU_MEMORYTYPE_DEVICE ;
298+ * dev_id = vmm_dev_id ;
299+ } else {
300+ /* Host memory, nothing to do here */
301+ return 0 ;
302+ }
223303 } else {
224- result = cuPointerGetAttribute (& mem_ctx , CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
225- /* query the device from the context */
226- * dev_id = accelerator_cuda_get_device_id (mem_ctx );
304+ if (is_vmm ) {
305+ * dev_id = vmm_dev_id ;
306+ } else {
307+ result = cuPointerGetAttribute (& mem_ctx ,
308+ CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
309+ /* query the device from the context */
310+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
311+ }
227312 }
313+ #endif /* OPAL_CUDA_GET_ATTRIBUTES */
314+
228315 /* Must be a device pointer */
229316 assert (CU_MEMORYTYPE_DEVICE == mem_type );
230- #endif /* OPAL_CUDA_GET_ATTRIBUTES */
231317
232318 /* This piece of code was added in to handle in a case involving
233319 * OMP threads. The user had initialized CUDA and then spawned
@@ -250,6 +336,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
250336 return OPAL_ERROR ;
251337 }
252338#endif /* OPAL_CUDA_GET_ATTRIBUTES */
339+ if (is_vmm ) {
340+ /* This function is expected to set context if pointer is device
341+ * accessible but VMM allocations have NULL context associated
342+ * which cannot be set against the calling thread */
343+ opal_output (0 ,
344+ "CUDA: unable to set context with the given pointer"
345+ "ptr=%p aborting..." , addr );
346+ return OPAL_ERROR ;
347+ }
348+
253349 result = cuCtxSetCurrent (mem_ctx );
254350 if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
255351 opal_output (0 ,
0 commit comments