Skip to content

Commit 6f08eaf

Browse files
authored
Merge pull request #12781 from janjust/v5.0.x
V5.0.x: opal/cuda: Handle VMM pointers in cuda_check_addr
2 parents 6f91498 + c86408f commit 6f08eaf

File tree

3 files changed

+142
-7
lines changed

3 files changed

+142
-7
lines changed

config/opal_check_cuda.m4

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
dnl -*- autoconf -*-
22
dnl
3+
dnl Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
34
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
45
dnl University Research and Technology
56
dnl Corporation. All rights reserved.
@@ -118,6 +119,12 @@ AS_IF([test "$opal_check_cuda_happy" = "yes"],
118119
[#include <$opal_cuda_incdir/cuda.h>])],
119120
[])
120121
122+
# If we have CUDA support, check to see if we have support for cuMemCreate
# memory on host NUMA (CU_MEM_LOCATION_TYPE_HOST_NUMA declared in cuda.h).
# Note: the spaces around "=" are required; without them test(1) receives a
# single non-empty word and the condition is always true.
# CUDA_VMM_SUPPORT is set to 0 when CUDA is not available so that the
# AC_DEFINE_UNQUOTED that consumes it never expands to an empty value.
AS_IF([test "$opal_check_cuda_happy" = "yes"],
      [AC_CHECK_DECL([CU_MEM_LOCATION_TYPE_HOST_NUMA], [CUDA_VMM_SUPPORT=1], [CUDA_VMM_SUPPORT=0],
          [#include <$opal_cuda_incdir/cuda.h>])],
      [CUDA_VMM_SUPPORT=0])
127+
121128
# If we have CUDA support, check to see if we have support for SYNC_MEMOPS
122129
# which was first introduced in CUDA 6.0.
123130
AS_IF([test "$opal_check_cuda_happy" = "yes"],
@@ -160,6 +167,10 @@ AM_CONDITIONAL([OPAL_cuda_support], [test "x$CUDA_SUPPORT" = "x1"])
160167
AC_DEFINE_UNQUOTED([OPAL_CUDA_SUPPORT],$CUDA_SUPPORT,
161168
[Whether we want cuda device pointer support])
162169
170+
# Publish the value of CUDA_VMM_SUPPORT twice: as the Automake conditional
# OPAL_cuda_vmm_support (true when the value is 1) and as the C preprocessor
# symbol OPAL_CUDA_VMM_SUPPORT, which expands to the shell variable's value.
AM_CONDITIONAL([OPAL_cuda_vmm_support], [test "x$CUDA_VMM_SUPPORT" = "x1"])
171+
AC_DEFINE_UNQUOTED([OPAL_CUDA_VMM_SUPPORT],$CUDA_VMM_SUPPORT,
172+
[Whether we have CU_MEM_LOCATION_TYPE_HOST_NUMA support available])
173+
163174
AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"])
164175
AC_DEFINE_UNQUOTED([OPAL_CUDA_SYNC_MEMOPS],$CUDA_SYNC_MEMOPS,
165176
[Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available])

opal/mca/accelerator/cuda/accelerator_cuda.c

Lines changed: 129 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/*
2+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
23
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
34
* Copyright (c) 2014 Research Organization for Information Science
45
* and Technology (RIST). All rights reserved.
@@ -77,9 +78,93 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
7778
accelerator_cuda_get_buffer_id
7879
};
7980

81+
/**
 * Classify a pointer that may belong to a CUDA virtual memory management
 * (VMM) allocation.
 *
 * The allocation handle backing dbuf is retained, its allocation properties
 * are read, and the handle is released again before returning.
 *
 *  - Allocation located on a device: reported as device memory on
 *    prop.location.id.
 *  - Allocation located on a host NUMA node: reported as device memory on the
 *    first device ordinal that has read/write access to dbuf.
 *  - Anything else: reported as host memory with no device id.
 *
 * @param[in]  dbuf      Address to inspect.
 * @param[out] mem_type  Written only when 1 is returned.
 * @param[out] dev_id    Written only when 1 is returned.
 *
 * @return 1 when the allocation properties for dbuf could be retrieved,
 *         0 when any driver query fails or when the build has
 *         OPAL_CUDA_VMM_SUPPORT disabled.
 */
static int accelerator_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type,
                                      int *dev_id)
{
#if OPAL_CUDA_VMM_SUPPORT
    /* queried lazily and kept across calls; stays -1 until a query succeeds */
    static int device_count = -1;
    CUmemGenericAllocationHandle alloc_handle;
    CUmemAllocationProp prop;
    int is_vmm = 0;

    if (-1 == device_count) {
        if (CUDA_SUCCESS != cuDeviceGetCount(&device_count)) {
            return 0;
        }
    }

    if (CUDA_SUCCESS != cuMemRetainAllocationHandle(&alloc_handle, (void *) dbuf)) {
        return 0;
    }

    /* from here on the retained handle must be released on every path */
    if (CUDA_SUCCESS != cuMemGetAllocationPropertiesFromHandle(&prop, alloc_handle)) {
        goto release;
    }

    is_vmm = 1;

    if (CU_MEM_LOCATION_TYPE_DEVICE == prop.location.type) {
        *mem_type = CU_MEMORYTYPE_DEVICE;
        *dev_id = prop.location.id;
        goto release;
    }

    if (CU_MEM_LOCATION_TYPE_HOST_NUMA == prop.location.type) {
        /* look for the first device with read/write access to the buffer */
        for (int ordinal = 0; ordinal < device_count; ordinal++) {
            CUmemLocation location;
            unsigned long long flags;

            location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            location.id = ordinal;

            if (CUDA_SUCCESS != cuMemGetAccess(&flags, &location, dbuf)) {
                continue;
            }
            if (CU_MEM_ACCESS_FLAGS_PROT_READWRITE != flags) {
                continue;
            }

            *mem_type = CU_MEMORYTYPE_DEVICE;
            *dev_id = ordinal;
            goto release;
        }
    }

    /* no device can access the allocation, so treat it as host memory */
    *mem_type = CU_MEMORYTYPE_HOST;
    *dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;

release:
    cuMemRelease(alloc_handle);
    return is_vmm;
#else
    return 0;
#endif
}
143+
144+
/**
 * Translate the device that owns a CUDA context into a device ordinal.
 *
 * mem_ctx is pushed onto the calling thread's context stack, the driver is
 * asked which device the current context belongs to, and the ordinals
 * [0, opal_accelerator_cuda_num_devices) are searched for that device.  The
 * pushed context is popped again before returning.
 *
 * @param[in] mem_ctx  Context whose device is looked up.
 *
 * @return The matching device ordinal, or -1 when no ordinal matches or a
 *         driver query fails.
 */
static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
    int dev_id = -1;
    CUdevice ptr_dev;
    CUcontext popped_ctx = NULL;

    if (CUDA_SUCCESS != cuCtxPushCurrent(mem_ctx)) {
        /* Nothing was pushed: popping here would remove the caller's own
         * context from the stack, so bail out without touching it. */
        return dev_id;
    }

    /* Only compare against ptr_dev when the driver actually filled it in. */
    if (CUDA_SUCCESS == cuCtxGetDevice(&ptr_dev)) {
        for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
            CUdevice dev;
            if (CUDA_SUCCESS != cuDeviceGet(&dev, i)) {
                /* dev is not valid for this ordinal; try the next one */
                continue;
            }
            if (dev == ptr_dev) {
                dev_id = i;
                break;
            }
        }
    }

    /* Balance the successful push above. */
    cuCtxPopCurrent(&popped_ctx);
    return dev_id;
}
161+
80162
static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
81163
{
82164
CUresult result;
165+
int is_vmm = 0;
166+
int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
167+
CUmemorytype vmm_mem_type = 0;
83168
CUmemorytype mem_type = 0;
84169
CUdeviceptr dbuf = (CUdeviceptr) addr;
85170
CUcontext ctx = NULL, mem_ctx = NULL;
@@ -91,6 +176,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
91176

92177
*flags = 0;
93178

179+
is_vmm = accelerator_cuda_check_vmm(dbuf, &vmm_mem_type, &vmm_dev_id);
180+
94181
#if OPAL_CUDA_GET_ATTRIBUTES
95182
uint32_t is_managed = 0;
96183
/* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -120,14 +207,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
120207
return OPAL_ERROR;
121208
}
122209
} else if (CU_MEMORYTYPE_HOST == mem_type) {
123-
/* Host memory, nothing to do here */
124-
return 0;
210+
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
211+
mem_type = CU_MEMORYTYPE_DEVICE;
212+
*dev_id = vmm_dev_id;
213+
} else {
214+
/* Host memory, nothing to do here */
215+
return 0;
216+
}
125217
} else if (0 == mem_type) {
126218
/* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
127219
return 0;
220+
} else {
221+
if (is_vmm) {
222+
*dev_id = vmm_dev_id;
223+
} else {
224+
/* query the device from the context */
225+
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
226+
}
128227
}
129-
/* Must be a device pointer */
130-
assert(CU_MEMORYTYPE_DEVICE == mem_type);
131228
#else /* OPAL_CUDA_GET_ATTRIBUTES */
132229
result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
133230
if (CUDA_SUCCESS != result) {
@@ -138,12 +235,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
138235
return OPAL_ERROR;
139236
}
140237
} else if (CU_MEMORYTYPE_HOST == mem_type) {
141-
/* Host memory, nothing to do here */
142-
return 0;
238+
if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) {
239+
mem_type = CU_MEMORYTYPE_DEVICE;
240+
*dev_id = vmm_dev_id;
241+
} else {
242+
/* Host memory, nothing to do here */
243+
return 0;
244+
}
245+
} else {
246+
if (is_vmm) {
247+
*dev_id = vmm_dev_id;
248+
} else {
249+
result = cuPointerGetAttribute(&mem_ctx,
250+
CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
251+
/* query the device from the context */
252+
*dev_id = accelerator_cuda_get_device_id(mem_ctx);
253+
}
143254
}
255+
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
256+
144257
/* Must be a device pointer */
145258
assert(CU_MEMORYTYPE_DEVICE == mem_type);
146-
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
147259

148260
/* This piece of code was added in to handle in a case involving
149261
* OMP threads. The user had initialized CUDA and then spawned
@@ -166,6 +278,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
166278
return OPAL_ERROR;
167279
}
168280
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
281+
if (is_vmm) {
282+
/* This function is expected to set context if pointer is device
283+
* accessible but VMM allocations have NULL context associated
284+
* which cannot be set against the calling thread */
285+
opal_output(0,
286+
"CUDA: unable to set context with the given pointer"
287+
"ptr=%p aborting...", addr);
288+
return OPAL_ERROR;
289+
}
290+
169291
result = cuCtxSetCurrent(mem_ctx);
170292
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
171293
opal_output(0,

opal/mca/accelerator/cuda/accelerator_cuda.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_comp
4545

4646
OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
4747

48+
/* Defined outside this header.  accelerator_cuda_get_device_id() in
 * accelerator_cuda.c uses it as the exclusive upper bound on the device
 * ordinals it passes to cuDeviceGet() -- presumably the number of CUDA
 * devices found at initialization; confirm against the definition. */
OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices;
49+
4850
OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
4951

5052
END_C_DECLS

0 commit comments

Comments
 (0)