1
1
/*
2
+ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
2
3
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
3
4
* Copyright (c) 2014 Research Organization for Information Science
4
5
* and Technology (RIST). All rights reserved.
@@ -77,9 +78,93 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
77
78
accelerator_cuda_get_buffer_id
78
79
};
79
80
81
+ static int accelerator_cuda_check_vmm (CUdeviceptr dbuf , CUmemorytype * mem_type ,
82
+ int * dev_id )
83
+ {
84
+ #if OPAL_CUDA_VMM_SUPPORT
85
+ static int device_count = -1 ;
86
+ CUmemAllocationProp prop ;
87
+ CUmemLocation location ;
88
+ CUresult result ;
89
+ unsigned long long flags ;
90
+ CUmemGenericAllocationHandle alloc_handle ;
91
+
92
+ if (device_count == -1 ) {
93
+ result = cuDeviceGetCount (& device_count );
94
+ if (result != CUDA_SUCCESS ) {
95
+ return 0 ;
96
+ }
97
+ }
98
+
99
+ result = cuMemRetainAllocationHandle (& alloc_handle , (void * )dbuf );
100
+ if (result != CUDA_SUCCESS ) {
101
+ return 0 ;
102
+ }
103
+
104
+ result = cuMemGetAllocationPropertiesFromHandle (& prop , alloc_handle );
105
+ if (result != CUDA_SUCCESS ) {
106
+ cuMemRelease (alloc_handle );
107
+ return 0 ;
108
+ }
109
+
110
+ if (prop .location .type == CU_MEM_LOCATION_TYPE_DEVICE ) {
111
+ * mem_type = CU_MEMORYTYPE_DEVICE ;
112
+ * dev_id = prop .location .id ;
113
+ cuMemRelease (alloc_handle );
114
+ return 1 ;
115
+ }
116
+
117
+ if (prop .location .type == CU_MEM_LOCATION_TYPE_HOST_NUMA ) {
118
+ /* check if device has access */
119
+ for (int i = 0 ; i < device_count ; i ++ ) {
120
+ location .type = CU_MEM_LOCATION_TYPE_DEVICE ;
121
+ location .id = i ;
122
+ result = cuMemGetAccess (& flags , & location , dbuf );
123
+ if ((CUDA_SUCCESS == result ) &&
124
+ (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags )) {
125
+ * mem_type = CU_MEMORYTYPE_DEVICE ;
126
+ * dev_id = i ;
127
+ cuMemRelease (alloc_handle );
128
+ return 1 ;
129
+ }
130
+ }
131
+ }
132
+
133
+ /* host must have access as device access possibility is exhausted */
134
+ * mem_type = CU_MEMORYTYPE_HOST ;
135
+ * dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
136
+ cuMemRelease (alloc_handle );
137
+ return 1 ;
138
+
139
+ #endif
140
+
141
+ return 0 ;
142
+ }
143
+
144
+ static int accelerator_cuda_get_device_id (CUcontext mem_ctx ) {
145
+ /* query the device from the context */
146
+ int dev_id = -1 ;
147
+ CUdevice ptr_dev ;
148
+ cuCtxPushCurrent (mem_ctx );
149
+ cuCtxGetDevice (& ptr_dev );
150
+ for (int i = 0 ; i < opal_accelerator_cuda_num_devices ; ++ i ) {
151
+ CUdevice dev ;
152
+ cuDeviceGet (& dev , i );
153
+ if (dev == ptr_dev ) {
154
+ dev_id = i ;
155
+ break ;
156
+ }
157
+ }
158
+ cuCtxPopCurrent (& mem_ctx );
159
+ return dev_id ;
160
+ }
161
+
80
162
static int accelerator_cuda_check_addr (const void * addr , int * dev_id , uint64_t * flags )
81
163
{
82
164
CUresult result ;
165
+ int is_vmm = 0 ;
166
+ int vmm_dev_id = MCA_ACCELERATOR_NO_DEVICE_ID ;
167
+ CUmemorytype vmm_mem_type = 0 ;
83
168
CUmemorytype mem_type = 0 ;
84
169
CUdeviceptr dbuf = (CUdeviceptr ) addr ;
85
170
CUcontext ctx = NULL , mem_ctx = NULL ;
@@ -91,6 +176,8 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
91
176
92
177
* flags = 0 ;
93
178
179
+ is_vmm = accelerator_cuda_check_vmm (dbuf , & vmm_mem_type , & vmm_dev_id );
180
+
94
181
#if OPAL_CUDA_GET_ATTRIBUTES
95
182
uint32_t is_managed = 0 ;
96
183
/* With CUDA 7.0, we can get multiple attributes with a single call */
@@ -120,14 +207,24 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
120
207
return OPAL_ERROR ;
121
208
}
122
209
} else if (CU_MEMORYTYPE_HOST == mem_type ) {
123
- /* Host memory, nothing to do here */
124
- return 0 ;
210
+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
211
+ mem_type = CU_MEMORYTYPE_DEVICE ;
212
+ * dev_id = vmm_dev_id ;
213
+ } else {
214
+ /* Host memory, nothing to do here */
215
+ return 0 ;
216
+ }
125
217
} else if (0 == mem_type ) {
126
218
/* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
127
219
return 0 ;
220
+ } else {
221
+ if (is_vmm ) {
222
+ * dev_id = vmm_dev_id ;
223
+ } else {
224
+ /* query the device from the context */
225
+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
226
+ }
128
227
}
129
- /* Must be a device pointer */
130
- assert (CU_MEMORYTYPE_DEVICE == mem_type );
131
228
#else /* OPAL_CUDA_GET_ATTRIBUTES */
132
229
result = cuPointerGetAttribute (& mem_type , CU_POINTER_ATTRIBUTE_MEMORY_TYPE , dbuf );
133
230
if (CUDA_SUCCESS != result ) {
@@ -138,12 +235,27 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
138
235
return OPAL_ERROR ;
139
236
}
140
237
} else if (CU_MEMORYTYPE_HOST == mem_type ) {
141
- /* Host memory, nothing to do here */
142
- return 0 ;
238
+ if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE )) {
239
+ mem_type = CU_MEMORYTYPE_DEVICE ;
240
+ * dev_id = vmm_dev_id ;
241
+ } else {
242
+ /* Host memory, nothing to do here */
243
+ return 0 ;
244
+ }
245
+ } else {
246
+ if (is_vmm ) {
247
+ * dev_id = vmm_dev_id ;
248
+ } else {
249
+ result = cuPointerGetAttribute (& mem_ctx ,
250
+ CU_POINTER_ATTRIBUTE_CONTEXT , dbuf );
251
+ /* query the device from the context */
252
+ * dev_id = accelerator_cuda_get_device_id (mem_ctx );
253
+ }
143
254
}
255
+ #endif /* OPAL_CUDA_GET_ATTRIBUTES */
256
+
144
257
/* Must be a device pointer */
145
258
assert (CU_MEMORYTYPE_DEVICE == mem_type );
146
- #endif /* OPAL_CUDA_GET_ATTRIBUTES */
147
259
148
260
/* This piece of code was added in to handle in a case involving
149
261
* OMP threads. The user had initialized CUDA and then spawned
@@ -166,6 +278,16 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
166
278
return OPAL_ERROR ;
167
279
}
168
280
#endif /* OPAL_CUDA_GET_ATTRIBUTES */
281
+ if (is_vmm ) {
282
+ /* This function is expected to set context if pointer is device
283
+ * accessible but VMM allocations have NULL context associated
284
+ * which cannot be set against the calling thread */
285
+ opal_output (0 ,
286
+ "CUDA: unable to set context with the given pointer"
287
+ "ptr=%p aborting..." , addr );
288
+ return OPAL_ERROR ;
289
+ }
290
+
169
291
result = cuCtxSetCurrent (mem_ctx );
170
292
if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
171
293
opal_output (0 ,
0 commit comments