 #include "opal_config.h"
 #include "opal/mca/rcache/base/base.h"
 #include "opal/mca/rcache/gpusm/rcache_gpusm.h"
-#include "opal/cuda/common_cuda.h"
+#include "opal/include/opal/opal_cuda.h"
+#include <cuda.h>
 
 /**
  * Called when the registration free list is created. An event is created
  * for each entry.
  */
 static void mca_rcache_gpusm_registration_constructor(mca_rcache_gpusm_registration_t *item)
 {
-    mca_common_cuda_construct_event_and_handle(&item->event, (void *)&item->evtHandle);
+    uintptr_t *event = &item->event;
+    void *handle = (void *)&item->evtHandle;
+    CUresult result;
+
+    result = cuEventCreate((CUevent *)event,
+                           CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output(0, "cuEventCreate failed\n");
+    }
+
+    result = cuIpcGetEventHandle((CUipcEventHandle *)handle, (CUevent)*event);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output(0, "cuIpcGetEventHandle failed\n");
+    }
 }
 
 /**
  * Called when the program is exiting. This destroys the events.
  */
 static void mca_rcache_gpusm_registration_destructor(mca_rcache_gpusm_registration_t *item)
 {
-    mca_common_cuda_destruct_event(item->event);
+    uintptr_t event = item->event;
+    CUresult result;
 
+    result = cuEventDestroy((CUevent)event);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_output(0, "cuEventDestroy failed");
+    }
 }
 
 OBJ_CLASS_INSTANCE(mca_rcache_gpusm_registration_t, mca_rcache_base_registration_t,
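The constructor above creates one interprocess CUDA event per free-list entry and exports its IPC handle. For orientation, a minimal sketch of the consuming side, not part of this change: remote_handle and stream are hypothetical names for state the receiving process is assumed to already hold (the handle bytes having been shipped over by the sender).

/* Sketch (assumed usage): open the sender's exported IPC event handle
 * and order this process's copy-out after the sender's cuEventRecord(). */
CUipcEventHandle remote_handle;   /* hypothetical: received from the sender */
CUstream stream;                  /* hypothetical: stream doing the copy-out */
CUevent evt;

if (CUDA_SUCCESS == cuIpcOpenEventHandle(&evt, remote_handle)) {
    /* Work queued on 'stream' after this call waits for the event. */
    cuStreamWaitEvent(stream, evt, 0);
}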
@@ -81,7 +100,7 @@ void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t *rcache)
     /* Start with 0 entries in the free list since CUDA may not have
      * been initialized when this free list is created and there are
      * some CUDA-specific activities that need to be done. */
-    opal_free_list_init(&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t),
+    opal_free_list_init(&rcache->reg_list, sizeof(struct mca_opal_cuda_reg_t),
                         opal_cache_line_size, OBJ_CLASS(mca_rcache_gpusm_registration_t), 0,
                         opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, NULL, NULL);
 }
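Since the list starts with 0 entries, the constructor above (and hence cuEventCreate) first runs when the initial registration is requested, by which point CUDA is initialized. A minimal sketch of that lazy path, assuming opal_free_list_get() as the accessor pairing with the opal_free_list_return() visible in the final hunk:

/* Sketch (assumed usage): the first get after CUDA init grows the
 * initially empty list, running the registration constructor for each
 * newly allocated entry. */
opal_free_list_item_t *item = opal_free_list_get(&rcache->reg_list);
if (NULL == item) {
    return OPAL_ERR_OUT_OF_RESOURCE;
}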
@@ -96,6 +115,77 @@ int mca_rcache_gpusm_find(mca_rcache_base_module_t *rcache, void *addr, size_t s
     return mca_rcache_gpusm_register(rcache, addr, size, 0, 0, reg);
 }
 
+/*
+ * Get the memory handle of a local section of memory that can be sent
+ * to the remote side so it can access the memory. This is the
+ * registration function for the sending side of a message transfer.
+ */
+static int mca_rcache_gpusm_get_mem_handle(void *base, size_t size, mca_rcache_base_registration_t *newreg)
+{
+    CUmemorytype memType;
+    CUresult result;
+    CUipcMemHandle *memHandle;
+    CUdeviceptr pbase;
+    size_t psize;
+
+    mca_opal_cuda_reg_t *cuda_reg = (mca_opal_cuda_reg_t *)newreg;
+    memHandle = (CUipcMemHandle *)cuda_reg->data.memHandle;
+
+    /* We should only be here if this is a CUDA device pointer */
+    result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                                   (CUdeviceptr)base);
+    assert(CUDA_SUCCESS == result);
+    assert(CU_MEMORYTYPE_DEVICE == memType);
+
+    /* Get the memory handle so we can send it to the remote process. */
+    result = cuIpcGetMemHandle(memHandle, (CUdeviceptr)base);
+
+    if (CUDA_SUCCESS != result) {
+        return OPAL_ERROR;
+    }
+
+    /* Need to get the real base and size of the memory handle. This is
+     * how the remote side saves the handles in a cache. */
+    result = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)base);
+    if (CUDA_SUCCESS != result) {
+        return OPAL_ERROR;
+    }
+
+    /* Store all the information in the registration */
+    cuda_reg->base.base = (void *)pbase;
+    cuda_reg->base.bound = (unsigned char *)pbase + psize - 1;
+    cuda_reg->data.memh_seg_addr.pval = (void *)pbase;
+    cuda_reg->data.memh_seg_len = psize;
+
+#if OPAL_CUDA_SYNC_MEMOPS
+    /* With CUDA 6.0, we can set an attribute on the memory pointer that will
+     * ensure any synchronous copies are completed prior to any other access
+     * of the memory region. This means we do not need to record an event
+     * and send it to the remote side.
+     */
+    memType = 1; /* Just use this variable since we already have it */
+    result = cuPointerSetAttribute(&memType, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                   (CUdeviceptr)base);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        return OPAL_ERROR;
+    }
+#else
+    /* Need to record the event to ensure that any memcopies into the
+     * device memory have completed. The event handle associated with
+     * this event is sent to the remote process so that it will wait
+     * on this event prior to copying data out of the device memory.
+     * Note that this needs to be the NULL stream, since it is unknown
+     * what stream any copies into the device memory were done with. */
+    result = cuEventRecord((CUevent)cuda_reg->data.event, 0);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        return OPAL_ERROR;
+    }
+#endif /* OPAL_CUDA_SYNC_MEMOPS */
+
+    return OPAL_SUCCESS;
+}
+
 /*
  * This is the one function that does all the work. It will call into
  * the register function to get the memory handle for the sending
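mca_rcache_gpusm_get_mem_handle() above covers only the exporting side. Its receiving-side counterpart (not part of this diff) would map the handle with cuIpcOpenMemHandle(); a hedged sketch, where handle and offset are hypothetical values recovered from the received registration:

/* Sketch (assumed usage): map the sender's memory handle, then address
 * data relative to the real base that cuMemGetAddressRange() reported
 * on the sending side. */
CUipcMemHandle handle;   /* hypothetical: the sender's data.memHandle bytes */
size_t offset;           /* hypothetical: caller address minus sender base */
CUdeviceptr mapped_base;

if (CUDA_SUCCESS != cuIpcOpenMemHandle(&mapped_base, handle,
                                       CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)) {
    return OPAL_ERROR;
}
/* ... copy out of (mapped_base + offset) ... */
cuIpcCloseMemHandle(mapped_base);   /* unmap when the cached entry is evicted */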
@@ -133,7 +223,7 @@ int mca_rcache_gpusm_register(mca_rcache_base_module_t *rcache, void *addr, size
     gpusm_reg->flags = flags;
     gpusm_reg->access_flags = access_flags;
 
-    rc = cuda_getmemhandle(base, size, gpusm_reg, NULL);
+    rc = mca_rcache_gpusm_get_mem_handle(base, size, gpusm_reg);
 
     if (rc != OPAL_SUCCESS) {
         opal_free_list_return(&rcache_gpusm->reg_list, item);