@@ -126,6 +126,7 @@ static CUstream ipcStream = NULL;
126126static CUstream dtohStream = NULL ;
127127static CUstream htodStream = NULL ;
128128static CUstream memcpyStream = NULL ;
129+ static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000 ) ? 0 : 1 ;
129130static opal_mutex_t common_cuda_init_lock ;
130131static opal_mutex_t common_cuda_htod_lock ;
131132static opal_mutex_t common_cuda_dtoh_lock ;
@@ -300,6 +301,13 @@ void mca_common_cuda_register_mca_variables(void)
300301 MCA_BASE_VAR_SCOPE_READONLY ,
301302 & mca_common_cuda_cumemcpy_timing );
302303#endif /* OPAL_ENABLE_DEBUG */
304+
305+ (void ) mca_base_var_register ("ompi" , "mpi" , "common_cuda" , "gpu_mem_check_workaround" ,
306+ "Set to 0 to disable GPU memory check workaround. A user would rarely have to do this." ,
307+ MCA_BASE_VAR_TYPE_INT , NULL , 0 , 0 ,
308+ OPAL_INFO_LVL_9 ,
309+ MCA_BASE_VAR_SCOPE_READONLY ,
310+ & mca_common_cuda_gpu_mem_check_workaround );
303311}
304312
305313/**
@@ -774,6 +782,9 @@ static int mca_common_cuda_stage_three_init(void)
774782 "CUDA: cuMemHostRegister OK on test region" );
775783 }
776784
785+ opal_output_verbose (20 , mca_common_cuda_output ,
786+ "CUDA: the extra gpu memory check is %s" , (mca_common_cuda_gpu_mem_check_workaround == 1 ) ? "on" :"off" );
787+
777788 opal_output_verbose (30 , mca_common_cuda_output ,
778789 "CUDA: initialized" );
779790 opal_atomic_mb (); /* Make sure next statement does not get reordered */
@@ -1832,7 +1843,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
18321843 * made it this far, then the assumption at this point is we have GPU memory.
18331844 * Unfortunately, this extra call is costing us another 100 ns almost doubling
18341845 * the cost of this entire function. */
1835- {
1846+ if ( OPAL_LIKELY ( mca_common_cuda_gpu_mem_check_workaround )) {
18361847 CUdeviceptr pbase ;
18371848 size_t psize ;
18381849 res = cuFunc .cuMemGetAddressRange (& pbase , & psize , dbuf );
0 commit comments