Skip to content

Commit 4ab272a

Browse files
committed
Merge pull request open-mpi#555 from rolfv/pr/fix-cuda-war-config-2.x
Add config code to check for need of workaround. Add runtime way to turn off just in case.
2 parents 8f8590e + c3534a3 commit 4ab272a

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

opal/mca/common/cuda/common_cuda.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ static CUstream ipcStream = NULL;
126126
static CUstream dtohStream = NULL;
127127
static CUstream htodStream = NULL;
128128
static CUstream memcpyStream = NULL;
129+
/* Non-zero enables the extra GPU memory check workaround. The compile-time
 * default is 1 (on) unless CUDA_VERSION is greater than 7000, in which case
 * it is 0 (off). */
static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION > 7000) ? 0 : 1;
129130
static opal_mutex_t common_cuda_init_lock;
130131
static opal_mutex_t common_cuda_htod_lock;
131132
static opal_mutex_t common_cuda_dtoh_lock;
@@ -300,6 +301,13 @@ void mca_common_cuda_register_mca_variables(void)
300301
MCA_BASE_VAR_SCOPE_READONLY,
301302
&mca_common_cuda_cumemcpy_timing);
302303
#endif /* OPAL_ENABLE_DEBUG */
304+
305+
(void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround",
306+
"Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.",
307+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
308+
OPAL_INFO_LVL_9,
309+
MCA_BASE_VAR_SCOPE_READONLY,
310+
&mca_common_cuda_gpu_mem_check_workaround);
303311
}
304312

305313
/**
@@ -774,6 +782,9 @@ static int mca_common_cuda_stage_three_init(void)
774782
"CUDA: cuMemHostRegister OK on test region");
775783
}
776784

785+
opal_output_verbose(20, mca_common_cuda_output,
786+
"CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off");
787+
777788
opal_output_verbose(30, mca_common_cuda_output,
778789
"CUDA: initialized");
779790
opal_atomic_mb(); /* Make sure next statement does not get reordered */
@@ -1832,7 +1843,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
18321843
* made it this far, then the assumption at this point is we have GPU memory.
18331844
* Unfortunately, this extra call is costing us another 100 ns almost doubling
18341845
* the cost of this entire function. */
1835-
{
1846+
if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) {
18361847
CUdeviceptr pbase;
18371848
size_t psize;
18381849
res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);

0 commit comments

Comments
 (0)