Skip to content

Commit c3534a3

Browse files
author
Rolf vandeVaart
committed
Add config code to check for need of workaround. Add runtime way to turn it off just in case
(cherry picked from commit open-mpi/ompi@54ab0d1)
1 parent 8f8590e commit c3534a3

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

opal/mca/common/cuda/common_cuda.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ static CUstream ipcStream = NULL;
126126
static CUstream dtohStream = NULL;
127127
static CUstream htodStream = NULL;
128128
static CUstream memcpyStream = NULL;
129+
/* Switch for the extra GPU memory check; exposed to users as the
 * "gpu_mem_check_workaround" MCA variable.  The compile-time default is
 * 1 (on) when CUDA_VERSION is 7000 or lower and 0 (off) for anything newer. */
static int mca_common_cuda_gpu_mem_check_workaround = (CUDA_VERSION <= 7000) ? 1 : 0;
129130
static opal_mutex_t common_cuda_init_lock;
130131
static opal_mutex_t common_cuda_htod_lock;
131132
static opal_mutex_t common_cuda_dtoh_lock;
@@ -300,6 +301,13 @@ void mca_common_cuda_register_mca_variables(void)
300301
MCA_BASE_VAR_SCOPE_READONLY,
301302
&mca_common_cuda_cumemcpy_timing);
302303
#endif /* OPAL_ENABLE_DEBUG */
304+
305+
(void) mca_base_var_register("ompi", "mpi", "common_cuda", "gpu_mem_check_workaround",
306+
"Set to 0 to disable GPU memory check workaround. A user would rarely have to do this.",
307+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
308+
OPAL_INFO_LVL_9,
309+
MCA_BASE_VAR_SCOPE_READONLY,
310+
&mca_common_cuda_gpu_mem_check_workaround);
303311
}
304312

305313
/**
@@ -774,6 +782,9 @@ static int mca_common_cuda_stage_three_init(void)
774782
"CUDA: cuMemHostRegister OK on test region");
775783
}
776784

785+
opal_output_verbose(20, mca_common_cuda_output,
786+
"CUDA: the extra gpu memory check is %s", (mca_common_cuda_gpu_mem_check_workaround == 1) ? "on":"off");
787+
777788
opal_output_verbose(30, mca_common_cuda_output,
778789
"CUDA: initialized");
779790
opal_atomic_mb(); /* Make sure next statement does not get reordered */
@@ -1832,7 +1843,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t
18321843
* made it this far, then the assumption at this point is we have GPU memory.
18331844
* Unfortunately, this extra call is costing us another 100 ns almost doubling
18341845
* the cost of this entire function. */
1835-
{
1846+
if (OPAL_LIKELY(mca_common_cuda_gpu_mem_check_workaround)) {
18361847
CUdeviceptr pbase;
18371848
size_t psize;
18381849
res = cuFunc.cuMemGetAddressRange(&pbase, &psize, dbuf);

0 commit comments

Comments
 (0)