Skip to content

Commit 8be2c97

Browse files
author
Rolf vandeVaart
committed
Merge pull request #702 from rolfv/pr/fix-cuda-mca-register
Make explicit call to initialize MCA parameters in common CUDA code.
2 parents c0e0510 + ae0f3cf commit 8be2c97

File tree

4 files changed

+50
-35
lines changed

4 files changed

+50
-35
lines changed

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* reserved.
1717
* Copyright (c) 2006-2007 Voltaire All rights reserved.
1818
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
19-
* Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved.
19+
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
2121
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
2222
* Copyright (c) 2014-2015 Research Organization for Information Science
@@ -200,6 +200,10 @@ static int btl_openib_component_register(void)
200200
return OPAL_ERR_NOT_AVAILABLE;
201201
}
202202

203+
#if OPAL_CUDA_SUPPORT
204+
mca_common_cuda_register_mca_variables();
205+
#endif
206+
203207
return OPAL_SUCCESS;
204208
}
205209

opal/mca/btl/smcuda/btl_smcuda_component.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ static int smcuda_register(void)
195195
if (0 == mca_btl_smcuda.super.btl_cuda_eager_limit) {
196196
mca_btl_smcuda.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number */
197197
}
198+
mca_common_cuda_register_mca_variables();
198199
#endif /* OPAL_CUDA_SUPPORT */
199200
return mca_btl_smcuda_component_verify();
200201
}

opal/mca/common/cuda/common_cuda.c

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ cudaFunctionTable_t cuFunc;
115115
static int stage_one_init_ref_count = 0;
116116
static bool stage_three_init_complete = false;
117117
static bool common_cuda_initialized = false;
118+
static bool common_cuda_mca_parames_registered = false;
118119
static int mca_common_cuda_verbose;
119120
static int mca_common_cuda_output = 0;
120121
bool mca_common_cuda_enabled = false;
@@ -223,42 +224,14 @@ static void cuda_dump_memhandle(int, void *, char *) __opal_attribute_unused__ ;
223224

224225
#endif /* OPAL_CUDA_SUPPORT_41 */
225226

226-
227-
/**
228-
* This is the first stage of initialization. This function is
229-
* called explicitly by any BTLs that can support CUDA-aware.
230-
* It is called during the component open phase of initialization.
231-
* This function will register some mca variables and then open
232-
* and load the symbols needed from the CUDA driver library. Look for
233-
* the SONAME of the library which is libcuda.so.1. In most cases,
234-
* this will result in the library found. However, there are some
235-
* setups that require the extra steps for searching. Any failure
236-
* will result in this initialization failing and status will be set
237-
* showing that.
238-
*/
239-
int mca_common_cuda_stage_one_init(void)
227+
/* This is a separate function so we can see these variables with ompi_info and
228+
* also set them with the tools interface */
229+
void mca_common_cuda_register_mca_variables(void)
240230
{
241-
int retval, i, j;
242-
char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
243-
char *searchpaths[] = {"", "/usr/lib64", NULL};
244-
char **errmsgs = NULL;
245-
char *errmsg = NULL;
246-
int errsize;
247-
bool stage_one_init_passed = false;
248231

249-
stage_one_init_ref_count++;
250-
if (stage_one_init_ref_count > 1) {
251-
opal_output_verbose(10, mca_common_cuda_output,
252-
"CUDA: stage_one_init_ref_count is now %d, no need to init",
253-
stage_one_init_ref_count);
254-
return OPAL_SUCCESS;
232+
if (false == common_cuda_mca_parames_registered) {
233+
common_cuda_mca_parames_registered = true;
255234
}
256-
257-
OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
258-
OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
259-
OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
260-
OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t);
261-
262235
/* Set different levels of verbosity in the cuda related code. */
263236
mca_common_cuda_verbose = 0;
264237
(void) mca_base_var_register("ompi", "mpi", "common_cuda", "verbose",
@@ -327,6 +300,43 @@ int mca_common_cuda_stage_one_init(void)
327300
MCA_BASE_VAR_SCOPE_READONLY,
328301
&mca_common_cuda_cumemcpy_timing);
329302
#endif /* OPAL_ENABLE_DEBUG */
303+
}
304+
305+
/**
306+
* This is the first stage of initialization. This function is called
307+
* explicitly by any BTLs that can support CUDA-aware. It is called during
308+
* the component open phase of initialization. This function will look for
309+
* the SONAME of the library which is libcuda.so.1. In most cases, this will
310+
* result in the library found. However, there are some setups that require
311+
* the extra steps for searching. This function will then load the symbols
312+
* needed from the CUDA driver library. Any failure will result in this
313+
* initialization failing and status will be set showing that.
314+
*/
315+
int mca_common_cuda_stage_one_init(void)
316+
{
317+
int retval, i, j;
318+
char *cudalibs[] = {"libcuda.so.1", "libcuda.dylib", NULL};
319+
char *searchpaths[] = {"", "/usr/lib64", NULL};
320+
char **errmsgs = NULL;
321+
char *errmsg = NULL;
322+
int errsize;
323+
bool stage_one_init_passed = false;
324+
325+
stage_one_init_ref_count++;
326+
if (stage_one_init_ref_count > 1) {
327+
opal_output_verbose(10, mca_common_cuda_output,
328+
"CUDA: stage_one_init_ref_count is now %d, no need to init",
329+
stage_one_init_ref_count);
330+
return OPAL_SUCCESS;
331+
}
332+
333+
/* This is a no-op in most cases as the parameters were registered earlier */
334+
mca_common_cuda_register_mca_variables();
335+
336+
OBJ_CONSTRUCT(&common_cuda_init_lock, opal_mutex_t);
337+
OBJ_CONSTRUCT(&common_cuda_htod_lock, opal_mutex_t);
338+
OBJ_CONSTRUCT(&common_cuda_dtoh_lock, opal_mutex_t);
339+
OBJ_CONSTRUCT(&common_cuda_ipc_lock, opal_mutex_t);
330340

331341
mca_common_cuda_output = opal_output_open(NULL);
332342
opal_output_set_verbosity(mca_common_cuda_output, mca_common_cuda_verbose);

opal/mca/common/cuda/common_cuda.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ struct mca_mpool_common_cuda_reg_t {
4444
typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
4545
extern bool mca_common_cuda_enabled;
4646

47-
OPAL_DECLSPEC int mca_common_cuda_register_mca_variables(void);
47+
OPAL_DECLSPEC void mca_common_cuda_register_mca_variables(void);
4848

4949
OPAL_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg);
5050

0 commit comments

Comments
 (0)