Skip to content

Commit f8a2b7f

Browse files
committed
Use opal_show_help to warn about PSM2_CUDA envvar setting
If Open MPI is configured with CUDA support, the user should also be using a CUDA build of PSM2 and should therefore set the PSM2_CUDA environment variable to 1 when using CUDA buffers for transfers. If we detect that this variable is not set, we force-set it to 1. If the user wants to use this build for regular (host buffer) transfers, we allow the option of setting PSM2_CUDA=0, but print a warning message telling the user that this is not a recommended usage scenario. Signed-off-by: Aravind Gopalakrishnan <[email protected]>
1 parent b11841a commit f8a2b7f

File tree

6 files changed

+51
-28
lines changed

6 files changed

+51
-28
lines changed

ompi/mca/mtl/psm2/help-mtl-psm2.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,7 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
4545
#
4646
[message too big]
4747
Message size %llu bigger than supported by PSM2 API. Max = %llu
48+
#
49+
[no psm2 cuda env]
50+
Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s.
51+
This is not a recommended combination. If the application uses %s.

ompi/mca/mtl/psm2/mtl_psm2.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
100100
char *generated_key;
101101
char env_string[256];
102102
int rc;
103-
#if OPAL_CUDA_SUPPORT
104-
char *cuda_env;
105-
#endif
106103

107104
generated_key = getenv(OPAL_MCA_PREFIX"orte_precondition_transports");
108105
memset(uu, 0, sizeof(psm2_uuid_t));
@@ -178,11 +175,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
178175

179176
#if OPAL_CUDA_SUPPORT
180177
ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
181-
182-
cuda_env = getenv("PSM2_CUDA");
183-
if (!cuda_env || ( strcmp(cuda_env, "0") == 0) )
184-
opal_output(0, "Warning: If running with device buffers, there is a"
185-
" chance the application might fail. Try setting PSM2_CUDA=1.\n");
186178
#endif
187179

188180
return OMPI_SUCCESS;

ompi/mca/mtl/psm2/mtl_psm2_component.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,9 @@ static int
199199
ompi_mtl_psm2_component_register(void)
200200
{
201201
int num_local_procs, num_total_procs;
202+
#if OPAL_CUDA_SUPPORT
203+
char *cuda_env;
204+
#endif
202205

203206
ompi_mtl_psm2.connect_timeout = 180;
204207
(void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
@@ -223,6 +226,30 @@ ompi_mtl_psm2_component_register(void)
223226
param_priority = 40;
224227
}
225228

229+
#if OPAL_CUDA_SUPPORT
230+
/*
231+
* If using CUDA enabled OpenMPI, the user likely intends to
232+
* run with CUDA buffers. So, force-set the envvar here if user failed
233+
* to set it.
234+
*/
235+
cuda_env = getenv("PSM2_CUDA");
236+
if (!cuda_env) {
237+
opal_show_help("help-mtl-psm2.txt",
238+
"no psm2 cuda env", true,
239+
"not set",
240+
"Host buffers,\nthere will be a performance penalty"
241+
" due to OMPI force setting this variable now.\n"
242+
"Set environment variable to 0 if using Host buffers" );
243+
setenv("PSM2_CUDA", "1", 0);
244+
} else if (strcmp(cuda_env, "0") == 0) {
245+
opal_show_help("help-mtl-psm2.txt",
246+
"no psm2 cuda env", true,
247+
"set to 0",
248+
"CUDA buffers,\nthe execution will SEGFAULT."
249+
" Set environment variable to 1 if using CUDA buffers");
250+
}
251+
#endif
252+
226253
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
227254
"priority", "Priority of the PSM2 MTL component",
228255
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,

ompi/mca/pml/cm/pml_cm.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ mca_pml_cm_recv(void *addr,
185185
&(datatype->super),
186186
count,
187187
addr,
188-
flags,
188+
flags,
189189
&convertor );
190190
#else
191191
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
@@ -195,7 +195,7 @@ mca_pml_cm_recv(void *addr,
195195
&(datatype->super),
196196
count,
197197
addr,
198-
flags,
198+
flags,
199199
&convertor );
200200
#endif
201201

ompi/mca/pml/cm/pml_cm_recvreq.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ do { \
9494
datatype, \
9595
addr, \
9696
count, \
97-
flags ) \
97+
flags ) \
9898
do { \
9999
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
100100
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -116,7 +116,7 @@ do { \
116116
&(datatype->super), \
117117
count, \
118118
addr, \
119-
flags, \
119+
flags, \
120120
&(request)->req_base.req_convertor ); \
121121
} while(0)
122122
#else
@@ -127,7 +127,7 @@ do { \
127127
datatype, \
128128
addr, \
129129
count, \
130-
flags ) \
130+
flags ) \
131131
do { \
132132
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
133133
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -144,7 +144,7 @@ do { \
144144
&(datatype->super), \
145145
count, \
146146
addr, \
147-
flags, \
147+
flags, \
148148
&(request)->req_base.req_convertor ); \
149149
} while(0)
150150
#endif
@@ -158,7 +158,7 @@ do { \
158158
datatype, \
159159
addr, \
160160
count, \
161-
flags, \
161+
flags, \
162162
persistent) \
163163
do { \
164164
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@@ -197,7 +197,7 @@ do { \
197197
datatype, \
198198
addr, \
199199
count, \
200-
flags, \
200+
flags, \
201201
persistent) \
202202
do { \
203203
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@@ -219,7 +219,7 @@ do { \
219219
&(datatype->super), \
220220
count, \
221221
addr, \
222-
flags, \
222+
flags, \
223223
&(request)->req_base.req_convertor ); \
224224
} while(0)
225225
#endif

ompi/mca/pml/cm/pml_cm_sendreq.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ do { \
127127
sendmode, \
128128
buf, \
129129
count, \
130-
flags ) \
130+
flags ) \
131131
{ \
132132
OBJ_RETAIN(comm); \
133133
OMPI_DATATYPE_RETAIN(datatype); \
@@ -139,7 +139,7 @@ do { \
139139
&(datatype->super), \
140140
count, \
141141
buf, \
142-
flags, \
142+
flags, \
143143
&(req_send)->req_base.req_convertor ); \
144144
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
145145
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@@ -158,7 +158,7 @@ do { \
158158
sendmode, \
159159
buf, \
160160
count, \
161-
flags ) \
161+
flags ) \
162162
{ \
163163
OBJ_RETAIN(comm); \
164164
OMPI_DATATYPE_RETAIN(datatype); \
@@ -170,7 +170,7 @@ do { \
170170
&(datatype->super), \
171171
count, \
172172
buf, \
173-
flags, \
173+
flags, \
174174
&(req_send)->req_base.req_convertor ); \
175175
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
176176
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@@ -191,7 +191,7 @@ do { \
191191
sendmode, \
192192
buf, \
193193
count, \
194-
flags ) \
194+
flags ) \
195195
{ \
196196
OBJ_RETAIN(comm); \
197197
OMPI_DATATYPE_RETAIN(datatype); \
@@ -203,7 +203,7 @@ do { \
203203
&(datatype->super), \
204204
count, \
205205
buf, \
206-
flags, \
206+
flags, \
207207
&(req_send)->req_base.req_convertor ); \
208208
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
209209
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@@ -223,7 +223,7 @@ do { \
223223
sendmode, \
224224
buf, \
225225
count, \
226-
flags ) \
226+
flags ) \
227227
{ \
228228
OBJ_RETAIN(comm); \
229229
OMPI_DATATYPE_RETAIN(datatype); \
@@ -249,7 +249,7 @@ do { \
249249
&(datatype->super), \
250250
count, \
251251
buf, \
252-
flags, \
252+
flags, \
253253
&(req_send)->req_base.req_convertor ); \
254254
} \
255255
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -273,7 +273,7 @@ do { \
273273
blocking, \
274274
buf, \
275275
count, \
276-
flags ) \
276+
flags ) \
277277
do { \
278278
OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \
279279
persistent); \
@@ -289,7 +289,7 @@ do { \
289289
sendmode, \
290290
buf, \
291291
count, \
292-
flags ) \
292+
flags ) \
293293
opal_convertor_get_packed_size( \
294294
&sendreq->req_send.req_base.req_convertor, \
295295
&sendreq->req_count ); \
@@ -309,7 +309,7 @@ do { \
309309
sendmode, \
310310
buf, \
311311
count, \
312-
flags ) \
312+
flags ) \
313313
do { \
314314
OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \
315315
false); \

0 commit comments

Comments
 (0)