Skip to content

Commit d9b2c94

Browse files
authored
Merge pull request #4286 from aravindksg/master
Use opal_show_help to warn about PSM2_CUDA envvar setting
2 parents 2a2db13 + f8a2b7f commit d9b2c94

File tree

6 files changed

+51
-28
lines changed

6 files changed

+51
-28
lines changed

ompi/mca/mtl/psm2/help-mtl-psm2.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,7 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
4545
#
4646
[message too big]
4747
Message size %llu bigger than supported by PSM2 API. Max = %llu
48+
#
49+
[no psm2 cuda env]
50+
Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s.
51+
This is not a recommended combination. If the application uses %s.

ompi/mca/mtl/psm2/mtl_psm2.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
100100
char *generated_key;
101101
char env_string[256];
102102
int rc;
103-
#if OPAL_CUDA_SUPPORT
104-
char *cuda_env;
105-
#endif
106103

107104
generated_key = getenv(OPAL_MCA_PREFIX"orte_precondition_transports");
108105
memset(uu, 0, sizeof(psm2_uuid_t));
@@ -178,11 +175,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
178175

179176
#if OPAL_CUDA_SUPPORT
180177
ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
181-
182-
cuda_env = getenv("PSM2_CUDA");
183-
if (!cuda_env || ( strcmp(cuda_env, "0") == 0) )
184-
opal_output(0, "Warning: If running with device buffers, there is a"
185-
" chance the application might fail. Try setting PSM2_CUDA=1.\n");
186178
#endif
187179

188180
return OMPI_SUCCESS;

ompi/mca/mtl/psm2/mtl_psm2_component.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,9 @@ static int
199199
ompi_mtl_psm2_component_register(void)
200200
{
201201
int num_local_procs, num_total_procs;
202+
#if OPAL_CUDA_SUPPORT
203+
char *cuda_env;
204+
#endif
202205

203206
ompi_mtl_psm2.connect_timeout = 180;
204207
(void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
@@ -223,6 +226,30 @@ ompi_mtl_psm2_component_register(void)
223226
param_priority = 40;
224227
}
225228

229+
#if OPAL_CUDA_SUPPORT
230+
/*
231+
* If using CUDA-enabled Open MPI, the user likely intends to
232+
* run with CUDA buffers. So, force-set the envvar here if the user failed
233+
* to set it.
234+
*/
235+
cuda_env = getenv("PSM2_CUDA");
236+
if (!cuda_env) {
237+
opal_show_help("help-mtl-psm2.txt",
238+
"no psm2 cuda env", true,
239+
"not set",
240+
"Host buffers,\nthere will be a performance penalty"
241+
" due to OMPI force setting this variable now.\n"
242+
"Set environment variable to 0 if using Host buffers" );
243+
setenv("PSM2_CUDA", "1", 0);
244+
} else if (strcmp(cuda_env, "0") == 0) {
245+
opal_show_help("help-mtl-psm2.txt",
246+
"no psm2 cuda env", true,
247+
"set to 0",
248+
"CUDA buffers,\nthe execution will SEGFAULT."
249+
" Set environment variable to 1 if using CUDA buffers");
250+
}
251+
#endif
252+
226253
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
227254
"priority", "Priority of the PSM2 MTL component",
228255
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,

ompi/mca/pml/cm/pml_cm.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ mca_pml_cm_recv(void *addr,
185185
&(datatype->super),
186186
count,
187187
addr,
188-
flags,
188+
flags,
189189
&convertor );
190190
#else
191191
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
@@ -195,7 +195,7 @@ mca_pml_cm_recv(void *addr,
195195
&(datatype->super),
196196
count,
197197
addr,
198-
flags,
198+
flags,
199199
&convertor );
200200
#endif
201201

ompi/mca/pml/cm/pml_cm_recvreq.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ do { \
9494
datatype, \
9595
addr, \
9696
count, \
97-
flags ) \
97+
flags ) \
9898
do { \
9999
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
100100
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -116,7 +116,7 @@ do { \
116116
&(datatype->super), \
117117
count, \
118118
addr, \
119-
flags, \
119+
flags, \
120120
&(request)->req_base.req_convertor ); \
121121
} while(0)
122122
#else
@@ -127,7 +127,7 @@ do { \
127127
datatype, \
128128
addr, \
129129
count, \
130-
flags ) \
130+
flags ) \
131131
do { \
132132
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
133133
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -144,7 +144,7 @@ do { \
144144
&(datatype->super), \
145145
count, \
146146
addr, \
147-
flags, \
147+
flags, \
148148
&(request)->req_base.req_convertor ); \
149149
} while(0)
150150
#endif
@@ -158,7 +158,7 @@ do { \
158158
datatype, \
159159
addr, \
160160
count, \
161-
flags, \
161+
flags, \
162162
persistent) \
163163
do { \
164164
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@@ -197,7 +197,7 @@ do { \
197197
datatype, \
198198
addr, \
199199
count, \
200-
flags, \
200+
flags, \
201201
persistent) \
202202
do { \
203203
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@@ -219,7 +219,7 @@ do { \
219219
&(datatype->super), \
220220
count, \
221221
addr, \
222-
flags, \
222+
flags, \
223223
&(request)->req_base.req_convertor ); \
224224
} while(0)
225225
#endif

ompi/mca/pml/cm/pml_cm_sendreq.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ do { \
127127
sendmode, \
128128
buf, \
129129
count, \
130-
flags ) \
130+
flags ) \
131131
{ \
132132
OBJ_RETAIN(comm); \
133133
OMPI_DATATYPE_RETAIN(datatype); \
@@ -139,7 +139,7 @@ do { \
139139
&(datatype->super), \
140140
count, \
141141
buf, \
142-
flags, \
142+
flags, \
143143
&(req_send)->req_base.req_convertor ); \
144144
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
145145
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@@ -158,7 +158,7 @@ do { \
158158
sendmode, \
159159
buf, \
160160
count, \
161-
flags ) \
161+
flags ) \
162162
{ \
163163
OBJ_RETAIN(comm); \
164164
OMPI_DATATYPE_RETAIN(datatype); \
@@ -170,7 +170,7 @@ do { \
170170
&(datatype->super), \
171171
count, \
172172
buf, \
173-
flags, \
173+
flags, \
174174
&(req_send)->req_base.req_convertor ); \
175175
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
176176
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@@ -191,7 +191,7 @@ do { \
191191
sendmode, \
192192
buf, \
193193
count, \
194-
flags ) \
194+
flags ) \
195195
{ \
196196
OBJ_RETAIN(comm); \
197197
OMPI_DATATYPE_RETAIN(datatype); \
@@ -203,7 +203,7 @@ do { \
203203
&(datatype->super), \
204204
count, \
205205
buf, \
206-
flags, \
206+
flags, \
207207
&(req_send)->req_base.req_convertor ); \
208208
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
209209
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@@ -223,7 +223,7 @@ do { \
223223
sendmode, \
224224
buf, \
225225
count, \
226-
flags ) \
226+
flags ) \
227227
{ \
228228
OBJ_RETAIN(comm); \
229229
OMPI_DATATYPE_RETAIN(datatype); \
@@ -249,7 +249,7 @@ do { \
249249
&(datatype->super), \
250250
count, \
251251
buf, \
252-
flags, \
252+
flags, \
253253
&(req_send)->req_base.req_convertor ); \
254254
} \
255255
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
@@ -273,7 +273,7 @@ do { \
273273
blocking, \
274274
buf, \
275275
count, \
276-
flags ) \
276+
flags ) \
277277
do { \
278278
OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \
279279
persistent); \
@@ -289,7 +289,7 @@ do { \
289289
sendmode, \
290290
buf, \
291291
count, \
292-
flags ) \
292+
flags ) \
293293
opal_convertor_get_packed_size( \
294294
&sendreq->req_send.req_base.req_convertor, \
295295
&sendreq->req_count ); \
@@ -309,7 +309,7 @@ do { \
309309
sendmode, \
310310
buf, \
311311
count, \
312-
flags ) \
312+
flags ) \
313313
do { \
314314
OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \
315315
false); \

0 commit comments

Comments
 (0)