Skip to content

Commit 75684dc

Browse files
author
Ralph Castain
committed
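Resolve a race condition when setting the working directory while fork/exec'ing application procs. The working directory is a process-wide attribute, so the change of directory must be done in the child after the fork occurs, since we want to use multiple threads in the odls. Otherwise, the different launch threads each change the working directory of the entire process, bouncing it around underneath one another.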
Signed-off-by: Ralph Castain <[email protected]>
1 parent 20bf0dd commit 75684dc

File tree

5 files changed

+186
-155
lines changed

5 files changed

+186
-155
lines changed

orte/mca/odls/alps/odls_alps_module.c

Lines changed: 21 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -342,11 +342,7 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
342342
return ORTE_SUCCESS;
343343
}
344344

345-
static int do_child( orte_proc_t *child,
346-
char *app, char **argv,
347-
char **environ_copy,
348-
orte_job_t *jobdat, int write_fd,
349-
orte_iof_base_io_conf_t opts)
345+
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
350346
{
351347
int i, rc;
352348
sigset_t sigs;
@@ -355,7 +351,7 @@ static int do_child( orte_proc_t *child,
355351
/* Setup the pipe to be close-on-exec */
356352
opal_fd_set_cloexec(write_fd);
357353

358-
if (NULL != child) {
354+
if (NULL != cd->child) {
359355
/* setup stdout/stderr so that any error messages that we
360356
may print out will get displayed back at orterun.
361357
@@ -369,20 +365,19 @@ static int do_child( orte_proc_t *child,
369365
always outputs a nice, single message indicating what
370366
happened
371367
*/
372-
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
373-
&environ_copy))) {
368+
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
374369
ORTE_ERROR_LOG(i);
375370
send_error_show_help(write_fd, 1,
376371
"help-orte-odls-alps.txt",
377372
"iof setup failed",
378-
orte_process_info.nodename, app);
373+
orte_process_info.nodename, cd->app->app);
379374
/* Does not return */
380375
}
381376

382377
/* now set any child-level controls such as binding */
383-
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
378+
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
384379

385-
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
380+
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
386381
/* tie stdin/out/err/internal to /dev/null */
387382
int fdnull;
388383
for (i=0; i < 3; i++) {
@@ -393,24 +388,24 @@ static int do_child( orte_proc_t *child,
393388
close(fdnull);
394389
}
395390
fdnull = open("/dev/null", O_RDONLY, 0);
396-
if (fdnull > opts.p_internal[1]) {
397-
dup2(fdnull, opts.p_internal[1]);
391+
if (fdnull > cd->opts.p_internal[1]) {
392+
dup2(fdnull, cd->opts.p_internal[1]);
398393
}
399394
close(fdnull);
400395
}
401396

402-
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
397+
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
403398
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
404399
"close fds",
405-
orte_process_info.nodename, app,
400+
orte_process_info.nodename, cd->app->app,
406401
__FILE__, __LINE__);
407402
}
408403

409404

410-
if (argv == NULL) {
411-
argv = malloc(sizeof(char*)*2);
412-
argv[0] = strdup(app);
413-
argv[1] = NULL;
405+
if (cd->argv == NULL) {
406+
cd->argv = malloc(sizeof(char*)*2);
407+
cd->argv[0] = strdup(cd->app->app);
408+
cd->argv[1] = NULL;
414409
}
415410

416411
/* Set signal handlers back to the default. Do this close to
@@ -437,19 +432,19 @@ static int do_child( orte_proc_t *child,
437432

438433
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
439434
int jout;
440-
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
441-
for (jout=0; NULL != argv[jout]; jout++) {
442-
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
435+
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
436+
for (jout=0; NULL != cd->argv[jout]; jout++) {
437+
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
443438
}
444-
for (jout=0; NULL != environ_copy[jout]; jout++) {
445-
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
439+
for (jout=0; NULL != cd->env[jout]; jout++) {
440+
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
446441
}
447442
}
448443

449-
execve(app, argv, environ_copy);
444+
execve(cd->app->app, cd->argv, cd->env);
450445
send_error_show_help(write_fd, 1,
451446
"help-orte-odls-alps.txt", "execve error",
452-
orte_process_info.nodename, app, strerror(errno));
447+
orte_process_info.nodename, cd->app->app, strerror(errno));
453448
/* Does not return */
454449
}
455450

@@ -729,4 +724,3 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child)
729724
}
730725
return rc;
731726
}
732-

0 commit comments

Comments
 (0)