Skip to content

Commit 10d401b

Browse files
author
Ralph Castain
authored
Merge pull request #3217 from rhc54/topic/wdirs
Resolve a race condition for setting our working directory when fork/exec'ing application procs.
2 parents 09a7b0f + 74fd2c3 commit 10d401b

File tree

5 files changed

+227
-209
lines changed

5 files changed

+227
-209
lines changed

orte/mca/odls/alps/odls_alps_module.c

Lines changed: 62 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -144,11 +144,7 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
144144
static void send_error_show_help(int fd, int exit_status,
145145
const char *file, const char *topic, ...)
146146
__opal_attribute_noreturn__;
147-
static int do_child(orte_proc_t *child,
148-
char *app, char **argv,
149-
char **environ_copy,
150-
orte_job_t *jobdat, int write_fd,
151-
orte_iof_base_io_conf_t opts)
147+
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
152148
__opal_attribute_noreturn__;
153149

154150

@@ -342,20 +338,15 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
342338
return ORTE_SUCCESS;
343339
}
344340

345-
static int do_child( orte_proc_t *child,
346-
char *app, char **argv,
347-
char **environ_copy,
348-
orte_job_t *jobdat, int write_fd,
349-
orte_iof_base_io_conf_t opts)
341+
static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
350342
{
351-
int i, rc;
343+
int i;
352344
sigset_t sigs;
353-
char *param, *msg;
354345

355346
/* Setup the pipe to be close-on-exec */
356347
opal_fd_set_cloexec(write_fd);
357348

358-
if (NULL != child) {
349+
if (NULL != cd->child) {
359350
/* setup stdout/stderr so that any error messages that we
360351
may print out will get displayed back at orterun.
361352
@@ -369,20 +360,19 @@ static int do_child( orte_proc_t *child,
369360
always outputs a nice, single message indicating what
370361
happened
371362
*/
372-
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts,
373-
&environ_copy))) {
363+
if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
374364
ORTE_ERROR_LOG(i);
375365
send_error_show_help(write_fd, 1,
376366
"help-orte-odls-alps.txt",
377367
"iof setup failed",
378-
orte_process_info.nodename, app);
368+
orte_process_info.nodename, cd->app->app);
379369
/* Does not return */
380370
}
381371

382372
/* now set any child-level controls such as binding */
383-
orte_rtc.set(jobdat, child, &environ_copy, write_fd);
373+
orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
384374

385-
} else if (!ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
375+
} else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
386376
/* tie stdin/out/err/internal to /dev/null */
387377
int fdnull;
388378
for (i=0; i < 3; i++) {
@@ -393,24 +383,24 @@ static int do_child( orte_proc_t *child,
393383
close(fdnull);
394384
}
395385
fdnull = open("/dev/null", O_RDONLY, 0);
396-
if (fdnull > opts.p_internal[1]) {
397-
dup2(fdnull, opts.p_internal[1]);
386+
if (fdnull > cd->opts.p_internal[1]) {
387+
dup2(fdnull, cd->opts.p_internal[1]);
398388
}
399389
close(fdnull);
400390
}
401391

402-
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, opts)) {
392+
if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
403393
send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
404394
"close fds",
405-
orte_process_info.nodename, app,
395+
orte_process_info.nodename, cd->app->app,
406396
__FILE__, __LINE__);
407397
}
408398

409399

410-
if (argv == NULL) {
411-
argv = malloc(sizeof(char*)*2);
412-
argv[0] = strdup(app);
413-
argv[1] = NULL;
400+
if (cd->argv == NULL) {
401+
cd->argv = malloc(sizeof(char*)*2);
402+
cd->argv[0] = strdup(cd->app->app);
403+
cd->argv[1] = NULL;
414404
}
415405

416406
/* Set signal handlers back to the default. Do this close to
@@ -437,37 +427,33 @@ static int do_child( orte_proc_t *child,
437427

438428
if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
439429
int jout;
440-
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app);
441-
for (jout=0; NULL != argv[jout]; jout++) {
442-
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, argv[jout]);
430+
opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
431+
for (jout=0; NULL != cd->argv[jout]; jout++) {
432+
opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
443433
}
444-
for (jout=0; NULL != environ_copy[jout]; jout++) {
445-
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]);
434+
for (jout=0; NULL != cd->env[jout]; jout++) {
435+
opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
446436
}
447437
}
448438

449-
execve(app, argv, environ_copy);
439+
execve(cd->app->app, cd->argv, cd->env);
450440
send_error_show_help(write_fd, 1,
451441
"help-orte-odls-alps.txt", "execve error",
452-
orte_process_info.nodename, app, strerror(errno));
442+
orte_process_info.nodename, cd->app->app, strerror(errno));
453443
/* Does not return */
454444
}
455445

456446

457-
static int do_parent(orte_proc_t *child,
458-
char *app, char **argv,
459-
char **environ_copy,
460-
orte_job_t *jobdat, int read_fd,
461-
orte_iof_base_io_conf_t opts)
447+
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
462448
{
463449
int rc;
464450
orte_odls_pipe_err_msg_t msg;
465451
char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
466452

467-
close(opts.p_stdin[0]);
468-
close(opts.p_stdout[1]);
469-
close(opts.p_stderr[1]);
470-
close(opts.p_internal[1]);
453+
close(cd->opts.p_stdin[0]);
454+
close(cd->opts.p_stdout[1]);
455+
close(cd->opts.p_stderr[1]);
456+
close(cd->opts.p_internal[1]);
471457

472458
/* Block reading a message from the pipe */
473459
while (1) {
@@ -483,18 +469,18 @@ static int do_parent(orte_proc_t *child,
483469
ORTE_ERROR_LOG(rc);
484470
close(read_fd);
485471

486-
if (NULL != child) {
487-
child->state = ORTE_PROC_STATE_UNDEF;
472+
if (NULL != cd->child) {
473+
cd->child->state = ORTE_PROC_STATE_UNDEF;
488474
}
489475
return rc;
490476
}
491477

492478
/* Otherwise, we got a warning or error message from the child */
493-
if (NULL != child) {
479+
if (NULL != cd->child) {
494480
if (msg.fatal) {
495-
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
481+
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
496482
} else {
497-
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
483+
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
498484
}
499485
}
500486

@@ -504,10 +490,10 @@ static int do_parent(orte_proc_t *child,
504490
if (OPAL_SUCCESS != rc) {
505491
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
506492
true,
507-
orte_process_info.nodename, app,
493+
orte_process_info.nodename, cd->app,
508494
"opal_fd_read", __FILE__, __LINE__);
509-
if (NULL != child) {
510-
child->state = ORTE_PROC_STATE_UNDEF;
495+
if (NULL != cd->child) {
496+
cd->child->state = ORTE_PROC_STATE_UNDEF;
511497
}
512498
return rc;
513499
}
@@ -518,10 +504,10 @@ static int do_parent(orte_proc_t *child,
518504
if (OPAL_SUCCESS != rc) {
519505
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
520506
true,
521-
orte_process_info.nodename, app,
507+
orte_process_info.nodename, cd->app,
522508
"opal_fd_read", __FILE__, __LINE__);
523-
if (NULL != child) {
524-
child->state = ORTE_PROC_STATE_UNDEF;
509+
if (NULL != cd->child) {
510+
cd->child->state = ORTE_PROC_STATE_UNDEF;
525511
}
526512
return rc;
527513
}
@@ -532,10 +518,10 @@ static int do_parent(orte_proc_t *child,
532518
if (NULL == str) {
533519
orte_show_help("help-orte-odls-alps.txt", "syscall fail",
534520
true,
535-
orte_process_info.nodename, app,
521+
orte_process_info.nodename, cd->app,
536522
"opal_fd_read", __FILE__, __LINE__);
537-
if (NULL != child) {
538-
child->state = ORTE_PROC_STATE_UNDEF;
523+
if (NULL != cd->child) {
524+
cd->child->state = ORTE_PROC_STATE_UNDEF;
539525
}
540526
return rc;
541527
}
@@ -556,9 +542,9 @@ static int do_parent(orte_proc_t *child,
556542
closed, indicating that the child launched
557543
successfully). */
558544
if (msg.fatal) {
559-
if (NULL != child) {
560-
child->state = ORTE_PROC_STATE_FAILED_TO_START;
561-
ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
545+
if (NULL != cd->child) {
546+
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
547+
ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
562548
}
563549
close(read_fd);
564550
return ORTE_ERR_FAILED_TO_START;
@@ -568,9 +554,9 @@ static int do_parent(orte_proc_t *child,
568554
/* If we got here, it means that the pipe closed without
569555
indication of a fatal error, meaning that the child process
570556
launched successfully. */
571-
if (NULL != child) {
572-
child->state = ORTE_PROC_STATE_RUNNING;
573-
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
557+
if (NULL != cd->child) {
558+
cd->child->state = ORTE_PROC_STATE_RUNNING;
559+
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
574560
}
575561
close(read_fd);
576562

@@ -581,14 +567,10 @@ static int do_parent(orte_proc_t *child,
581567
/**
582568
* Fork/exec the specified processes
583569
*/
584-
static int odls_alps_fork_local_proc(orte_proc_t *child,
585-
char *app,
586-
char **argv,
587-
char **environ_copy,
588-
orte_job_t *jobdat,
589-
orte_iof_base_io_conf_t opts)
570+
static int odls_alps_fork_local_proc(void *cdptr)
590571
{
591-
int rc, p[2];
572+
orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
573+
int p[2];
592574
pid_t pid;
593575

594576
/* A pipe is used to communicate between the parent and child to
@@ -601,24 +583,24 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
601583
the pipe, then the child was letting us know why it failed. */
602584
if (pipe(p) < 0) {
603585
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
604-
if (NULL != child) {
605-
child->state = ORTE_PROC_STATE_FAILED_TO_START;
606-
child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
586+
if (NULL != cd->child) {
587+
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
588+
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
607589
}
608590
return ORTE_ERR_SYS_LIMITS_PIPES;
609591
}
610592

611593
/* Fork off the child */
612594
pid = fork();
613-
if (NULL != child) {
614-
child->pid = pid;
595+
if (NULL != cd->child) {
596+
cd->child->pid = pid;
615597
}
616598

617599
if (pid < 0) {
618600
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
619-
if (NULL != child) {
620-
child->state = ORTE_PROC_STATE_FAILED_TO_START;
621-
child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
601+
if (NULL != cd->child) {
602+
cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
603+
cd->child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
622604
}
623605
return ORTE_ERR_SYS_LIMITS_CHILDREN;
624606
}
@@ -628,12 +610,12 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
628610
#if HAVE_SETPGID
629611
setpgid(0, 0);
630612
#endif
631-
do_child(child, app, argv, environ_copy, jobdat, p[1], opts);
613+
do_child(cd, p[1]);
632614
/* Does not return */
633615
}
634616

635617
close(p[1]);
636-
return do_parent(child, app, argv, environ_copy, jobdat, p[0], opts);
618+
return do_parent(cd, p[0]);
637619
}
638620

639621

@@ -643,8 +625,8 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
643625

644626
int orte_odls_alps_launch_local_procs(opal_buffer_t *data)
645627
{
646-
int rc;
647628
orte_jobid_t job;
629+
int rc;
648630

649631
/* construct the list of children we are to launch */
650632
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
@@ -729,4 +711,3 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child)
729711
}
730712
return rc;
731713
}
732-

0 commit comments

Comments
 (0)