@@ -144,11 +144,7 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child);
144144static void send_error_show_help (int fd , int exit_status ,
145145 const char * file , const char * topic , ...)
146146 __opal_attribute_noreturn__ ;
147- static int do_child (orte_proc_t * child ,
148- char * app , char * * argv ,
149- char * * environ_copy ,
150- orte_job_t * jobdat , int write_fd ,
151- orte_iof_base_io_conf_t opts )
147+ static int do_child (orte_odls_spawn_caddy_t * cd , int write_fd )
152148 __opal_attribute_noreturn__ ;
153149
154150
@@ -342,20 +338,15 @@ static int close_open_file_descriptors(int write_fd, orte_iof_base_io_conf_t opt
342338 return ORTE_SUCCESS ;
343339}
344340
345- static int do_child ( orte_proc_t * child ,
346- char * app , char * * argv ,
347- char * * environ_copy ,
348- orte_job_t * jobdat , int write_fd ,
349- orte_iof_base_io_conf_t opts )
341+ static int do_child (orte_odls_spawn_caddy_t * cd , int write_fd )
350342{
351- int i , rc ;
343+ int i ;
352344 sigset_t sigs ;
353- char * param , * msg ;
354345
355346 /* Setup the pipe to be close-on-exec */
356347 opal_fd_set_cloexec (write_fd );
357348
358- if (NULL != child ) {
349+ if (NULL != cd -> child ) {
359350 /* setup stdout/stderr so that any error messages that we
360351 may print out will get displayed back at orterun.
361352
@@ -369,20 +360,19 @@ static int do_child( orte_proc_t *child,
369360 always outputs a nice, single message indicating what
370361 happened
371362 */
372- if (ORTE_SUCCESS != (i = orte_iof_base_setup_child (& opts ,
373- & environ_copy ))) {
363+ if (ORTE_SUCCESS != (i = orte_iof_base_setup_child (& cd -> opts , & cd -> env ))) {
374364 ORTE_ERROR_LOG (i );
375365 send_error_show_help (write_fd , 1 ,
376366 "help-orte-odls-alps.txt" ,
377367 "iof setup failed" ,
378- orte_process_info .nodename , app );
368+ orte_process_info .nodename , cd -> app -> app );
379369 /* Does not return */
380370 }
381371
382372 /* now set any child-level controls such as binding */
383- orte_rtc .set (jobdat , child , & environ_copy , write_fd );
373+ orte_rtc .set (cd -> jdata , cd -> child , & cd -> env , write_fd );
384374
385- } else if (!ORTE_FLAG_TEST (jobdat , ORTE_JOB_FLAG_FORWARD_OUTPUT )) {
375+ } else if (!ORTE_FLAG_TEST (cd -> jdata , ORTE_JOB_FLAG_FORWARD_OUTPUT )) {
386376 /* tie stdin/out/err/internal to /dev/null */
387377 int fdnull ;
388378 for (i = 0 ; i < 3 ; i ++ ) {
@@ -393,24 +383,24 @@ static int do_child( orte_proc_t *child,
393383 close (fdnull );
394384 }
395385 fdnull = open ("/dev/null" , O_RDONLY , 0 );
396- if (fdnull > opts .p_internal [1 ]) {
397- dup2 (fdnull , opts .p_internal [1 ]);
386+ if (fdnull > cd -> opts .p_internal [1 ]) {
387+ dup2 (fdnull , cd -> opts .p_internal [1 ]);
398388 }
399389 close (fdnull );
400390 }
401391
402- if (ORTE_SUCCESS != close_open_file_descriptors (write_fd , opts )) {
392+ if (ORTE_SUCCESS != close_open_file_descriptors (write_fd , cd -> opts )) {
403393 send_error_show_help (write_fd , 1 , "help-orte-odls-alps.txt" ,
404394 "close fds" ,
405- orte_process_info .nodename , app ,
395+ orte_process_info .nodename , cd -> app -> app ,
406396 __FILE__ , __LINE__ );
407397 }
408398
409399
410- if (argv == NULL ) {
411- argv = malloc (sizeof (char * )* 2 );
412- argv [0 ] = strdup (app );
413- argv [1 ] = NULL ;
400+ if (cd -> argv == NULL ) {
401+ cd -> argv = malloc (sizeof (char * )* 2 );
402+ cd -> argv [0 ] = strdup (cd -> app -> app );
403+ cd -> argv [1 ] = NULL ;
414404 }
415405
416406 /* Set signal handlers back to the default. Do this close to
@@ -437,37 +427,33 @@ static int do_child( orte_proc_t *child,
437427
438428 if (10 < opal_output_get_verbosity (orte_odls_base_framework .framework_output )) {
439429 int jout ;
440- opal_output (0 , "%s STARTING %s" , ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), app );
441- for (jout = 0 ; NULL != argv [jout ]; jout ++ ) {
442- opal_output (0 , "%s\tARGV[%d]: %s" , ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), jout , argv [jout ]);
430+ opal_output (0 , "%s STARTING %s" , ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), cd -> app -> app );
431+ for (jout = 0 ; NULL != cd -> argv [jout ]; jout ++ ) {
432+ opal_output (0 , "%s\tARGV[%d]: %s" , ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), jout , cd -> argv [jout ]);
443433 }
444- for (jout = 0 ; NULL != environ_copy [jout ]; jout ++ ) {
445- opal_output (0 , "%s\tENVIRON[%d]: %s" , ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), jout , environ_copy [jout ]);
434+ for (jout = 0 ; NULL != cd -> env [jout ]; jout ++ ) {
435+ opal_output (0 , "%s\tENVIRON[%d]: %s" , ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), jout , cd -> env [jout ]);
446436 }
447437 }
448438
449- execve (app , argv , environ_copy );
439+ execve (cd -> app -> app , cd -> argv , cd -> env );
450440 send_error_show_help (write_fd , 1 ,
451441 "help-orte-odls-alps.txt" , "execve error" ,
452- orte_process_info .nodename , app , strerror (errno ));
442+ orte_process_info .nodename , cd -> app -> app , strerror (errno ));
453443 /* Does not return */
454444}
455445
456446
457- static int do_parent (orte_proc_t * child ,
458- char * app , char * * argv ,
459- char * * environ_copy ,
460- orte_job_t * jobdat , int read_fd ,
461- orte_iof_base_io_conf_t opts )
447+ static int do_parent (orte_odls_spawn_caddy_t * cd , int read_fd )
462448{
463449 int rc ;
464450 orte_odls_pipe_err_msg_t msg ;
465451 char file [ORTE_ODLS_MAX_FILE_LEN + 1 ], topic [ORTE_ODLS_MAX_TOPIC_LEN + 1 ], * str = NULL ;
466452
467- close (opts .p_stdin [0 ]);
468- close (opts .p_stdout [1 ]);
469- close (opts .p_stderr [1 ]);
470- close (opts .p_internal [1 ]);
453+ close (cd -> opts .p_stdin [0 ]);
454+ close (cd -> opts .p_stdout [1 ]);
455+ close (cd -> opts .p_stderr [1 ]);
456+ close (cd -> opts .p_internal [1 ]);
471457
472458 /* Block reading a message from the pipe */
473459 while (1 ) {
@@ -483,18 +469,18 @@ static int do_parent(orte_proc_t *child,
483469 ORTE_ERROR_LOG (rc );
484470 close (read_fd );
485471
486- if (NULL != child ) {
487- child -> state = ORTE_PROC_STATE_UNDEF ;
472+ if (NULL != cd -> child ) {
473+ cd -> child -> state = ORTE_PROC_STATE_UNDEF ;
488474 }
489475 return rc ;
490476 }
491477
492478 /* Otherwise, we got a warning or error message from the child */
493- if (NULL != child ) {
479+ if (NULL != cd -> child ) {
494480 if (msg .fatal ) {
495- ORTE_FLAG_UNSET (child , ORTE_PROC_FLAG_ALIVE );
481+ ORTE_FLAG_UNSET (cd -> child , ORTE_PROC_FLAG_ALIVE );
496482 } else {
497- ORTE_FLAG_SET (child , ORTE_PROC_FLAG_ALIVE );
483+ ORTE_FLAG_SET (cd -> child , ORTE_PROC_FLAG_ALIVE );
498484 }
499485 }
500486
@@ -504,10 +490,10 @@ static int do_parent(orte_proc_t *child,
504490 if (OPAL_SUCCESS != rc ) {
505491 orte_show_help ("help-orte-odls-alps.txt" , "syscall fail" ,
506492 true,
507- orte_process_info .nodename , app ,
493+ orte_process_info .nodename , cd -> app ,
508494 "opal_fd_read" , __FILE__ , __LINE__ );
509- if (NULL != child ) {
510- child -> state = ORTE_PROC_STATE_UNDEF ;
495+ if (NULL != cd -> child ) {
496+ cd -> child -> state = ORTE_PROC_STATE_UNDEF ;
511497 }
512498 return rc ;
513499 }
@@ -518,10 +504,10 @@ static int do_parent(orte_proc_t *child,
518504 if (OPAL_SUCCESS != rc ) {
519505 orte_show_help ("help-orte-odls-alps.txt" , "syscall fail" ,
520506 true,
521- orte_process_info .nodename , app ,
507+ orte_process_info .nodename , cd -> app ,
522508 "opal_fd_read" , __FILE__ , __LINE__ );
523- if (NULL != child ) {
524- child -> state = ORTE_PROC_STATE_UNDEF ;
509+ if (NULL != cd -> child ) {
510+ cd -> child -> state = ORTE_PROC_STATE_UNDEF ;
525511 }
526512 return rc ;
527513 }
@@ -532,10 +518,10 @@ static int do_parent(orte_proc_t *child,
532518 if (NULL == str ) {
533519 orte_show_help ("help-orte-odls-alps.txt" , "syscall fail" ,
534520 true,
535- orte_process_info .nodename , app ,
521+ orte_process_info .nodename , cd -> app ,
536522 "opal_fd_read" , __FILE__ , __LINE__ );
537- if (NULL != child ) {
538- child -> state = ORTE_PROC_STATE_UNDEF ;
523+ if (NULL != cd -> child ) {
524+ cd -> child -> state = ORTE_PROC_STATE_UNDEF ;
539525 }
540526 return rc ;
541527 }
@@ -556,9 +542,9 @@ static int do_parent(orte_proc_t *child,
556542 closed, indicating that the child launched
557543 successfully). */
558544 if (msg .fatal ) {
559- if (NULL != child ) {
560- child -> state = ORTE_PROC_STATE_FAILED_TO_START ;
561- ORTE_FLAG_UNSET (child , ORTE_PROC_FLAG_ALIVE );
545+ if (NULL != cd -> child ) {
546+ cd -> child -> state = ORTE_PROC_STATE_FAILED_TO_START ;
547+ ORTE_FLAG_UNSET (cd -> child , ORTE_PROC_FLAG_ALIVE );
562548 }
563549 close (read_fd );
564550 return ORTE_ERR_FAILED_TO_START ;
@@ -568,9 +554,9 @@ static int do_parent(orte_proc_t *child,
568554 /* If we got here, it means that the pipe closed without
569555 indication of a fatal error, meaning that the child process
570556 launched successfully. */
571- if (NULL != child ) {
572- child -> state = ORTE_PROC_STATE_RUNNING ;
573- ORTE_FLAG_SET (child , ORTE_PROC_FLAG_ALIVE );
557+ if (NULL != cd -> child ) {
558+ cd -> child -> state = ORTE_PROC_STATE_RUNNING ;
559+ ORTE_FLAG_SET (cd -> child , ORTE_PROC_FLAG_ALIVE );
574560 }
575561 close (read_fd );
576562
@@ -581,14 +567,10 @@ static int do_parent(orte_proc_t *child,
581567/**
582568 * Fork/exec the specified processes
583569 */
584- static int odls_alps_fork_local_proc (orte_proc_t * child ,
585- char * app ,
586- char * * argv ,
587- char * * environ_copy ,
588- orte_job_t * jobdat ,
589- orte_iof_base_io_conf_t opts )
570+ static int odls_alps_fork_local_proc (void * cdptr )
590571{
591- int rc , p [2 ];
572+ orte_odls_spawn_caddy_t * cd = (orte_odls_spawn_caddy_t * )cdptr ;
573+ int p [2 ];
592574 pid_t pid ;
593575
594576 /* A pipe is used to communicate between the parent and child to
@@ -601,24 +583,24 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
601583 the pipe, then the child was letting us know why it failed. */
602584 if (pipe (p ) < 0 ) {
603585 ORTE_ERROR_LOG (ORTE_ERR_SYS_LIMITS_PIPES );
604- if (NULL != child ) {
605- child -> state = ORTE_PROC_STATE_FAILED_TO_START ;
606- child -> exit_code = ORTE_ERR_SYS_LIMITS_PIPES ;
586+ if (NULL != cd -> child ) {
587+ cd -> child -> state = ORTE_PROC_STATE_FAILED_TO_START ;
588+ cd -> child -> exit_code = ORTE_ERR_SYS_LIMITS_PIPES ;
607589 }
608590 return ORTE_ERR_SYS_LIMITS_PIPES ;
609591 }
610592
611593 /* Fork off the child */
612594 pid = fork ();
613- if (NULL != child ) {
614- child -> pid = pid ;
595+ if (NULL != cd -> child ) {
596+ cd -> child -> pid = pid ;
615597 }
616598
617599 if (pid < 0 ) {
618600 ORTE_ERROR_LOG (ORTE_ERR_SYS_LIMITS_CHILDREN );
619- if (NULL != child ) {
620- child -> state = ORTE_PROC_STATE_FAILED_TO_START ;
621- child -> exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN ;
601+ if (NULL != cd -> child ) {
602+ cd -> child -> state = ORTE_PROC_STATE_FAILED_TO_START ;
603+ cd -> child -> exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN ;
622604 }
623605 return ORTE_ERR_SYS_LIMITS_CHILDREN ;
624606 }
@@ -628,12 +610,12 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
628610#if HAVE_SETPGID
629611 setpgid (0 , 0 );
630612#endif
631- do_child (child , app , argv , environ_copy , jobdat , p [1 ], opts );
613+ do_child (cd , p [1 ]);
632614 /* Does not return */
633615 }
634616
635617 close (p [1 ]);
636- return do_parent (child , app , argv , environ_copy , jobdat , p [0 ], opts );
618+ return do_parent (cd , p [0 ]);
637619}
638620
639621
@@ -643,8 +625,8 @@ static int odls_alps_fork_local_proc(orte_proc_t *child,
643625
644626int orte_odls_alps_launch_local_procs (opal_buffer_t * data )
645627{
646- int rc ;
647628 orte_jobid_t job ;
629+ int rc ;
648630
649631 /* construct the list of children we are to launch */
650632 if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list (data , & job ))) {
@@ -729,4 +711,3 @@ static int orte_odls_alps_restart_proc(orte_proc_t *child)
729711 }
730712 return rc ;
731713}
732-
0 commit comments