Skip to content

Commit d72c1c7

Browse files
author
Ralph Castain
committed
Do not push child processes into separate process groups so that any host RM can still "see" them, and ensure that any signal sent to the orteds themselves will be provided to all child processes. Forward all signals from mpirun to the child processes, removing the old MCA parameter required to turn that behavior "on".
1 parent 36a6a3b commit d72c1c7

File tree

8 files changed

+15
-75
lines changed

8 files changed

+15
-75
lines changed

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -786,10 +786,8 @@ static int rte_finalize(void)
786786
/** Remove the USR signal handlers */
787787
opal_event_signal_del(&sigusr1_handler);
788788
opal_event_signal_del(&sigusr2_handler);
789-
if (orte_forward_job_control) {
790-
opal_event_signal_del(&sigtstp_handler);
791-
opal_event_signal_del(&sigcont_handler);
792-
}
789+
opal_event_signal_del(&sigtstp_handler);
790+
opal_event_signal_del(&sigcont_handler);
793791
signals_set = false;
794792
}
795793

orte/mca/odls/alps/odls_alps_module.c

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -416,13 +416,6 @@ static int do_child(orte_app_context_t* context,
416416
sigset_t sigs;
417417
char *param, *msg;
418418

419-
if (orte_forward_job_control) {
420-
/* Set a new process group for this child, so that a
421-
SIGSTOP can be sent to it without being sent to the
422-
orted. */
423-
setpgid(0, 0);
424-
}
425-
426419
/* Setup the pipe to be close-on-exec */
427420
opal_fd_set_cloexec(write_fd);
428421

@@ -798,11 +791,6 @@ static int send_signal(pid_t pid, int signal)
798791
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
799792
signal, (long)pid));
800793

801-
if (orte_forward_job_control) {
802-
/* Send the signal to the process group rather than the
803-
process. The child is the leader of its process group. */
804-
pid = -pid;
805-
}
806794
if (kill(pid, signal) != 0) {
807795
switch(errno) {
808796
case EINVAL:

orte/mca/odls/default/odls_default_module.c

Lines changed: 11 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -193,18 +193,18 @@ static bool odls_default_child_died(orte_proc_t *child)
193193
* that occasionally causes us to incorrectly report a proc
194194
* as refusing to die. Unfortunately, errno may not be reset
195195
* by waitpid in this case, so we cannot check it.
196-
*
197-
* (note the previous fix to this, to return 'process dead'
198-
* here, fixes the race condition at the cost of reporting
199-
* all live processes have immediately died! Better to
200-
* occasionally report a dead process as still living -
201-
* which will occasionally trip the timeout for cases that
202-
* are right on the edge.)
196+
*
197+
* (note the previous fix to this, to return 'process dead'
198+
* here, fixes the race condition at the cost of reporting
199+
* all live processes have immediately died! Better to
200+
* occasionally report a dead process as still living -
201+
* which will occasionally trip the timeout for cases that
202+
* are right on the edge.)
203203
*/
204204
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
205205
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
206206
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
207-
/* Do nothing, process still alive */
207+
/* Do nothing, process still alive */
208208
} else if (-1 == ret && ECHILD == errno) {
209209
/* The pid no longer exists, so we'll call this "good
210210
enough for government work" */
@@ -228,23 +228,10 @@ static bool odls_default_child_died(orte_proc_t *child)
228228
return false;
229229
}
230230

231+
232+
/* deliver a signal to a specified pid. */
231233
static int odls_default_kill_local(pid_t pid, int signum)
232234
{
233-
pid_t pgrp;
234-
235-
#if HAVE_SETPGID
236-
pgrp = getpgid(pid);
237-
if (-1 != pgrp) {
238-
/* target the lead process of the process
239-
* group so we ensure that the signal is
240-
* seen by all members of that group. This
241-
* ensures that the signal is seen by any
242-
* child processes our child may have
243-
* started
244-
*/
245-
pid = pgrp;
246-
}
247-
#endif
248235
if (0 != kill(pid, signum)) {
249236
if (ESRCH != errno) {
250237
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
@@ -391,13 +378,6 @@ static int do_child(orte_app_context_t* context,
391378
long fd, fdmax = sysconf(_SC_OPEN_MAX);
392379
char *param, *msg;
393380

394-
if (orte_forward_job_control) {
395-
/* Set a new process group for this child, so that a
396-
SIGSTOP can be sent to it without being sent to the
397-
orted. */
398-
setpgid(0, 0);
399-
}
400-
401381
/* Setup the pipe to be close-on-exec */
402382
opal_fd_set_cloexec(write_fd);
403383

@@ -720,10 +700,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
720700
}
721701

722702
if (pid == 0) {
723-
close(p[0]);
724-
#if HAVE_SETPGID
725-
setpgid(0, 0);
726-
#endif
703+
close(p[0]);
727704
do_child(context, child, environ_copy, jobdat, p[1], opts);
728705
/* Does not return */
729706
}
@@ -770,11 +747,6 @@ static int send_signal(pid_t pid, int signal)
770747
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
771748
signal, (long)pid));
772749

773-
if (orte_forward_job_control) {
774-
/* Send the signal to the process group rather than the
775-
process. The child is the leader of its process group. */
776-
pid = -pid;
777-
}
778750
if (kill(pid, signal) != 0) {
779751
switch(errno) {
780752
case EINVAL:

orte/runtime/orte_globals.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,6 @@ char *orte_output_filename = NULL;
143143
/* generate new xterm windows to display output from specified ranks */
144144
char *orte_xterm = NULL;
145145

146-
/* whether or not to forward SIGTSTP and SIGCONT signals */
147-
bool orte_forward_job_control = false;
148-
149146
/* report launch progress */
150147
bool orte_report_launch_progress = false;
151148

orte/runtime/orte_globals.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -521,9 +521,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
521521
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
522522
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;
523523

524-
/* whether or not to forward SIGTSTP and SIGCONT signals */
525-
ORTE_DECLSPEC extern bool orte_forward_job_control;
526-
527524
/* IOF controls */
528525
ORTE_DECLSPEC extern bool orte_tag_output;
529526
ORTE_DECLSPEC extern bool orte_timestamp_output;

orte/runtime/orte_mca_params.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -543,14 +543,6 @@ int orte_register_params(void)
543543
orte_map_stddiag_to_stderr = true;
544544
}
545545

546-
/* whether or not to forward SIGTSTP and SIGCONT signals */
547-
orte_forward_job_control = false;
548-
(void) mca_base_var_register ("orte", "orte", NULL, "forward_job_control",
549-
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
550-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
551-
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
552-
&orte_forward_job_control);
553-
554546
/* whether or not to report launch progress */
555547
orte_report_launch_progress = false;
556548
(void) mca_base_var_register ("orte", "orte", NULL, "report_launch_progress",

orte/tools/orte-submit/orte-submit.1in

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,9 +1133,7 @@ SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
11331133
all processes in the job.
11341134
.
11351135
.PP
1136-
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
1137-
by ompi-submit by setting the MCA parameter orte_forward_job_control to 1.
1138-
A SIGTSTOP signal to ompi-submit will then cause a SIGSTOP signal to be sent
1136+
A SIGTSTP signal to ompi-submit will cause a SIGSTOP signal to be sent
11391137
to all of the programs started by ompi-submit and likewise a SIGCONT signal
11401138
to ompi-submit will cause a SIGCONT to be sent.
11411139
.

orte/tools/orterun/orterun.1in

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,9 +1240,7 @@ SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
12401240
all processes in the job.
12411241
.
12421242
.PP
1243-
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
1244-
by mpirun by setting the MCA parameter orte_forward_job_control to 1.
1245-
A SIGTSTOP signal to mpirun will then cause a SIGSTOP signal to be sent
1243+
A SIGTSTP signal to mpirun will cause a SIGSTOP signal to be sent
12461244
to all of the programs started by mpirun and likewise a SIGCONT signal
12471245
to mpirun will cause a SIGCONT to be sent.
12481246
.

0 commit comments

Comments
 (0)