Skip to content

Commit a17b547

Browse files
authored
Merge pull request #2957 from jjhursey/topic/ibm/rsh-sigint-fix
plm/rsh: Fix signal handling for rsh launcher
2 parents 8562b87 + 843fcca commit a17b547

File tree

1 file changed

+37
-1
lines changed

1 file changed

+37
-1
lines changed

orte/mca/plm/rsh/plm_rsh_module.c

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
16-
* Copyright (c) 2011 IBM Corporation. All rights reserved.
16+
* Copyright (c) 2011-2017 IBM Corporation. All rights reserved.
1717
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1818
* Copyright (c) 2015-2017 Research Organization for Information Science
1919
* and Technology (RIST). All rights reserved.
@@ -957,9 +957,45 @@ static void process_launch_list(int fd, short args, void *cbdata)
957957

958958
/* child */
959959
if (pid == 0) {
960+
/*
961+
* When the user presses CTRL-C, SIGINT is sent to the whole process
962+
* group which terminates the rsh/ssh command. This can cause the
963+
* remote daemon to crash with a SIGPIPE when it tries to print out
964+
* status information. This has two consequences:
965+
* 1) The remote node is not cleaned up as it should be. The local
966+
* processes will notice that the orted failed and clean up their
967+
* part of the session directory, but the job level part will
968+
* remain littered.
969+
* 2) Any debugging information we expected to see from the orted
970+
* during shutdown is lost.
971+
*
972+
* The solution here is to put the child processes in a separate
973+
* process group from the HNP. So when the user presses CTRL-C
974+
* then only the HNP receives the signal, and not the rsh/ssh
975+
* child processes.
976+
*/
977+
#if HAVE_SETPGID
978+
if( 0 != setpgid(0, 0) ) {
979+
opal_output(0, "plm:rsh: Error: setpgid(0,0) failed in child with errno=%s(%d)\n",
980+
strerror(errno), errno);
981+
exit(-1);
982+
}
983+
#endif
984+
960985
/* do the ssh launch - this will exit if it fails */
961986
ssh_child(caddy->argc, caddy->argv);
962987
} else { /* father */
988+
// Put the child in a separate process group
989+
// - see comment in child section.
990+
#if HAVE_SETPGID
991+
if( 0 != setpgid(pid, pid) ) {
992+
opal_output(0, "plm:rsh: Warning: setpgid(%ld,%ld) failed in parent with errno=%s(%d)\n",
993+
(long)pid, (long)pid, strerror(errno), errno);
994+
// Ignore this error since the child is off and running.
995+
// We still need to track it.
996+
}
997+
#endif
998+
963999
/* indicate this daemon has been launched */
9641000
caddy->daemon->state = ORTE_PROC_STATE_RUNNING;
9651001
/* record the pid of the ssh fork */

0 commit comments

Comments
 (0)