|
13 | 13 | * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
|
14 | 14 | * reserved.
|
15 | 15 | * Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
16 |
| - * Copyright (c) 2011 IBM Corporation. All rights reserved. |
| 16 | + * Copyright (c) 2011-2017 IBM Corporation. All rights reserved. |
17 | 17 | * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
18 | 18 | * Copyright (c) 2015-2017 Research Organization for Information Science
|
19 | 19 | * and Technology (RIST). All rights reserved.
|
@@ -957,9 +957,45 @@ static void process_launch_list(int fd, short args, void *cbdata)
|
957 | 957 |
|
958 | 958 | /* child */
|
959 | 959 | if (pid == 0) {
|
| 960 | + /* |
| 961 | + * When the user presses CTRL-C, SIGINT is sent to the whole process |
| 962 | + * group which terminates the rsh/ssh command. This can cause the |
| 963 | + * remote daemon to crash with a SIGPIPE when it tries to print out |
| 964 | + * status information. This has two consequences: |
| 965 | + * 1) The remote node is not cleaned up as it should be. The local |
| 966 | + * processes will notice that the orted failed and clean up their |
| 967 | + * part of the session directory, but the job level part will |
| 968 | + * remain littered. |
| 969 | + * 2) Any debugging information we expected to see from the orted |
| 970 | + * during shutdown is lost. |
| 971 | + * |
| 972 | + * The solution here is to put the child processes in a separate |
| 973 | + * process group from the HNP. So when the user presses CTRL-C |
| 974 | + * then only the HNP receives the signal, and not the rsh/ssh |
| 975 | + * child processes. |
| 976 | + */ |
| 977 | +#if HAVE_SETPGID |
| 978 | + if( 0 != setpgid(0, 0) ) { |
| 979 | + opal_output(0, "plm:rsh: Error: setpgid(0,0) failed in child with errno=%s(%d)\n", |
| 980 | + strerror(errno), errno); |
| 981 | + exit(-1); |
| 982 | + } |
| 983 | +#endif |
| 984 | + |
960 | 985 | /* do the ssh launch - this will exit if it fails */
|
961 | 986 | ssh_child(caddy->argc, caddy->argv);
|
962 | 987 | } else { /* father */
|
| 988 | + // Put the child in a separate process group |
| 989 | + // - see comment in child section. |
| 990 | +#if HAVE_SETPGID |
| 991 | + if( 0 != setpgid(pid, pid) ) { |
| 992 | + opal_output(0, "plm:rsh: Warning: setpgid(%ld,%ld) failed in parent with errno=%s(%d)\n", |
| 993 | + (long)pid, (long)pid, strerror(errno), errno); |
| 994 | + // Ignore this error since the child is off and running. |
| 995 | + // We still need to track it. |
| 996 | + } |
| 997 | +#endif |
| 998 | + |
963 | 999 | /* indicate this daemon has been launched */
|
964 | 1000 | caddy->daemon->state = ORTE_PROC_STATE_RUNNING;
|
965 | 1001 | /* record the pid of the ssh fork */
|
|
0 commit comments