|
13 | 13 | * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights |
14 | 14 | * reserved. |
15 | 15 | * Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. |
16 | | - * Copyright (c) 2011 IBM Corporation. All rights reserved. |
| 16 | + * Copyright (c) 2011-2017 IBM Corporation. All rights reserved. |
17 | 17 | * Copyright (c) 2014-2015 Intel Corporation. All rights reserved. |
18 | 18 | * Copyright (c) 2015 Research Organization for Information Science |
19 | 19 | * and Technology (RIST). All rights reserved. |
@@ -963,9 +963,45 @@ static void process_launch_list(int fd, short args, void *cbdata) |
963 | 963 |
|
964 | 964 | /* child */ |
965 | 965 | if (pid == 0) { |
| 966 | + /* |
| 967 | + * When the user presses CTRL-C, SIGINT is sent to the whole process |
| 968 | + * group which terminates the rsh/ssh command. This can cause the |
| 969 | + * remote daemon to crash with a SIGPIPE when it tries to print out |
| 970 | + * status information. This has two consequences: |
| 971 | + * 1) The remote node is not cleaned up as it should be. The local |
| 972 | + * processes will notice that the orted failed and clean up their |
| 973 | + * part of the session directory, but the job level part will |
| 974 | + * remain littered. |
| 975 | + * 2) Any debugging information we expected to see from the orted |
| 976 | + * during shutdown is lost. |
| 977 | + * |
| 978 | + * The solution here is to put the child processes in a separate |
| 979 | + * process group from the HNP. So when the user presses CTRL-C |
| 980 | + * then only the HNP receives the signal, and not the rsh/ssh |
| 981 | + * child processes. |
| 982 | + */ |
| 983 | +#if HAVE_SETPGID |
| 984 | + if( 0 != setpgid(0, 0) ) { |
| 985 | + opal_output(0, "plm:rsh: Error: setpgid(0,0) failed in child with errno=%s(%d)\n", |
| 986 | + strerror(errno), errno); |
| 987 | + exit(-1); |
| 988 | + } |
| 989 | +#endif |
| 990 | + |
966 | 991 | /* do the ssh launch - this will exit if it fails */ |
967 | 992 | ssh_child(caddy->argc, caddy->argv); |
968 | 993 | } else { /* father */ |
| 994 | + // Put the child in a separate process group |
| 995 | + // - see comment in child section. |
| 996 | +#if HAVE_SETPGID |
| 997 | + if( 0 != setpgid(pid, pid) ) { |
| 998 | + opal_output(0, "plm:rsh: Warning: setpgid(%ld,%ld) failed in parent with errno=%s(%d)\n", |
| 999 | + (long)pid, (long)pid, strerror(errno), errno); |
| 1000 | + // Ignore this error since the child is off and running. |
| 1001 | + // We still need to track it. |
| 1002 | + } |
| 1003 | +#endif |
| 1004 | + |
969 | 1005 | /* indicate this daemon has been launched */ |
970 | 1006 | caddy->daemon->state = ORTE_PROC_STATE_RUNNING; |
971 | 1007 | /* record the pid of the ssh fork */ |
|
0 commit comments