Skip to content

Commit 85a6349

Browse files
author
Ralph Castain
committed
Update signal handling to introduce a pause between SIGCONT and SIGTERM, followed by another pause before SIGKILL. Do this within the odls/kill_local_procs function while we know we are blocked in an event, and before the daemon shuts down the event progress loop.
Signed-off-by: Ralph Castain <[email protected]>
1 parent a3e4c33 commit 85a6349

File tree

3 files changed: 60 additions (+60) and 51 deletions (-51)

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 48 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1379,14 +1379,12 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
13791379
}
13801380

13811381
typedef struct {
1382-
opal_object_t super;
1382+
opal_list_item_t super;
13831383
orte_proc_t *child;
1384-
orte_odls_base_kill_local_fn_t kill_local;
13851384
} orte_odls_quick_caddy_t;
13861385
static void qcdcon(orte_odls_quick_caddy_t *p)
13871386
{
13881387
p->child = NULL;
1389-
p->kill_local = NULL;
13901388
}
13911389
static void qcddes(orte_odls_quick_caddy_t *p)
13921390
{
@@ -1395,38 +1393,9 @@ static void qcddes(orte_odls_quick_caddy_t *p)
13951393
}
13961394
}
13971395
OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
1398-
opal_object_t,
1396+
opal_list_item_t,
13991397
qcdcon, qcddes);
14001398

1401-
static void send_kill(int sd, short args, void *cbdata)
1402-
{
1403-
orte_timer_t *tm = (orte_timer_t*)cbdata;
1404-
orte_odls_quick_caddy_t *cd = (orte_odls_quick_caddy_t*)tm->payload;
1405-
1406-
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1407-
"%s SENDING FORCE SIGKILL TO %s",
1408-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1409-
ORTE_NAME_PRINT(&cd->child->name)));
1410-
1411-
cd->kill_local(cd->child->pid, SIGKILL);
1412-
/* indicate the waitpid fired as this is effectively what
1413-
* has happened
1414-
*/
1415-
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
1416-
cd->child->pid = 0;
1417-
1418-
/* ensure the child's session directory is cleaned up */
1419-
orte_session_dir_finalize(&cd->child->name);
1420-
/* check for everything complete - this will remove
1421-
* the child object from our local list
1422-
*/
1423-
if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1424-
ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
1425-
ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
1426-
}
1427-
OBJ_RELEASE(cd);
1428-
}
1429-
14301399
int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
14311400
orte_odls_base_kill_local_fn_t kill_local)
14321401
{
@@ -1536,11 +1505,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
15361505
}
15371506
}
15381507

1539-
/* mark the child as "killed" since the waitpid will
1540-
* fire as soon as we kill it
1541-
*/
1542-
child->state = ORTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */
1543-
15441508
/* ensure the stdin IOF channel for this child is closed. The other
15451509
* channels will automatically close when the proc is killed
15461510
*/
@@ -1561,21 +1525,11 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
15611525
"%s SENDING SIGCONT TO %s",
15621526
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
15631527
ORTE_NAME_PRINT(&child->name)));
1564-
kill_local(child->pid, SIGCONT);
1565-
1566-
/* Send a sigterm to the process before sigkill to be nice */
1567-
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1568-
"%s SENDING SIGTERM TO %s",
1569-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1570-
ORTE_NAME_PRINT(&child->name)));
1571-
kill_local(child->pid, SIGTERM);
1572-
15731528
cd = OBJ_NEW(orte_odls_quick_caddy_t);
15741529
OBJ_RETAIN(child);
15751530
cd->child = child;
1576-
cd->kill_local = kill_local;
1577-
ORTE_DETECT_TIMEOUT(1, orte_odls_globals.timeout_before_sigkill,
1578-
10000000, send_kill, cd);
1531+
opal_list_append(&procs_killed, &cd->super);
1532+
kill_local(child->pid, SIGCONT);
15791533
continue;
15801534

15811535
CLEANUP:
@@ -1591,7 +1545,50 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
15911545
}
15921546
}
15931547

1594-
/* cleanup, if required */
1548+
/* if we are issuing signals, then we need to wait a little
1549+
* and send the next in sequence */
1550+
if (0 < opal_list_get_size(&procs_killed)) {
1551+
sleep(orte_odls_globals.timeout_before_sigkill);
1552+
/* issue a SIGTERM to all */
1553+
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1554+
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1555+
"%s SENDING SIGTERM TO %s",
1556+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1557+
ORTE_NAME_PRINT(&child->name)));
1558+
kill_local(cd->child->pid, SIGTERM);
1559+
}
1560+
/* wait a little again */
1561+
sleep(orte_odls_globals.timeout_before_sigkill);
1562+
/* issue a SIGKILL to all */
1563+
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1564+
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1565+
"%s SENDING SIGKILL TO %s",
1566+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1567+
ORTE_NAME_PRINT(&child->name)));
1568+
kill_local(cd->child->pid, SIGKILL);
1569+
/* indicate the waitpid fired as this is effectively what
1570+
* has happened
1571+
*/
1572+
ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
1573+
cd->child->pid = 0;
1574+
1575+
/* mark the child as "killed" */
1576+
cd->child->state = ORTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */
1577+
1578+
/* ensure the child's session directory is cleaned up */
1579+
orte_session_dir_finalize(&cd->child->name);
1580+
/* check for everything complete - this will remove
1581+
* the child object from our local list
1582+
*/
1583+
if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1584+
ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
1585+
ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
1586+
}
1587+
}
1588+
}
1589+
OPAL_LIST_DESTRUCT(&procs_killed);
1590+
1591+
/* cleanup arrays, if required */
15951592
if (do_cleanup) {
15961593
OBJ_DESTRUCT(&procarray);
15971594
OBJ_DESTRUCT(&proctmp);

orte/mca/odls/default/odls_default_module.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -329,9 +329,11 @@ static int do_child(orte_app_context_t* context,
329329
long fd, fdmax = sysconf(_SC_OPEN_MAX);
330330
char *param, *msg;
331331

332+
#if HAVE_SETPGID
332333
/* Set a new process group for this child, so that any
333334
* signals we send to it will reach any children it spawns */
334335
setpgid(0, 0);
336+
#endif
335337

336338
/* Setup the pipe to be close-on-exec */
337339
opal_fd_set_cloexec(write_fd);

orte/test/system/sigusr_trap.c

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,10 @@ void sigusr_handler(int signum)
2828
fprintf(stderr, "%s Trapped SIGUSR2\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
2929
return;
3030

31+
case SIGCONT:
32+
fprintf(stderr, "%s Trapped SIGCONT\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
33+
return;
34+
3135
default:
3236
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
3337
return;
@@ -55,6 +59,7 @@ void exit_handler(int signum)
5559
fprintf(stderr, "%s Undefined signal %d trapped\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum);
5660
break;
5761
}
62+
return;
5863

5964
exit(1);
6065
}
@@ -79,6 +84,11 @@ int main(int argc, char* argv[])
7984
exit(1);
8085
}
8186

87+
if (signal(SIGCONT, sigusr_handler) == SIG_IGN) {
88+
fprintf(stderr, "Could not setup signal trap for SIGUSR2\n");
89+
exit(1);
90+
}
91+
8292
if (signal(SIGINT, exit_handler) == SIG_IGN) {
8393
fprintf(stderr, "Could not setup signal trap for SIGINT\n");
8494
exit(1);

0 commit comments

Comments
 (0)