@@ -1361,21 +1361,22 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
13611361}
13621362
13631363typedef struct {
1364+ opal_list_item_t super ;
13641365 orte_proc_t * child ;
1365- orte_odls_base_kill_local_fn_t kill_local ;
1366- } odls_kill_caddy_t ;
1367-
1368- static void kill_cbfunc (int fd , short args , void * cbdata )
1366+ } orte_odls_quick_caddy_t ;
1367+ static void qcdcon (orte_odls_quick_caddy_t * p )
13691368{
1370- odls_kill_caddy_t * cd = (odls_kill_caddy_t * )cbdata ;
1371-
1372- if (!ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_ALIVE ) || 0 == cd -> child -> pid ) {
1373- free (cd );
1374- return ;
1369+ p -> child = NULL ;
1370+ }
1371+ static void qcddes (orte_odls_quick_caddy_t * p )
1372+ {
1373+ if (NULL != p -> child ) {
1374+ OBJ_RELEASE (p -> child );
13751375 }
1376- cd -> kill_local (cd -> child -> pid , SIGKILL );
1377- free (cd );
13781376}
1377+ OBJ_CLASS_INSTANCE (orte_odls_quick_caddy_t ,
1378+ opal_list_item_t ,
1379+ qcdcon , qcddes );
13791380
13801381int orte_odls_base_default_kill_local_procs (opal_pointer_array_t * procs ,
13811382 orte_odls_base_kill_local_fn_t kill_local ,
@@ -1387,6 +1388,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
13871388 int i , j ;
13881389 opal_pointer_array_t procarray , * procptr ;
13891390 bool do_cleanup ;
1391+ orte_odls_quick_caddy_t * cd ;
13901392
13911393 OBJ_CONSTRUCT (& procs_killed , opal_list_t );
13921394
@@ -1503,68 +1505,78 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
15031505 */
15041506 orte_wait_cb_cancel (child );
15051507
1506- if (!do_cleanup ) {
1507- odls_kill_caddy_t * cd ;
1508-
1509- /* if we are killing only selected procs, then do so in a gentle
1510- fashion. First send a SIGCONT in case the process is in stopped state.
1511- If it is in a stopped state and we do not first change it to
1512- running, then SIGTERM will not get delivered. Ignore return
1513- value. */
1514- OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1515- "%s SENDING SIGCONT TO %s" ,
1516- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1517- ORTE_NAME_PRINT (& child -> name )));
1518- kill_local (child -> pid , SIGCONT );
1508+ /* First send a SIGCONT in case the process is in stopped state.
1509+ If it is in a stopped state and we do not first change it to
1510+ running, then SIGTERM will not get delivered. Ignore return
1511+ value. */
1512+ OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1513+ "%s SENDING SIGCONT TO %s" ,
1514+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1515+ ORTE_NAME_PRINT (& child -> name )));
1516+ cd = OBJ_NEW (orte_odls_quick_caddy_t );
1517+ OBJ_RETAIN (child );
1518+ cd -> child = child ;
1519+ opal_list_append (& procs_killed , & cd -> super );
1520+ kill_local (child -> pid , SIGCONT );
1521+ continue ;
15191522
1520- /* Send a sigterm to the process before sigkill to be nice */
1521- OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1522- "%s SENDING SIGTERM TO %s" ,
1523- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1524- ORTE_NAME_PRINT (& child -> name )));
1525- kill_local (child -> pid , SIGTERM );
1526- /* provide a polite delay so the proc has a chance to react */
1527- cd = (odls_kill_caddy_t * )malloc (sizeof (odls_kill_caddy_t ));
1528- OBJ_RETAIN (child ); // protect against race conditions
1529- cd -> child = child ;
1530- cd -> kill_local = kill_local ;
1531- ORTE_TIMER_EVENT (1 , 0 , kill_cbfunc , ORTE_SYS_PRI );
1532- continue ;
1523+ CLEANUP :
1524+ /* ensure the child's session directory is cleaned up */
1525+ orte_session_dir_finalize (& child -> name );
1526+ /* check for everything complete - this will remove
1527+ * the child object from our local list
1528+ */
1529+ if (ORTE_FLAG_TEST (child , ORTE_PROC_FLAG_IOF_COMPLETE ) &&
1530+ ORTE_FLAG_TEST (child , ORTE_PROC_FLAG_WAITPID )) {
1531+ ORTE_ACTIVATE_PROC_STATE (& child -> name , child -> state );
15331532 }
1533+ }
1534+ }
15341535
1535- /* Force the SIGKILL just to make sure things are dead
1536- * This fixes an issue that, if the application is masking
1537- * SIGTERM, then the child_died()
1538- * may return 'true' even though waipid returns with 0.
1539- * It does this to avoid a race condition, per documentation
1540- * in odls_default_module.c.
1541- */
1536+ /* if we are issuing signals, then we need to wait a little
1537+ * and send the next in sequence */
1538+ if (0 < opal_list_get_size (& procs_killed )) {
1539+ sleep (orte_odls_globals .timeout_before_sigkill );
1540+ /* issue a SIGTERM to all */
1541+ OPAL_LIST_FOREACH (cd , & procs_killed , orte_odls_quick_caddy_t ) {
15421542 OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1543- "%s SENDING FORCE SIGKILL TO %s pid %lu " ,
1543+ "%s SENDING SIGTERM TO %s" ,
15441544 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1545- ORTE_NAME_PRINT (& child -> name ), (unsigned long )child -> pid ));
1546- kill_local (child -> pid , SIGKILL );
1547-
1545+ ORTE_NAME_PRINT (& child -> name )));
1546+ kill_local (cd -> child -> pid , SIGTERM );
1547+ }
1548+ /* wait a little again */
1549+ sleep (orte_odls_globals .timeout_before_sigkill );
1550+ /* issue a SIGKILL to all */
1551+ OPAL_LIST_FOREACH (cd , & procs_killed , orte_odls_quick_caddy_t ) {
1552+ OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1553+ "%s SENDING SIGKILL TO %s" ,
1554+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1555+ ORTE_NAME_PRINT (& child -> name )));
1556+ kill_local (cd -> child -> pid , SIGKILL );
15481557 /* indicate the waitpid fired as this is effectively what
15491558 * has happened
15501559 */
1551- ORTE_FLAG_SET (child , ORTE_PROC_FLAG_WAITPID );
1552- child -> pid = 0 ;
1560+ ORTE_FLAG_SET (cd -> child , ORTE_PROC_FLAG_WAITPID );
1561+ cd -> child -> pid = 0 ;
1562+
1563+ /* mark the child as "killed" */
1564+ cd -> child -> state = ORTE_PROC_STATE_KILLED_BY_CMD ; /* we ordered it to die */
15531565
1554- CLEANUP :
15551566 /* ensure the child's session directory is cleaned up */
1556- orte_session_dir_finalize (& child -> name );
1567+ orte_session_dir_finalize (& cd -> child -> name );
15571568 /* check for everything complete - this will remove
15581569 * the child object from our local list
15591570 */
1560- if (ORTE_FLAG_TEST (child , ORTE_PROC_FLAG_IOF_COMPLETE ) &&
1561- ORTE_FLAG_TEST (child , ORTE_PROC_FLAG_WAITPID )) {
1562- ORTE_ACTIVATE_PROC_STATE (& child -> name , child -> state );
1571+ if (ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_IOF_COMPLETE ) &&
1572+ ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_WAITPID )) {
1573+ ORTE_ACTIVATE_PROC_STATE (& cd -> child -> name , cd -> child -> state );
15631574 }
15641575 }
15651576 }
1577+ OPAL_LIST_DESTRUCT (& procs_killed );
15661578
1567- /* cleanup, if required */
1579+ /* cleanup arrays , if required */
15681580 if (do_cleanup ) {
15691581 OBJ_DESTRUCT (& procarray );
15701582 OBJ_DESTRUCT (& proctmp );
0 commit comments