@@ -1379,14 +1379,12 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
13791379}
13801380
13811381typedef struct {
1382- opal_object_t super ;
1382+ opal_list_item_t super ;
13831383 orte_proc_t * child ;
1384- orte_odls_base_kill_local_fn_t kill_local ;
13851384} orte_odls_quick_caddy_t ;
13861385static void qcdcon (orte_odls_quick_caddy_t * p )
13871386{
13881387 p -> child = NULL ;
1389- p -> kill_local = NULL ;
13901388}
13911389static void qcddes (orte_odls_quick_caddy_t * p )
13921390{
@@ -1395,38 +1393,9 @@ static void qcddes(orte_odls_quick_caddy_t *p)
13951393 }
13961394}
13971395OBJ_CLASS_INSTANCE (orte_odls_quick_caddy_t ,
1398- opal_object_t ,
1396+ opal_list_item_t ,
13991397 qcdcon , qcddes );
14001398
1401- static void send_kill (int sd , short args , void * cbdata )
1402- {
1403- orte_timer_t * tm = (orte_timer_t * )cbdata ;
1404- orte_odls_quick_caddy_t * cd = (orte_odls_quick_caddy_t * )tm -> payload ;
1405-
1406- OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1407- "%s SENDING FORCE SIGKILL TO %s" ,
1408- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1409- ORTE_NAME_PRINT (& cd -> child -> name )));
1410-
1411- cd -> kill_local (cd -> child -> pid , SIGKILL );
1412- /* indicate the waitpid fired as this is effectively what
1413- * has happened
1414- */
1415- ORTE_FLAG_SET (cd -> child , ORTE_PROC_FLAG_WAITPID );
1416- cd -> child -> pid = 0 ;
1417-
1418- /* ensure the child's session directory is cleaned up */
1419- orte_session_dir_finalize (& cd -> child -> name );
1420- /* check for everything complete - this will remove
1421- * the child object from our local list
1422- */
1423- if (ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_IOF_COMPLETE ) &&
1424- ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_WAITPID )) {
1425- ORTE_ACTIVATE_PROC_STATE (& cd -> child -> name , cd -> child -> state );
1426- }
1427- OBJ_RELEASE (cd );
1428- }
1429-
14301399int orte_odls_base_default_kill_local_procs (opal_pointer_array_t * procs ,
14311400 orte_odls_base_kill_local_fn_t kill_local )
14321401{
@@ -1536,11 +1505,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
15361505 }
15371506 }
15381507
1539- /* mark the child as "killed" since the waitpid will
1540- * fire as soon as we kill it
1541- */
1542- child -> state = ORTE_PROC_STATE_KILLED_BY_CMD ; /* we ordered it to die */
1543-
15441508 /* ensure the stdin IOF channel for this child is closed. The other
15451509 * channels will automatically close when the proc is killed
15461510 */
@@ -1561,21 +1525,11 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
15611525 "%s SENDING SIGCONT TO %s" ,
15621526 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
15631527 ORTE_NAME_PRINT (& child -> name )));
1564- kill_local (child -> pid , SIGCONT );
1565-
1566- /* Send a sigterm to the process before sigkill to be nice */
1567- OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1568- "%s SENDING SIGTERM TO %s" ,
1569- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1570- ORTE_NAME_PRINT (& child -> name )));
1571- kill_local (child -> pid , SIGTERM );
1572-
15731528 cd = OBJ_NEW (orte_odls_quick_caddy_t );
15741529 OBJ_RETAIN (child );
15751530 cd -> child = child ;
1576- cd -> kill_local = kill_local ;
1577- ORTE_DETECT_TIMEOUT (1 , orte_odls_globals .timeout_before_sigkill ,
1578- 10000000 , send_kill , cd );
1531+ opal_list_append (& procs_killed , & cd -> super );
1532+ kill_local (child -> pid , SIGCONT );
15791533 continue ;
15801534
15811535 CLEANUP :
@@ -1591,7 +1545,50 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
15911545 }
15921546 }
15931547
1594- /* cleanup, if required */
1548+ /* if we are issuing signals, then we need to wait a little
1549+ * and send the next in sequence */
1550+ if (0 < opal_list_get_size (& procs_killed )) {
1551+ sleep (orte_odls_globals .timeout_before_sigkill );
1552+ /* issue a SIGTERM to all */
1553+ OPAL_LIST_FOREACH (cd , & procs_killed , orte_odls_quick_caddy_t ) {
1554+ OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1555+ "%s SENDING SIGTERM TO %s" ,
1556+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1557+ ORTE_NAME_PRINT (& child -> name )));
1558+ kill_local (cd -> child -> pid , SIGTERM );
1559+ }
1560+ /* wait a little again */
1561+ sleep (orte_odls_globals .timeout_before_sigkill );
1562+ /* issue a SIGKILL to all */
1563+ OPAL_LIST_FOREACH (cd , & procs_killed , orte_odls_quick_caddy_t ) {
1564+ OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1565+ "%s SENDING SIGKILL TO %s" ,
1566+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1567+ ORTE_NAME_PRINT (& child -> name )));
1568+ kill_local (cd -> child -> pid , SIGKILL );
1569+ /* indicate the waitpid fired as this is effectively what
1570+ * has happened
1571+ */
1572+ ORTE_FLAG_SET (cd -> child , ORTE_PROC_FLAG_WAITPID );
1573+ cd -> child -> pid = 0 ;
1574+
1575+ /* mark the child as "killed" */
1576+ cd -> child -> state = ORTE_PROC_STATE_KILLED_BY_CMD ; /* we ordered it to die */
1577+
1578+ /* ensure the child's session directory is cleaned up */
1579+ orte_session_dir_finalize (& cd -> child -> name );
1580+ /* check for everything complete - this will remove
1581+ * the child object from our local list
1582+ */
1583+ if (ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_IOF_COMPLETE ) &&
1584+ ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_WAITPID )) {
1585+ ORTE_ACTIVATE_PROC_STATE (& cd -> child -> name , cd -> child -> state );
1586+ }
1587+ }
1588+ }
1589+ OPAL_LIST_DESTRUCT (& procs_killed );
1590+
1591+ /* cleanup arrays, if required */
15951592 if (do_cleanup ) {
15961593 OBJ_DESTRUCT (& procarray );
15971594 OBJ_DESTRUCT (& proctmp );
0 commit comments