@@ -1379,14 +1379,12 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
1379
1379
}
1380
1380
1381
1381
typedef struct {
1382
- opal_object_t super ;
1382
+ opal_list_item_t super ;
1383
1383
orte_proc_t * child ;
1384
- orte_odls_base_kill_local_fn_t kill_local ;
1385
1384
} orte_odls_quick_caddy_t ;
1386
1385
/* Constructor for orte_odls_quick_caddy_t: a fresh caddy refers to no
 * child until the caller assigns one.
 */
static void qcdcon(orte_odls_quick_caddy_t *p)
{
    p->child = NULL;
}
1391
1389
static void qcddes (orte_odls_quick_caddy_t * p )
1392
1390
{
@@ -1395,38 +1393,9 @@ static void qcddes(orte_odls_quick_caddy_t *p)
1395
1393
}
1396
1394
}
1397
1395
/* Class instance for the caddy: parent class is opal_list_item_t (matching
 * the struct's "super" member), with qcdcon and qcddes supplied as the
 * construct/destruct hooks.
 */
OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
                   opal_list_item_t,
                   qcdcon, qcddes);
1400
1398
1401
- static void send_kill (int sd , short args , void * cbdata )
1402
- {
1403
- orte_timer_t * tm = (orte_timer_t * )cbdata ;
1404
- orte_odls_quick_caddy_t * cd = (orte_odls_quick_caddy_t * )tm -> payload ;
1405
-
1406
- OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1407
- "%s SENDING FORCE SIGKILL TO %s" ,
1408
- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1409
- ORTE_NAME_PRINT (& cd -> child -> name )));
1410
-
1411
- cd -> kill_local (cd -> child -> pid , SIGKILL );
1412
- /* indicate the waitpid fired as this is effectively what
1413
- * has happened
1414
- */
1415
- ORTE_FLAG_SET (cd -> child , ORTE_PROC_FLAG_WAITPID );
1416
- cd -> child -> pid = 0 ;
1417
-
1418
- /* ensure the child's session directory is cleaned up */
1419
- orte_session_dir_finalize (& cd -> child -> name );
1420
- /* check for everything complete - this will remove
1421
- * the child object from our local list
1422
- */
1423
- if (ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_IOF_COMPLETE ) &&
1424
- ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_WAITPID )) {
1425
- ORTE_ACTIVATE_PROC_STATE (& cd -> child -> name , cd -> child -> state );
1426
- }
1427
- OBJ_RELEASE (cd );
1428
- }
1429
-
1430
1399
int orte_odls_base_default_kill_local_procs (opal_pointer_array_t * procs ,
1431
1400
orte_odls_base_kill_local_fn_t kill_local )
1432
1401
{
@@ -1536,11 +1505,6 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
1536
1505
}
1537
1506
}
1538
1507
1539
- /* mark the child as "killed" since the waitpid will
1540
- * fire as soon as we kill it
1541
- */
1542
- child -> state = ORTE_PROC_STATE_KILLED_BY_CMD ; /* we ordered it to die */
1543
-
1544
1508
/* ensure the stdin IOF channel for this child is closed. The other
1545
1509
* channels will automatically close when the proc is killed
1546
1510
*/
@@ -1561,21 +1525,11 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
1561
1525
"%s SENDING SIGCONT TO %s" ,
1562
1526
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1563
1527
ORTE_NAME_PRINT (& child -> name )));
1564
- kill_local (child -> pid , SIGCONT );
1565
-
1566
- /* Send a sigterm to the process before sigkill to be nice */
1567
- OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1568
- "%s SENDING SIGTERM TO %s" ,
1569
- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1570
- ORTE_NAME_PRINT (& child -> name )));
1571
- kill_local (child -> pid , SIGTERM );
1572
-
1573
1528
cd = OBJ_NEW (orte_odls_quick_caddy_t );
1574
1529
OBJ_RETAIN (child );
1575
1530
cd -> child = child ;
1576
- cd -> kill_local = kill_local ;
1577
- ORTE_DETECT_TIMEOUT (1 , orte_odls_globals .timeout_before_sigkill ,
1578
- 10000000 , send_kill , cd );
1531
+ opal_list_append (& procs_killed , & cd -> super );
1532
+ kill_local (child -> pid , SIGCONT );
1579
1533
continue ;
1580
1534
1581
1535
CLEANUP :
@@ -1591,7 +1545,50 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
1591
1545
}
1592
1546
}
1593
1547
1594
- /* cleanup, if required */
1548
+ /* if we are issuing signals, then we need to wait a little
1549
+ * and send the next in sequence */
1550
+ if (0 < opal_list_get_size (& procs_killed )) {
1551
+ sleep (orte_odls_globals .timeout_before_sigkill );
1552
+ /* issue a SIGTERM to all */
1553
+ OPAL_LIST_FOREACH (cd , & procs_killed , orte_odls_quick_caddy_t ) {
1554
+ OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1555
+ "%s SENDING SIGTERM TO %s" ,
1556
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1557
+ ORTE_NAME_PRINT (& child -> name )));
1558
+ kill_local (cd -> child -> pid , SIGTERM );
1559
+ }
1560
+ /* wait a little again */
1561
+ sleep (orte_odls_globals .timeout_before_sigkill );
1562
+ /* issue a SIGKILL to all */
1563
+ OPAL_LIST_FOREACH (cd , & procs_killed , orte_odls_quick_caddy_t ) {
1564
+ OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
1565
+ "%s SENDING SIGKILL TO %s" ,
1566
+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
1567
+ ORTE_NAME_PRINT (& child -> name )));
1568
+ kill_local (cd -> child -> pid , SIGKILL );
1569
+ /* indicate the waitpid fired as this is effectively what
1570
+ * has happened
1571
+ */
1572
+ ORTE_FLAG_SET (cd -> child , ORTE_PROC_FLAG_WAITPID );
1573
+ cd -> child -> pid = 0 ;
1574
+
1575
+ /* mark the child as "killed" */
1576
+ cd -> child -> state = ORTE_PROC_STATE_KILLED_BY_CMD ; /* we ordered it to die */
1577
+
1578
+ /* ensure the child's session directory is cleaned up */
1579
+ orte_session_dir_finalize (& cd -> child -> name );
1580
+ /* check for everything complete - this will remove
1581
+ * the child object from our local list
1582
+ */
1583
+ if (ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_IOF_COMPLETE ) &&
1584
+ ORTE_FLAG_TEST (cd -> child , ORTE_PROC_FLAG_WAITPID )) {
1585
+ ORTE_ACTIVATE_PROC_STATE (& cd -> child -> name , cd -> child -> state );
1586
+ }
1587
+ }
1588
+ }
1589
+ OPAL_LIST_DESTRUCT (& procs_killed );
1590
+
1591
+ /* cleanup arrays, if required */
1595
1592
if (do_cleanup ) {
1596
1593
OBJ_DESTRUCT (& procarray );
1597
1594
OBJ_DESTRUCT (& proctmp );
0 commit comments