Skip to content

Commit ac36457

Browse files
committed
Stop PMIx progress thread during abnormal shutdown
Ensure full cleanup during abnormal termination, and stop the PMIx progress thread before doing so. Signed-off-by: Ralph Castain <[email protected]>
1 parent 97e683d commit ac36457

File tree

1 file changed

+58
-59
lines changed

1 file changed

+58
-59
lines changed

src/prted/prte.c

Lines changed: 58 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,6 @@ static void setup_sighandler(int signal, prte_event_t *ev, prte_event_cbfunc_t c
216216
static void shutdown_callback(int fd, short flags, void *arg)
217217
{
218218
prte_timer_t *tm = (prte_timer_t *) arg;
219-
prte_job_t *jdata;
220219
PRTE_HIDE_UNUSED_PARAMS(fd, flags);
221220

222221
if (NULL != tm) {
@@ -234,8 +233,11 @@ static void shutdown_callback(int fd, short flags, void *arg)
234233
prte_odls.kill_local_procs(NULL);
235234
// mark that we are finalizing so the session directory will cleanup
236235
prte_finalizing = true;
237-
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
238-
PMIX_RELEASE(jdata);
236+
#ifdef PRTE_PMIX_STOP_PRGTHRD
237+
PMIx_Progress_thread_stop(NULL, 0);
238+
#endif
239+
prte_job_session_dir_finalize(NULL);
240+
PMIx_server_finalize();
239241
exit(PRTE_ERROR_DEFAULT_EXIT_CODE);
240242
}
241243

@@ -1224,12 +1226,13 @@ int prte(int argc, char *argv[])
12241226
* indicating clean termination! Instead, just forcibly cleanup
12251227
* the local session_dir tree and exit
12261228
*/
1227-
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
1228-
PMIX_RELEASE(jdata);
1229-
1230-
/* return with non-zero status */
1231-
ret = PRTE_ERROR_DEFAULT_EXIT_CODE;
1232-
goto DONE;
1229+
prte_finalizing = true;
1230+
#ifdef PRTE_PMIX_STOP_PRGTHRD
1231+
PMIx_Progress_thread_stop(NULL, 0);
1232+
#endif
1233+
prte_job_session_dir_finalize(NULL);
1234+
PMIx_server_finalize();
1235+
exit(PRTE_ERROR_DEFAULT_EXIT_CODE);
12331236
}
12341237
}
12351238
}
@@ -1430,56 +1433,6 @@ int prte(int argc, char *argv[])
14301433
exit(prte_exit_status);
14311434
}
14321435

1433-
static void clean_abort(int fd, short flags, void *arg)
1434-
{
1435-
PRTE_HIDE_UNUSED_PARAMS(fd, flags);
1436-
1437-
if (keepalive && NULL == arg) {
1438-
// ignore this
1439-
return;
1440-
}
1441-
1442-
/* if we have already ordered this once, don't keep
1443-
* doing it to avoid race conditions
1444-
*/
1445-
if (pmix_mutex_trylock(&prun_abort_inprogress_lock)) { /* returns 1 if already locked */
1446-
if (forcibly_die) {
1447-
/* exit with a non-zero status */
1448-
exit(1);
1449-
}
1450-
fprintf(stderr,
1451-
"%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n",
1452-
prte_tool_basename);
1453-
forcibly_die = true;
1454-
/* reset the event */
1455-
prte_event_add(&term_handler, NULL);
1456-
return;
1457-
}
1458-
1459-
fflush(stderr);
1460-
/* ensure we exit with a non-zero status */
1461-
PRTE_UPDATE_EXIT_STATUS(PRTE_ERROR_DEFAULT_EXIT_CODE);
1462-
/* ensure that the forwarding of stdin stops */
1463-
prte_dvm_abort_ordered = true;
1464-
/* tell us to be quiet - hey, the user killed us with a ctrl-c,
1465-
* so need to tell them that!
1466-
*/
1467-
prte_execute_quiet = true;
1468-
prte_abnormal_term_ordered = true;
1469-
/* We are in an event handler; the job completed procedure
1470-
will delete the signal handler that is currently running
1471-
(which is a Bad Thing), so we can't call it directly.
1472-
Instead, we have to exit this handler and setup to call
1473-
job_completed() after this. */
1474-
prte_plm.terminate_orteds();
1475-
if (NULL != arg) {
1476-
PMIX_RELEASE(arg);
1477-
}
1478-
}
1479-
1480-
static bool first = true;
1481-
static bool second = true;
1482-
14831436
static void surekill(void)
14841437
{
14851438
prte_proc_t *child;
@@ -1514,6 +1467,47 @@ static void surekill(void)
15141467
}
15151468
}
15161469

1470+
static void clean_abort(int fd, short flags, void *arg)
1471+
{
1472+
PRTE_HIDE_UNUSED_PARAMS(fd, flags);
1473+
1474+
if (keepalive && NULL == arg) {
1475+
// ignore this
1476+
return;
1477+
}
1478+
1479+
/* if we have already ordered this once, don't keep
1480+
* doing it to avoid race conditions
1481+
*/
1482+
if (pmix_mutex_trylock(&prun_abort_inprogress_lock)) { /* returns 1 if already locked */
1483+
if (forcibly_die) {
1484+
/* exit with a non-zero status */
1485+
exit(1);
1486+
}
1487+
fprintf(stderr,
1488+
"%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n",
1489+
prte_tool_basename);
1490+
forcibly_die = true;
1491+
/* reset the event */
1492+
prte_event_add(&term_handler, NULL);
1493+
return;
1494+
}
1495+
1496+
fflush(stderr);
1497+
prte_finalizing = true;
1498+
/* ensure we exit with a non-zero status */
1499+
#ifdef PRTE_PMIX_STOP_PRGTHRD
1500+
PMIx_Progress_thread_stop(NULL, 0);
1501+
#endif
1502+
surekill(); // ensure we attempt to kill everything
1503+
prte_job_session_dir_finalize(NULL);
1504+
PMIx_server_finalize();
1505+
exit(PRTE_ERROR_DEFAULT_EXIT_CODE);
1506+
}
1507+
1508+
static bool first = true;
1509+
static bool second = true;
1510+
15171511
/*
15181512
* Attempt to terminate the job and wait for callback indicating
15191513
* the job has been aborted.
@@ -1541,7 +1535,12 @@ static void abort_signal_callback(int fd)
15411535
second = false;
15421536
} else {
15431537
surekill(); // ensure we attempt to kill everything
1538+
prte_finalizing = true;
1539+
#ifdef PRTE_PMIX_STOP_PRGTHRD
1540+
PMIx_Progress_thread_stop(NULL, 0);
1541+
#endif
15441542
prte_job_session_dir_finalize(NULL);
1543+
PMIx_server_finalize();
15451544
exit(1);
15461545
}
15471546
}

0 commit comments

Comments
 (0)