Skip to content

Commit 429bdf1

Browse files
committed
oob/tcp: fix a race condition when finalizing the oob/tcp component
1 parent e380f8c commit 429bdf1

File tree

1 file changed

+40
-4
lines changed

1 file changed

+40
-4
lines changed

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
1717
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
1818
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
19+
* Copyright (c) 2015 Research Organization for Information Science
20+
* and Technology (RIST). All rights reserved.
1921
* $COPYRIGHT$
2022
*
2123
* Additional copyrights may follow
@@ -68,6 +70,7 @@
6870
#include "orte/util/parse_options.h"
6971
#include "orte/util/show_help.h"
7072
#include "orte/runtime/orte_globals.h"
73+
#include "orte/runtime/orte_wait.h"
7174

7275
#include "orte/mca/oob/tcp/oob_tcp.h"
7376
#include "orte/mca/oob/tcp/oob_tcp_component.h"
@@ -630,10 +633,22 @@ static int component_startup(void)
630633
return rc;
631634
}
632635

636+
static void cleanup(int sd, short args, void *cbdata)
637+
{
638+
opal_list_item_t * item;
639+
bool *active = (bool*)cbdata;
640+
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.listeners))) {
641+
OBJ_RELEASE(item);
642+
}
643+
if (NULL != active) {
644+
*active = false;
645+
}
646+
}
647+
633648
static void component_shutdown(void)
634649
{
635650
int i = 0;
636-
opal_list_item_t *item;
651+
bool active;
637652

638653
opal_output_verbose(2, orte_oob_base_framework.framework_output,
639654
"%s TCP SHUTDOWN",
@@ -644,16 +659,37 @@ static void component_shutdown(void)
644659
/* tell the thread to exit */
645660
write(mca_oob_tcp_component.stop_thread[1], &i, sizeof(int));
646661
opal_thread_join(&mca_oob_tcp_component.listen_thread, NULL);
662+
} else {
663+
opal_output_verbose(2, orte_oob_base_framework.framework_output,
664+
"no hnp or not active");
647665
}
648666

649-
while (NULL != (item = opal_list_remove_first(&mca_oob_tcp_component.listeners))) {
650-
OBJ_RELEASE(item);
651-
}
667+
/* because the listeners are in a separate
668+
* async thread for apps, we can't just release them here.
669+
* Instead, we push it into that event thread and release
670+
* them there */
671+
if (ORTE_PROC_IS_APP) {
672+
opal_event_t ev;
673+
active = true;
674+
opal_event_set(orte_event_base, &ev, -1,
675+
OPAL_EV_WRITE, cleanup, &active);
676+
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
677+
opal_event_active(&ev, OPAL_EV_WRITE, 1);
678+
ORTE_WAIT_FOR_COMPLETION(active);
679+
} else {
680+
/* we can call the destruct directly */
681+
cleanup(0, 0, NULL);
682+
}
683+
opal_output_verbose(2, orte_oob_base_framework.framework_output,
684+
"all listeners released");
652685

653686
/* shutdown the module */
654687
if (NULL != mca_oob_tcp_module.api.finalize) {
655688
mca_oob_tcp_module.api.finalize();
656689
}
690+
opal_output_verbose(2, orte_oob_base_framework.framework_output,
691+
"%s TCP SHUTDOWN done",
692+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
657693
}
658694

659695
static int component_send(orte_rml_send_t *msg)

0 commit comments

Comments
 (0)