Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 923f4dd

Browse files
author
Ralph Castain
committed
Fix the 2.0 branch segfaults on finalize - we need to be in the same thread when closing the RML recv list
1 parent 1d9a61c commit 923f4dd

File tree

1 file changed

+29
-6
lines changed

1 file changed

+29
-6
lines changed

orte/mca/rml/base/rml_base_frame.c

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
66
* reserved.
77
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
8-
* Copyright (c) 2014 Intel Corporation. All rights reserved.
8+
* Copyright (c) 2014-2016 Intel Corporation. All rights reserved.
99
* $COPYRIGHT$
1010
*
1111
* Additional copyrights may follow
@@ -24,6 +24,7 @@
2424

2525
#include "orte/mca/rml/rml.h"
2626
#include "orte/mca/state/state.h"
27+
#include "orte/runtime/orte_wait.h"
2728
#include "orte/util/name_fns.h"
2829

2930
#include "orte/mca/rml/base/base.h"
@@ -72,14 +73,36 @@ static int orte_rml_base_register(mca_base_register_flag_t flags)
7273
return ORTE_SUCCESS;
7374
}
7475

75-
static int orte_rml_base_close(void)
76+
static void cleanup(int sd, short args, void *cbdata)
7677
{
77-
opal_list_item_t *item;
78+
volatile bool *active = (volatile bool*)cbdata;
7879

79-
while (NULL != (item = opal_list_remove_first(&orte_rml_base.posted_recvs))) {
80-
OBJ_RELEASE(item);
80+
OPAL_LIST_DESTRUCT(&orte_rml_base.posted_recvs);
81+
if (NULL != active) {
82+
*active = false;
8183
}
82-
OBJ_DESTRUCT(&orte_rml_base.posted_recvs);
84+
}
85+
86+
static int orte_rml_base_close(void)
87+
{
88+
volatile bool active;
89+
90+
/* because the RML posted recvs list is in a separate
91+
* async thread for apps, we can't just destruct it here.
92+
* Instead, we push it into that event thread and destruct
93+
* it there */
94+
if (ORTE_PROC_IS_APP) {
95+
opal_event_t ev;
96+
active = true;
97+
opal_event_set(orte_event_base, &ev, -1,
98+
OPAL_EV_WRITE, cleanup, (void*)&active);
99+
opal_event_set_priority(&ev, ORTE_ERROR_PRI);
100+
opal_event_active(&ev, OPAL_EV_WRITE, 1);
101+
ORTE_WAIT_FOR_COMPLETION(active);
102+
} else {
103+
/* we can call the destruct directly */
104+
cleanup(0, 0, NULL);
105+
}
83106

84107
OPAL_TIMING_REPORT(orte_rml_base.timing, &tm_rml);
85108

0 commit comments

Comments
 (0)