Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit ddb21f7

Browse files
authored
Merge pull request #1251 from hjelmn/v2.x_rdmacm
btl/openib: fix rdmacm hang
2 parents bb1c4f3 + ed2bba2 commit ddb21f7

File tree

1 file changed

+11
-8
lines changed

1 file changed

+11
-8
lines changed

opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1175,7 +1175,7 @@ static void *call_disconnect_callback(int fd, int flags, void *v)
11751175
*/
11761176
static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
11771177
{
1178-
rdmacm_contents_t *contents;
1178+
rdmacm_contents_t *contents = NULL, *item;
11791179
opal_event_t event;
11801180

11811181
BTL_VERBOSE(("Start disconnecting..."));
@@ -1195,8 +1195,9 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
11951195
* main thread and service thread.
11961196
*/
11971197
opal_mutex_lock(&client_list_lock);
1198-
OPAL_LIST_FOREACH(contents, &client_list, rdmacm_contents_t) {
1199-
if (endpoint == contents->endpoint) {
1198+
OPAL_LIST_FOREACH(item, &client_list, rdmacm_contents_t) {
1199+
if (endpoint == item->endpoint) {
1200+
contents = item;
12001201
opal_list_remove_item(&client_list, (opal_list_item_t *) contents);
12011202
contents->on_client_list = false;
12021203

@@ -1225,12 +1226,14 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
12251226
opal_atomic_wmb();
12261227
opal_mutex_unlock(&client_list_lock);
12271228

1228-
/* Now wait for all the disconnect callbacks to occur */
1229-
pthread_mutex_lock(&rdmacm_disconnect_lock);
1230-
while (opal_list_get_size (&contents->ids)) {
1231-
pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
1229+
if (NULL != contents) {
1230+
/* Now wait for all the disconnect callbacks to occur */
1231+
pthread_mutex_lock(&rdmacm_disconnect_lock);
1232+
while (opal_list_get_size (&contents->ids)) {
1233+
pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
1234+
}
1235+
pthread_mutex_unlock(&rdmacm_disconnect_lock);
12321236
}
1233-
pthread_mutex_unlock(&rdmacm_disconnect_lock);
12341237

12351238
OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing"));
12361239
return OPAL_SUCCESS;

0 commit comments

Comments
 (0)