Skip to content

Commit cc2b3e0

Browse files
authored
Merge pull request #1830 from hjelmn/rdmacm_test
Test for rdmacm hang fix
2 parents 2640271 + 960fcd2 commit cc2b3e0

File tree

1 file changed

+11 -8 lines changed

1 file changed

+11 -8 lines changed

opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1173,7 +1173,7 @@ static void *call_disconnect_callback(int fd, int flags, void *v)
11731173
*/
11741174
static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
11751175
{
1176-
rdmacm_contents_t *contents;
1176+
rdmacm_contents_t *contents = NULL, *item;
11771177
opal_event_t event;
11781178

11791179
BTL_VERBOSE(("Start disconnecting..."));
@@ -1193,8 +1193,9 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
11931193
* main thread and service thread.
11941194
*/
11951195
opal_mutex_lock(&client_list_lock);
1196-
OPAL_LIST_FOREACH(contents, &client_list, rdmacm_contents_t) {
1197-
if (endpoint == contents->endpoint) {
1196+
OPAL_LIST_FOREACH(item, &client_list, rdmacm_contents_t) {
1197+
if (endpoint == item->endpoint) {
1198+
contents = item;
11981199
opal_list_remove_item(&client_list, (opal_list_item_t *) contents);
11991200
contents->on_client_list = false;
12001201

@@ -1223,12 +1224,14 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
12231224
opal_atomic_wmb();
12241225
opal_mutex_unlock(&client_list_lock);
12251226

1226-
/* Now wait for all the disconnect callbacks to occur */
1227-
pthread_mutex_lock(&rdmacm_disconnect_lock);
1228-
while (opal_list_get_size (&contents->ids)) {
1229-
pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
1227+
if (NULL != contents) {
1228+
/* Now wait for all the disconnect callbacks to occur */
1229+
pthread_mutex_lock(&rdmacm_disconnect_lock);
1230+
while (opal_list_get_size (&contents->ids)) {
1231+
pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
1232+
}
1233+
pthread_mutex_unlock(&rdmacm_disconnect_lock);
12301234
}
1231-
pthread_mutex_unlock(&rdmacm_disconnect_lock);
12321235

12331236
OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing"));
12341237
return OPAL_SUCCESS;

0 commit comments

Comments
 (0)