Skip to content

Commit 2cf0e5d

Browse files
authored
Merge pull request #1837 from hjelmn/rdmacm_fix
btl/openib: fix rdmacm locking bug
2 parents cc2b3e0 + 01d6da3 commit 2cf0e5d

File tree

4 files changed

+9
-2
lines changed

4 files changed

+9
-2
lines changed

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
496496
trigger credit management (because the rd_credits will
497497
still be negative), and Bad Things will happen. */
498498
if (ep->endpoint_posted_recvs) {
499+
/* need to hold the lock for both send_cts and connected */
500+
OPAL_THREAD_LOCK(&ep->endpoint_lock);
499501
if (!ep->endpoint_cts_sent) {
500502
mca_btl_openib_endpoint_send_cts(ep);
501503
}

opal/mca/btl/openib/btl_openib_endpoint.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -537,13 +537,11 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
537537
ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;
538538

539539
/* Send the fragment */
540-
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
541540
if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
542541
BTL_ERROR(("Failed to post CTS send"));
543542
mca_btl_openib_endpoint_invoke_error(endpoint);
544543
}
545544
endpoint->endpoint_cts_sent = true;
546-
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
547545
}
548546

549547
/*
@@ -588,6 +586,9 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
588586
OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
589587
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
590588
mca_btl_openib_endpoint_connected(endpoint);
589+
} else {
590+
/* the caller holds the lock and expects us to drop it */
591+
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
591592
}
592593
}
593594

opal/mca/btl/openib/btl_openib_endpoint.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,11 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*,
342342
void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int);
343343
void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);
344344
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*);
345+
346+
/* the endpoint lock must be held with OPAL_THREAD_LOCK for both CTS and cpc complete */
345347
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint);
346348
void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t*);
349+
347350
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*);
348351
void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*,
349352
mca_btl_base_endpoint_t*,

opal/mca/btl/openib/connect/btl_openib_connect_rdmacm.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,6 +1246,7 @@ static void *local_endpoint_cpc_complete(void *context)
12461246

12471247
OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
12481248
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
1249+
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
12491250
mca_btl_openib_endpoint_cpc_complete(endpoint);
12501251

12511252
return NULL;

0 commit comments

Comments
 (0)