Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit e142840

Browse files
authored
Merge pull request #1305 from hjelmn/v2.x_ugni
btl/ugni: protect against re-entry and races in connections
2 parents 7887c58 + e89e847 commit e142840

File tree

3 files changed

+33
-10
lines changed

3 files changed

+33
-10
lines changed

opal/mca/btl/ugni/btl_ugni_component.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -398,8 +398,8 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
398398
/* check for datagram completion */
399399
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */
400400
grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
401-
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
402401
if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
402+
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
403403
return 0;
404404
}
405405

@@ -415,7 +415,6 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
415415
}
416416

417417
/* wait for the incoming datagram to complete (in case it isn't) */
418-
OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */
419418
grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state,
420419
&remote_addr, &remote_id);
421420
OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
@@ -457,7 +456,14 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
457456
data, (void *) ep, remote_id));
458457

459458
/* NTH: TODO -- error handling */
459+
opal_mutex_lock (&ep->lock);
460+
if (handle != ugni_module->wildcard_ep) {
461+
/* directed post complete */
462+
ep->dg_posted = false;
463+
}
464+
460465
(void) mca_btl_ugni_ep_connect_progress (ep);
466+
opal_mutex_unlock (&ep->lock);
461467

462468
if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
463469
/* process messages waiting in the endpoint's smsg mailbox */

opal/mca/btl/ugni/btl_ugni_endpoint.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,11 +200,14 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) {
200200

201201
if (GNI_SMSG_TYPE_INVALID == ep->remote_attr.smsg_attr.msg_type) {
202202
/* use datagram to exchange connection information with the remote peer */
203-
rc = mca_btl_ugni_directed_ep_post (ep);
204-
if (OPAL_SUCCESS == rc) {
205-
rc = OPAL_ERR_RESOURCE_BUSY;
203+
if (!ep->dg_posted) {
204+
rc = mca_btl_ugni_directed_ep_post (ep);
205+
if (OPAL_SUCCESS == rc) {
206+
ep->dg_posted = true;
207+
rc = OPAL_ERR_RESOURCE_BUSY;
208+
}
209+
return rc;
206210
}
207-
return rc;
208211
}
209212

210213
return mca_btl_ugni_ep_connect_finish (ep);

opal/mca/btl/ugni/btl_ugni_endpoint.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
3+
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
66
* $COPYRIGHT$
@@ -17,6 +17,7 @@
1717

1818
enum mca_btl_ugni_endpoint_state_t {
1919
MCA_BTL_UGNI_EP_STATE_INIT = 0,
20+
MCA_BTL_UGNI_EP_STATE_START,
2021
MCA_BTL_UGNI_EP_STATE_RDMA,
2122
MCA_BTL_UGNI_EP_STATE_CONNECTING,
2223
MCA_BTL_UGNI_EP_STATE_CONNECTED
@@ -30,7 +31,10 @@ typedef struct mca_btl_base_endpoint_t {
3031

3132
opal_proc_t *peer_proc;
3233

33-
opal_mutex_t lock;
34+
/** may need to lock recursively as the modex lookup could call opal_progress
35+
* and hence our progress function. if this changes modify this mutex to not
36+
* be recursive. also need to update the constructor function. */
37+
opal_recursive_mutex_t lock;
3438
mca_btl_ugni_endpoint_state_t state;
3539

3640
opal_common_ugni_endpoint_t *common;
@@ -48,6 +52,8 @@ typedef struct mca_btl_base_endpoint_t {
4852

4953
opal_list_t frag_wait_list;
5054
bool wait_listed;
55+
/** protect against race on connection */
56+
bool dg_posted;
5157

5258
int32_t smsg_progressing;
5359

@@ -74,7 +80,6 @@ static inline int mca_btl_ugni_init_ep (mca_btl_ugni_module_t *ugni_module,
7480

7581
endpoint->btl = btl;
7682
endpoint->peer_proc = peer_proc;
77-
endpoint->common = NULL;
7883
endpoint->index = opal_pointer_array_add (&ugni_module->endpoints, endpoint);
7984

8085
*ep = endpoint;
@@ -116,6 +121,7 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep
116121
switch (ep->state) {
117122
case MCA_BTL_UGNI_EP_STATE_INIT:
118123
case MCA_BTL_UGNI_EP_STATE_RDMA:
124+
case MCA_BTL_UGNI_EP_STATE_START:
119125
rc = mca_btl_ugni_ep_connect_progress (ep);
120126
if (OPAL_SUCCESS != rc) {
121127
break;
@@ -139,7 +145,15 @@ static inline int mca_btl_ugni_ep_connect_rdma (mca_btl_base_endpoint_t *ep) {
139145
return OPAL_SUCCESS;
140146
}
141147

142-
/* get the modex info for this endpoint and setup a ugni endpoint */
148+
/* protect against re-entry from opal_progress */
149+
if (OPAL_UNLIKELY(MCA_BTL_UGNI_EP_STATE_START == ep->state)) {
150+
return OPAL_ERR_RESOURCE_BUSY;
151+
}
152+
153+
ep->state = MCA_BTL_UGNI_EP_STATE_START;
154+
155+
/* get the modex info for this endpoint and setup a ugni endpoint. this call may lead
156+
* to re-entry through opal_progress(). */
143157
rc = opal_common_ugni_endpoint_for_proc (ep->btl->device, ep->peer_proc, &ep->common);
144158
if (OPAL_SUCCESS != rc) {
145159
assert (0);

0 commit comments

Comments
 (0)