Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions opal/mca/btl/ugni/btl_ugni_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,14 @@ mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
data, (void *) ep, remote_id));

/* NTH: TODO -- error handling */
opal_mutex_lock (&ep->lock);
if (handle != ugni_module->wildcard_ep) {
/* directed post complete */
ep->dg_posted = false;
}

(void) mca_btl_ugni_ep_connect_progress (ep);
opal_mutex_unlock (&ep->lock);

if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
/* process messages waiting in the endpoint's smsg mailbox */
Expand Down
11 changes: 7 additions & 4 deletions opal/mca/btl/ugni/btl_ugni_endpoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,14 @@ int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) {

if (GNI_SMSG_TYPE_INVALID == ep->remote_attr.smsg_attr.msg_type) {
/* use datagram to exchange connection information with the remote peer */
rc = mca_btl_ugni_directed_ep_post (ep);
if (OPAL_SUCCESS == rc) {
rc = OPAL_ERR_RESOURCE_BUSY;
if (!ep->dg_posted) {
rc = mca_btl_ugni_directed_ep_post (ep);
if (OPAL_SUCCESS == rc) {
ep->dg_posted = true;
rc = OPAL_ERR_RESOURCE_BUSY;
}
return rc;
}
return rc;
}

return mca_btl_ugni_ep_connect_finish (ep);
Expand Down
22 changes: 18 additions & 4 deletions opal/mca/btl/ugni/btl_ugni_endpoint.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
Expand All @@ -17,6 +17,7 @@

enum mca_btl_ugni_endpoint_state_t {
MCA_BTL_UGNI_EP_STATE_INIT = 0,
MCA_BTL_UGNI_EP_STATE_START,
MCA_BTL_UGNI_EP_STATE_RDMA,
MCA_BTL_UGNI_EP_STATE_CONNECTING,
MCA_BTL_UGNI_EP_STATE_CONNECTED
Expand All @@ -30,7 +31,10 @@ typedef struct mca_btl_base_endpoint_t {

opal_proc_t *peer_proc;

opal_mutex_t lock;
/** the lock may need to be taken recursively because the modex lookup could call
* opal_progress and hence our progress function. if that changes, make this mutex
* non-recursive and update the constructor function to match. */
opal_recursive_mutex_t lock;
mca_btl_ugni_endpoint_state_t state;

opal_common_ugni_endpoint_t *common;
Expand All @@ -48,6 +52,8 @@ typedef struct mca_btl_base_endpoint_t {

opal_list_t frag_wait_list;
bool wait_listed;
/** protect against race on connection */
bool dg_posted;

int32_t smsg_progressing;

Expand All @@ -74,7 +80,6 @@ static inline int mca_btl_ugni_init_ep (mca_btl_ugni_module_t *ugni_module,

endpoint->btl = btl;
endpoint->peer_proc = peer_proc;
endpoint->common = NULL;
endpoint->index = opal_pointer_array_add (&ugni_module->endpoints, endpoint);

*ep = endpoint;
Expand Down Expand Up @@ -116,6 +121,7 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep
switch (ep->state) {
case MCA_BTL_UGNI_EP_STATE_INIT:
case MCA_BTL_UGNI_EP_STATE_RDMA:
case MCA_BTL_UGNI_EP_STATE_START:
rc = mca_btl_ugni_ep_connect_progress (ep);
if (OPAL_SUCCESS != rc) {
break;
Expand All @@ -139,7 +145,15 @@ static inline int mca_btl_ugni_ep_connect_rdma (mca_btl_base_endpoint_t *ep) {
return OPAL_SUCCESS;
}

/* get the modex info for this endpoint and setup a ugni endpoint */
/* protect against re-entry from opal_progress */
if (OPAL_UNLIKELY(MCA_BTL_UGNI_EP_STATE_START == ep->state)) {
return OPAL_ERR_RESOURCE_BUSY;
}

ep->state = MCA_BTL_UGNI_EP_STATE_START;

/* get the modex info for this endpoint and set up a ugni endpoint. this call may lead
* to re-entry through opal_progress(). */
rc = opal_common_ugni_endpoint_for_proc (ep->btl->device, ep->peer_proc, &ep->common);
if (OPAL_SUCCESS != rc) {
assert (0);
Expand Down