11/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ 
22/* 
3-  * Copyright (c) 2011-2014  Los Alamos National Security, LLC. All rights 
3+  * Copyright (c) 2011-2016  Los Alamos National Security, LLC. All rights 
44 *                         reserved. 
55 * Copyright (c) 2011      UT-Battelle, LLC. All rights reserved. 
66 * $COPYRIGHT$ 
1717
1818enum  mca_btl_ugni_endpoint_state_t  {
1919    MCA_BTL_UGNI_EP_STATE_INIT  =  0 ,
20+     MCA_BTL_UGNI_EP_STATE_START ,
2021    MCA_BTL_UGNI_EP_STATE_RDMA ,
2122    MCA_BTL_UGNI_EP_STATE_CONNECTING ,
2223    MCA_BTL_UGNI_EP_STATE_CONNECTED 
@@ -30,7 +31,10 @@ typedef struct mca_btl_base_endpoint_t {
3031
3132    opal_proc_t  * peer_proc ;
3233
33-     opal_mutex_t  lock ;
34+     /** May need to lock recursively, as the modex lookup could call opal_progress 
35+      * and hence our progress function. If this changes, modify this mutex to not 
36+      * be recursive; the constructor function will also need to be updated. */ 
37+     opal_recursive_mutex_t  lock ;
3438    mca_btl_ugni_endpoint_state_t  state ;
3539
3640    opal_common_ugni_endpoint_t  * common ;
@@ -48,6 +52,8 @@ typedef struct mca_btl_base_endpoint_t {
4852
4953    opal_list_t  frag_wait_list ;
5054    bool  wait_listed ;
55+     /** protect against race on connection */ 
56+     bool  dg_posted ;
5157
5258    int32_t  smsg_progressing ;
5359
@@ -74,7 +80,6 @@ static inline int mca_btl_ugni_init_ep (mca_btl_ugni_module_t *ugni_module,
7480
7581    endpoint -> btl  =  btl ;
7682    endpoint -> peer_proc  =  peer_proc ;
77-     endpoint -> common  =  NULL ;
7883    endpoint -> index  =  opal_pointer_array_add  (& ugni_module -> endpoints , endpoint );
7984
8085    * ep  =  endpoint ;
@@ -116,6 +121,7 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep
116121    switch  (ep -> state ) {
117122    case  MCA_BTL_UGNI_EP_STATE_INIT :
118123    case  MCA_BTL_UGNI_EP_STATE_RDMA :
124+     case  MCA_BTL_UGNI_EP_STATE_START :
119125        rc  =  mca_btl_ugni_ep_connect_progress  (ep );
120126        if  (OPAL_SUCCESS  !=  rc ) {
121127            break ;
@@ -139,7 +145,15 @@ static inline int mca_btl_ugni_ep_connect_rdma (mca_btl_base_endpoint_t *ep) {
139145        return  OPAL_SUCCESS ;
140146    }
141147
142-     /* get the modex info for this endpoint and setup a ugni endpoint */ 
148+     /* protect against re-entry from opal_progress */ 
149+     if  (OPAL_UNLIKELY (MCA_BTL_UGNI_EP_STATE_START  ==  ep -> state )) {
150+         return  OPAL_ERR_RESOURCE_BUSY ;
151+     }
152+ 
153+     ep -> state  =  MCA_BTL_UGNI_EP_STATE_START ;
154+ 
155+     /* Get the modex info for this endpoint and set up a ugni endpoint. This call may lead 
156+      * to re-entry through opal_progress(). */ 
143157    rc  =  opal_common_ugni_endpoint_for_proc  (ep -> btl -> device , ep -> peer_proc , & ep -> common );
144158    if  (OPAL_SUCCESS  !=  rc ) {
145159        assert  (0 );
0 commit comments