Skip to content

Commit 8109dca

Browse files
committed
btl/openib: fix locking bugs with XRC ib_addr lock
This bug fixes two issue with the ib_addr lock: - The ib_addr lock must always be obtained regardless of opal_using_threads() as the CPC is run in a seperate thread. - The ib_addr lock is held in mca_btl_openib_endpoint_connected when calling back into the CPC start_connect on any pending connections. This will attempt to obtain the ib_addr lock again. Since this is not a performance-critical part of the code the lock has been changed to be recursive. Signed-off-by: Nathan Hjelm <[email protected]> (cherry picked from commit open-mpi/ompi@371df45) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 0043b0e commit 8109dca

File tree

2 files changed

+10
-4
lines changed

2 files changed

+10
-4
lines changed

opal/mca/btl/openib/btl_openib_endpoint.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved.
14-
* Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
14+
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
1515
* reserved.
1616
* Copyright (c) 2006-2007 Voltaire All rights reserved.
1717
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
@@ -579,7 +579,7 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
579579

580580
opal_output(-1, "Now we are CONNECTED");
581581
if (MCA_BTL_XRC_ENABLED) {
582-
OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
582+
opal_mutex_lock (&endpoint->ib_addr->addr_lock);
583583
if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
584584
/* We are not xrc master */
585585
/* set our qp pointer to master qp */
@@ -622,7 +622,7 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
622622
}
623623
}
624624
}
625-
OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
625+
opal_mutex_unlock (&endpoint->ib_addr->addr_lock);
626626
}
627627

628628

opal/mca/btl/openib/btl_openib_xrc.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
34
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
45
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
56
* Copyright (c) 2014-2015 Research Organization for Information Science
67
* and Technology (RIST). All rights reserved.
78
* Copyright (c) 2014 Bull SAS. All rights reserved.
9+
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
10+
* reserved.
811
* $COPYRIGHT$
912
*
1013
* Additional copyrights may follow
@@ -122,7 +125,10 @@ static void ib_address_constructor(ib_address_t *ib_addr)
122125
ib_addr->lid = 0;
123126
ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
124127
ib_addr->qp = NULL;
125-
OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t);
128+
/* NTH: make the addr_lock recursive because mca_btl_openib_endpoint_connected can call
129+
* into the CPC with the lock held. The alternative would be to drop the lock but the
130+
* lock is never obtained in a critical path. */
131+
OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_recursive_mutex_t);
126132
OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t);
127133
}
128134

0 commit comments

Comments
 (0)