Skip to content

Commit 4d79c93

Browse files
committed
btl/ugni: fix race condition that causes completions to be dropped
The send code in the ugni btl has an optimization that enables it to return 1 (fragment gone) in some cases. This optimization involved removing the btl ownership and callback flags to ensure the fragment stuck around long enough for its completion flag to be checked. This works fine for the single-threaded case but not in the multi-threaded case. It is possible that a fragment will be completed by another thread while a thread is in mca_btl_ugni_send. This competition can lead to a leaked fragment, missed callback, or both. To fix the issue without removing the optimization a reference count has been added to the fragment. Callbacks and fragment release will not be made until the fragment reference count has reach 0. The count is incremented before sending the frag and decremented after the completion flag has been checked. The fix has been verified to work using a multi-threaded RMA benchmark with the osc/pt2pt component. (cherry picked from open-mpi/ompi@cd11fc3) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent c667fda commit 4d79c93

File tree

2 files changed

+40
-15
lines changed

2 files changed

+40
-15
lines changed

opal/mca/btl/ugni/btl_ugni_frag.h

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
3+
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
66
* Copyright (c) 2013 The University of Tennessee and The University
@@ -66,6 +66,7 @@ struct mca_btl_ugni_base_frag_t;
6666

6767
typedef struct mca_btl_ugni_base_frag_t {
6868
mca_btl_base_descriptor_t base;
69+
volatile int32_t ref_cnt;
6970
uint32_t msg_id;
7071
uint16_t hdr_size;
7172
uint16_t flags;
@@ -148,6 +149,7 @@ static inline int mca_btl_ugni_frag_alloc (mca_btl_base_endpoint_t *ep,
148149
if (OPAL_LIKELY(NULL != *frag)) {
149150
(*frag)->my_list = list;
150151
(*frag)->endpoint = ep;
152+
(*frag)->ref_cnt = 1;
151153
return OPAL_SUCCESS;
152154
}
153155

@@ -169,10 +171,16 @@ static inline int mca_btl_ugni_frag_return (mca_btl_ugni_base_frag_t *frag)
169171
return OPAL_SUCCESS;
170172
}
171173

172-
static inline void mca_btl_ugni_frag_complete (mca_btl_ugni_base_frag_t *frag, int rc) {
173-
frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE;
174+
static inline bool mca_btl_ugni_frag_del_ref (mca_btl_ugni_base_frag_t *frag, int rc) {
175+
int32_t ref_cnt;
174176

175-
BTL_VERBOSE(("frag complete. flags = %d", frag->base.des_flags));
177+
opal_atomic_mb ();
178+
179+
ref_cnt = OPAL_THREAD_ADD32(&frag->ref_cnt, -1);
180+
if (ref_cnt) {
181+
assert (ref_cnt > 0);
182+
return false;
183+
}
176184

177185
/* call callback if specified */
178186
if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
@@ -182,6 +190,20 @@ static inline void mca_btl_ugni_frag_complete (mca_btl_ugni_base_frag_t *frag, i
182190
if (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) {
183191
mca_btl_ugni_frag_return (frag);
184192
}
193+
194+
return true;
195+
}
196+
197+
static inline void mca_btl_ugni_frag_complete (mca_btl_ugni_base_frag_t *frag, int rc) {
198+
BTL_VERBOSE(("frag complete. flags = %d", frag->base.des_flags));
199+
200+
frag->flags |= MCA_BTL_UGNI_FRAG_COMPLETE;
201+
202+
mca_btl_ugni_frag_del_ref (frag, rc);
203+
}
204+
205+
static inline bool mca_btl_ugni_frag_check_complete (mca_btl_ugni_base_frag_t *frag) {
206+
return !!(MCA_BTL_UGNI_FRAG_COMPLETE & frag->flags);
185207
}
186208

187209
#define MCA_BTL_UGNI_FRAG_ALLOC_SMSG(ep, frag) \

opal/mca/btl/ugni/btl_ugni_send.c

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
2525
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) descriptor;
2626
size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len;
2727
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
28-
int flags_save = frag->base.des_flags;
2928
int rc;
3029

3130
/* tag and len are at the same location in eager and smsg frag hdrs */
@@ -43,34 +42,38 @@ int mca_btl_ugni_send (struct mca_btl_base_module_t *btl,
4342
BTL_VERBOSE(("btl/ugni sending descriptor %p from %d -> %d. length = %" PRIu64, (void *)descriptor,
4443
OPAL_PROC_MY_NAME.vpid, endpoint->common->ep_rem_id, size));
4544

46-
/* temporarily disable ownership and callback flags so we can reliably check the complete flag */
47-
frag->base.des_flags &= ~(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
45+
/* add a reference to prevent the fragment from being returned until after the
46+
* completion flag is checked. */
47+
++frag->ref_cnt;
4848
frag->flags &= ~MCA_BTL_UGNI_FRAG_COMPLETE;
4949

5050
rc = mca_btl_ugni_send_frag (endpoint, frag);
51-
52-
if (OPAL_LIKELY(frag->flags & MCA_BTL_UGNI_FRAG_COMPLETE)) {
51+
if (OPAL_LIKELY(mca_btl_ugni_frag_check_complete (frag))) {
5352
/* fast path: remote side has received the frag */
54-
frag->base.des_flags = flags_save;
55-
mca_btl_ugni_frag_complete (frag, OPAL_SUCCESS);
53+
(void) mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
5654

5755
return 1;
5856
}
5957

60-
if ((OPAL_SUCCESS == rc) && (frag->flags & MCA_BTL_UGNI_FRAG_BUFFERED) && (flags_save & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
58+
if ((OPAL_SUCCESS == rc) && (frag->flags & MCA_BTL_UGNI_FRAG_BUFFERED) && (frag->flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
6159
/* fast(ish) path: btl owned buffered frag. report send as complete */
62-
frag->base.des_flags = flags_save & ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
60+
bool call_callback = !!(frag->flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
61+
frag->flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
6362

64-
if (OPAL_LIKELY(flags_save & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)) {
63+
if (call_callback) {
6564
frag->base.des_cbfunc(&frag->endpoint->btl->super, frag->endpoint, &frag->base, rc);
6665
}
6766

67+
(void) mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
68+
6869
return 1;
6970
}
7071

7172
/* slow(ish) path: remote side hasn't received the frag. call the frag's callback when
7273
we get the local smsg/msgq or remote rdma completion */
73-
frag->base.des_flags = flags_save | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
74+
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
75+
76+
mca_btl_ugni_frag_del_ref (frag, OPAL_SUCCESS);
7477

7578
if (OPAL_UNLIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
7679
/* queue up request */

0 commit comments

Comments
 (0)