Skip to content

Commit 0043b0e

Browse files
committed
btl/openib: XRC fix bug that could cause an invalid SRQ# to be used
This commit fixes a bug that occurs when attempting a get or put operation on an endpoint that is not already connected. In this case the remote_srqn may be set to an invalid value as the rem_srqs array on the endpoint is not populated. This commit moves the usage of the rem_srqs array to the internal put/get functions where it is guaranteed this array is populated. Signed-off-by: Nathan Hjelm <[email protected]> (cherry picked from commit open-mpi/ompi@4dc73d7) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 5137cea commit 0043b0e

File tree

3 files changed

+31
-35
lines changed

3 files changed

+31
-35
lines changed

opal/mca/btl/openib/btl_openib_atomic.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
3+
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2015 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
@@ -73,16 +73,7 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st
7373

7474
frag->sr_desc.wr.atomic.rkey = rkey;
7575

76-
#if HAVE_XRC
77-
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
78-
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
79-
frag->sr_desc.qp_type.xrc.remote_srqn = endpoint->rem_info.rem_srqs[qp].rem_srq_num;
80-
#else
81-
frag->sr_desc.xrc_remote_srq_num = endpoint->rem_info.rem_srqs[qp].rem_srq_num;
82-
#endif
83-
84-
}
85-
#endif
76+
/* NTH: the SRQ# is set in mca_btl_get_internal */
8677

8778
if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
8879
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);

opal/mca/btl/openib/btl_openib_get.c

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
15-
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
15+
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2006-2007 Voltaire All rights reserved.
1818
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
@@ -92,16 +92,6 @@ int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint
9292
frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
9393
}
9494

95-
#if HAVE_XRC
96-
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
97-
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
98-
frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
99-
#else
100-
frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
101-
#endif
102-
}
103-
#endif
104-
10595
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
10696
OPAL_THREAD_LOCK(&ep->endpoint_lock);
10797
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
@@ -138,6 +128,19 @@ int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base
138128
int qp = to_base_frag(frag)->base.order;
139129
struct ibv_send_wr *bad_wr;
140130

131+
#if HAVE_XRC
132+
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
133+
/* NTH: the remote SRQ number is only available once the endpoint is connected. By
134+
* setting the value here instead of mca_btl_openib_get we guarantee the rem_srqs
135+
* array is initialized. */
136+
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
137+
frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
138+
#else
139+
frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
140+
#endif
141+
}
142+
#endif
143+
141144
/* check for a send wqe */
142145
if (qp_get_wqe(ep, qp) < 0) {
143146
qp_put_wqe(ep, qp);

opal/mca/btl/openib/btl_openib_put.c

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -101,19 +101,6 @@ int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint
101101
to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
102102
}
103103

104-
#if HAVE_XRC
105-
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
106-
107-
#if OPAL_HAVE_CONNECTX_XRC
108-
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
109-
#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS
110-
to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
111-
#else
112-
#error "that should never happen"
113-
#endif
114-
}
115-
#endif
116-
117104
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
118105
OPAL_THREAD_LOCK(&ep->endpoint_lock);
119106
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
@@ -153,6 +140,21 @@ int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base
153140
struct ibv_send_wr *bad_wr;
154141
int rc;
155142

143+
#if HAVE_XRC
144+
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
145+
/* NTH: the remote SRQ number is only available once the endpoint is connected. By
146+
* setting the value here instead of mca_btl_openib_put we guarantee the rem_srqs
147+
* array is initialized. */
148+
#if OPAL_HAVE_CONNECTX_XRC
149+
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
150+
#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS
151+
to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
152+
#else
153+
#error "that should never happen"
154+
#endif
155+
}
156+
#endif
157+
156158
/* check for a send wqe */
157159
if (qp_get_wqe(ep, qp) < 0) {
158160
qp_put_wqe(ep, qp);

0 commit comments

Comments
 (0)