Skip to content

Commit e727ba3

Browse files
committed
btl/openib: add support for mlx5 atomic operations
This commit adds support for the fetch-and-add and compare-and-swap atomic operations when using the mlx5 driver. The support is enabled only if the expanded verbs interface is detected; that interface is required because mlx5 HCAs return the atomic result in network (big-endian) byte order, so the result has to be byte-swapped on the host. This support may need to be adjusted if Mellanox merges its changes into upstream verbs. Closes open-mpi#1077. Closes open-mpi#1148. (cherry picked from open-mpi/ompi@02a6c68) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 758328e commit e727ba3

File tree

6 files changed

+83
-8
lines changed

6 files changed

+83
-8
lines changed

config/opal_check_openfabrics.m4

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,23 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS_CM],[
387387
fi
388388
])dnl
389389
390+
dnl OPAL_CHECK_EXP_VERBS(prefix, [action-if-found], [action-if-not-found])
dnl ----------------------------------------------------------------------
dnl Probe for the expanded verbs interface (infiniband/verbs_exp.h).
dnl
dnl   $1  prefix of the calling component.  It is accepted so the macro
dnl       matches its call site, OPAL_CHECK_EXP_VERBS([btl_openib], [], []),
dnl       and is not otherwise used.
dnl   $2  shell code to run when the probe program compiles.
dnl   $3  shell code to run when it does not.
dnl
dnl Results:
dnl   HAVE_EXP_VERBS is defined to 1 or 0 depending on the compile probe.
dnl   AC_CHECK_DECLS defines HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE,
dnl   HAVE_DECL_IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY and
dnl   HAVE_DECL_IBV_EXP_CREATE_QP to 1 or 0.
dnl   AC_CHECK_HEADERS defines HAVE_INFINIBAND_VERBS_EXP_H when the header
dnl   is usable.
AC_DEFUN([OPAL_CHECK_EXP_VERBS],[
    OPAL_VAR_SCOPE_PUSH([have_struct_ibv_exp_send_wr])

    AC_MSG_CHECKING([whether expanded verbs are available])
    AC_TRY_COMPILE([#include <infiniband/verbs_exp.h>], [struct ibv_exp_send_wr;],
                   [have_struct_ibv_exp_send_wr=1
                    AC_MSG_RESULT([yes])],
                   [have_struct_ibv_exp_send_wr=0
                    AC_MSG_RESULT([no])])

    AC_DEFINE_UNQUOTED([HAVE_EXP_VERBS], [$have_struct_ibv_exp_send_wr], [Expanded verbs])
    AC_CHECK_DECLS([IBV_EXP_ATOMIC_HCA_REPLY_BE, IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY, ibv_exp_create_qp], [], [], [#include <infiniband/verbs_exp.h>])
    AC_CHECK_HEADERS([infiniband/verbs_exp.h])
    dnl The variable must be in double quotes so the shell expands it.
    dnl Inside single quotes the literal text is compared with 1 and the
    dnl found action can never run.
    AS_IF([test "$have_struct_ibv_exp_send_wr" = "1"], [$2], [$3])
    OPAL_VAR_SCOPE_POP
])dnl
406+
390407
AC_DEFUN([OPAL_CHECK_MLNX_OPENFABRICS],[
391408
$1_have_mverbs=0
392409
$1_have_mqe=0

opal/mca/btl/openib/btl_openib.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,8 @@ struct mca_btl_openib_module_t {
490490
mca_btl_openib_module_qp_t * qps;
491491

492492
int local_procs; /** number of local procs */
493+
494+
bool atomic_ops_be; /** atomic result is big endian */
493495
};
494496
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
495497

opal/mca/btl/openib/btl_openib_atomic.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st
2727
{
2828
mca_btl_openib_get_frag_t* frag = NULL;
2929
int qp = order;
30+
int32_t rkey;
3031
int rc;
3132

3233
frag = to_get_frag(alloc_recv_user_frag());
@@ -61,15 +62,16 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st
6162
frag->sr_desc.wr.atomic.compare_add = operand;
6263
frag->sr_desc.wr.atomic.swap = operand2;
6364

65+
rkey = remote_handle->rkey;
66+
6467
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
6568
if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
6669
!= (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
67-
frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey);
68-
} else
69-
#endif
70-
{
71-
frag->sr_desc.wr.atomic.rkey = remote_handle->rkey;
70+
rkey = opal_swap_bytes4 (rkey);
7271
}
72+
#endif
73+
74+
frag->sr_desc.wr.atomic.rkey = rkey;
7375

7476
#if HAVE_XRC
7577
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -822,13 +822,26 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
822822
openib_btl->super.btl_get_local_registration_threshold = 0;
823823

824824
#if HAVE_DECL_IBV_ATOMIC_HCA
825-
if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) {
825+
openib_btl->atomic_ops_be = false;
826+
827+
switch (openib_btl->device->ib_dev_attr.atomic_cap) {
828+
case IBV_ATOMIC_GLOB:
829+
openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
830+
break;
831+
#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
832+
case IBV_EXP_ATOMIC_HCA_REPLY_BE:
833+
openib_btl->atomic_ops_be = true;
834+
break;
835+
#endif
836+
case IBV_ATOMIC_HCA:
837+
break;
838+
case IBV_ATOMIC_NONE:
839+
default:
840+
/* no atomics or an unsupported atomic type */
826841
openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS;
827842
openib_btl->super.btl_atomic_flags = 0;
828843
openib_btl->super.btl_atomic_fop = NULL;
829844
openib_btl->super.btl_atomic_cswap = NULL;
830-
} else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) {
831-
openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB;
832845
}
833846
#endif
834847

@@ -3450,6 +3463,11 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
34503463

34513464
mca_btl_openib_get_frag_t *get_frag = to_get_frag(des);
34523465

3466+
/* check if atomic result needs to be byte swapped (mlx5) */
3467+
if (openib_btl->atomic_ops_be && IBV_WC_RDMA_READ != wc->opcode) {
3468+
*((int64_t *) frag->sg_entry.addr) = ntoh64 (*((int64_t *) frag->sg_entry.addr));
3469+
}
3470+
34533471
get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr,
34543472
get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data,
34553473
OPAL_SUCCESS);

opal/mca/btl/openib/configure.m4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[
4646
[btl_openib_happy="yes"
4747
OPAL_CHECK_OPENFABRICS_CM([btl_openib])],
4848
[btl_openib_happy="no"])
49+
OPAL_CHECK_EXP_VERBS([btl_openib], [], [])
4950

5051
AS_IF([test "$btl_openib_happy" = "yes"],
5152
[# With the new openib flags, look for ibv_fork_init

opal/mca/btl/openib/connect/btl_openib_connect_udcm.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@
5656
#include <sys/types.h>
5757
#include <fcntl.h>
5858
#include <infiniband/verbs.h>
59+
#ifdef HAVE_INFINIBAND_VERBS_EXP_H
60+
#include <infiniband/verbs_exp.h>
61+
#endif
5962
#include <signal.h>
6063

6164
#include <pthread.h>
@@ -1307,7 +1310,11 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_
13071310
uint32_t max_send_wr)
13081311
{
13091312
udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep);
1313+
#if HAVE_DECL_IBV_EXP_CREATE_QP
1314+
struct ibv_exp_qp_init_attr init_attr;
1315+
#else
13101316
struct ibv_qp_init_attr init_attr;
1317+
#endif
13111318
size_t req_inline;
13121319
int rc;
13131320

@@ -1328,6 +1335,32 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_
13281335
}
13291336
init_attr.cap.max_send_wr = max_send_wr;
13301337

1338+
#if HAVE_DECL_IBV_EXP_CREATE_QP
1339+
/* use expanded verbs qp create to enable use of mlx5 atomics */
1340+
init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD;
1341+
init_attr.pd = m->btl->device->ib_pd;
1342+
1343+
init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG;
1344+
init_attr.max_atomic_arg = 8;
1345+
1346+
#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
1347+
if (IBV_EXP_ATOMIC_HCA_REPLY_BE == m->btl->device->ib_dev_attr.atomic_cap) {
1348+
init_attr.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY;
1349+
init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS;
1350+
}
1351+
#endif
1352+
1353+
while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_exp_create_qp (m->btl->device->ib_dev_context,
1354+
&init_attr))) {
1355+
/* NTH: this process may be out of registered memory. try evicting an item from
1356+
the lru of this btl's mpool */
1357+
if (false == mca_mpool_grdma_evict (m->btl->super.btl_mpool)) {
1358+
break;
1359+
}
1360+
}
1361+
1362+
#else
1363+
13311364
while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_create_qp(m->btl->device->ib_pd,
13321365
&init_attr))) {
13331366
/* NTH: this process may be out of registered memory. try evicting an item from
@@ -1337,6 +1370,8 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_
13371370
}
13381371
}
13391372

1373+
#endif
1374+
13401375
if (NULL == lcl_ep->qps[qp].qp->lcl_qp) {
13411376
opal_show_help("help-mpi-btl-openib-cpc-base.txt",
13421377
"ibv_create_qp failed", true, opal_process_info.nodename,

0 commit comments

Comments
 (0)