Skip to content

Commit 368da00

Browse files
authored
Merge pull request #6804 from hppritcha/topic/swat_issue_6785
btl/openib: fix issue 6785
2 parents 507fcc9 + 71f240f commit 368da00

File tree

4 files changed

+103
-131
lines changed

4 files changed

+103
-131
lines changed

config/ompi_check_ucx.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,11 @@ AC_DEFUN([OMPI_CHECK_UCX],[
135135
[$1_CPPFLAGS="[$]$1_CPPFLAGS $ompi_check_ucx_CPPFLAGS"
136136
$1_LDFLAGS="[$]$1_LDFLAGS $ompi_check_ucx_LDFLAGS"
137137
$1_LIBS="[$]$1_LIBS $ompi_check_ucx_LIBS"
138+
AC_DEFINE([HAVE_UCX], [1], [have ucx])
138139
$2],
139140
[AS_IF([test ! -z "$with_ucx" && test "$with_ucx" != "no"],
140141
[AC_MSG_ERROR([UCX support requested but not found. Aborting])])
142+
AC_DEFINE([HAVE_UCX], [0], [have ucx])
141143
$3])
142144

143145
OPAL_VAR_SCOPE_POP

opal/mca/btl/openib/btl_openib.c

Lines changed: 64 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
* Copyright (c) 2014-2018 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2014 Bull SAS. All rights reserved
25+
* Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
2526
* $COPYRIGHT$
2627
*
2728
* Additional copyrights may follow
@@ -1040,15 +1041,6 @@ int mca_btl_openib_add_procs(
10401041
int btl_rank = 0;
10411042
volatile mca_btl_base_endpoint_t* endpoint;
10421043

1043-
1044-
if (! openib_btl->allowed) {
1045-
opal_bitmap_clear_all_bits(reachable);
1046-
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
1047-
true, opal_process_info.nodename,
1048-
openib_btl->device_name, openib_btl->port_num);
1049-
return OPAL_SUCCESS;
1050-
}
1051-
10521044
btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt);
10531045
if( 0 > btl_rank ){
10541046
return OPAL_ERR_NOT_FOUND;
@@ -1648,81 +1640,80 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
16481640
return OPAL_SUCCESS;
16491641
}
16501642

1651-
if (openib_btl->allowed) {
1652-
/* Release all QPs */
1653-
if (NULL != openib_btl->device->endpoints) {
1654-
for (ep_index=0;
1655-
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
1656-
ep_index++) {
1657-
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
1643+
/* Release all QPs */
1644+
if (NULL != openib_btl->device->endpoints) {
1645+
for (ep_index=0;
1646+
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
1647+
ep_index++) {
1648+
1649+
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
16581650
ep_index);
1659-
if(!endpoint) {
1660-
BTL_VERBOSE(("In finalize, got another null endpoint"));
1661-
continue;
1662-
}
1663-
if(endpoint->endpoint_btl != openib_btl) {
1664-
continue;
1665-
}
1666-
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
1667-
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
1668-
openib_btl->device->eager_rdma_buffers[i] = NULL;
1669-
OBJ_RELEASE(endpoint);
1670-
}
1651+
if(!endpoint) {
1652+
BTL_VERBOSE(("In finalize, got another null endpoint"));
1653+
continue;
1654+
}
1655+
if(endpoint->endpoint_btl != openib_btl) {
1656+
continue;
1657+
}
1658+
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
1659+
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
1660+
openib_btl->device->eager_rdma_buffers[i] = NULL;
1661+
OBJ_RELEASE(endpoint);
16711662
}
1672-
opal_pointer_array_set_item(openib_btl->device->endpoints,
1673-
ep_index, NULL);
1674-
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
1675-
OBJ_RELEASE(endpoint);
16761663
}
1664+
opal_pointer_array_set_item(openib_btl->device->endpoints,
1665+
ep_index, NULL);
1666+
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
1667+
OBJ_RELEASE(endpoint);
16771668
}
1669+
}
16781670

1679-
/* Release SRQ resources */
1680-
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
1681-
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
1682-
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1683-
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1684-
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1685-
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1686-
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
1687-
opal_mutex_t *lock =
1688-
&mca_btl_openib_component.srq_manager.lock;
1689-
1690-
opal_hash_table_t *srq_addr_table =
1691-
&mca_btl_openib_component.srq_manager.srq_addr_table;
1692-
1693-
opal_mutex_lock(lock);
1694-
if (OPAL_SUCCESS !=
1695-
opal_hash_table_remove_value_ptr(srq_addr_table,
1696-
&openib_btl->qps[qp].u.srq_qp.srq,
1697-
sizeof(struct ibv_srq *))) {
1698-
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
1699-
rc = OPAL_ERROR;
1700-
}
1701-
opal_mutex_unlock(lock);
1702-
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
1703-
BTL_VERBOSE(("Failed to close SRQ %d", qp));
1704-
rc = OPAL_ERROR;
1705-
}
1706-
}
1671+
/* Release SRQ resources */
1672+
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
1673+
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
1674+
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1675+
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1676+
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1677+
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1678+
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
1679+
opal_mutex_t *lock =
1680+
&mca_btl_openib_component.srq_manager.lock;
17071681

1708-
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1709-
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1710-
}
1711-
}
1682+
opal_hash_table_t *srq_addr_table =
1683+
&mca_btl_openib_component.srq_manager.srq_addr_table;
17121684

1713-
/* Finalize the CPC modules on this openib module */
1714-
for (i = 0; i < openib_btl->num_cpcs; ++i) {
1715-
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
1716-
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
1685+
opal_mutex_lock(lock);
1686+
if (OPAL_SUCCESS !=
1687+
opal_hash_table_remove_value_ptr(srq_addr_table,
1688+
&openib_btl->qps[qp].u.srq_qp.srq,
1689+
sizeof(struct ibv_srq *))) {
1690+
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
1691+
rc = OPAL_ERROR;
1692+
}
1693+
opal_mutex_unlock(lock);
1694+
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
1695+
BTL_VERBOSE(("Failed to close SRQ %d", qp));
1696+
rc = OPAL_ERROR;
1697+
}
17171698
}
1718-
free(openib_btl->cpcs[i]);
1699+
1700+
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1701+
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
17191702
}
1720-
free(openib_btl->cpcs);
1703+
}
17211704

1722-
/* Release device if there are no more users */
1723-
if(!(--openib_btl->device->allowed_btls)) {
1724-
OBJ_RELEASE(openib_btl->device);
1705+
/* Finalize the CPC modules on this openib module */
1706+
for (i = 0; i < openib_btl->num_cpcs; ++i) {
1707+
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
1708+
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
17251709
}
1710+
free(openib_btl->cpcs[i]);
1711+
}
1712+
free(openib_btl->cpcs);
1713+
1714+
/* Release device if there are no more users */
1715+
if(!(--openib_btl->device->allowed_btls)) {
1716+
OBJ_RELEASE(openib_btl->device);
17261717
}
17271718

17281719
if (NULL != openib_btl->qps) {

opal/mca/btl/openib/btl_openib.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
* Copyright (c) 2014 Bull SAS. All rights reserved.
2121
* Copyright (c) 2015-2018 Research Organization for Information Science
2222
* and Technology (RIST). All rights reserved.
23+
* Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
24+
*
2325
* $COPYRIGHT$
2426
*
2527
* Additional copyrights may follow
@@ -506,8 +508,6 @@ struct mca_btl_openib_module_t {
506508
int local_procs; /** number of local procs */
507509

508510
bool atomic_ops_be; /** atomic result is big endian */
509-
510-
bool allowed; /** is this port allowed */
511511
};
512512
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
513513

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 35 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
* Copyright (c) 2014-2018 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2014 Bull SAS. All rights reserved.
25+
* Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
2526
* $COPYRIGHT$
2627
*
2728
* Additional copyrights may follow
@@ -278,9 +279,6 @@ static int btl_openib_modex_send(void)
278279
);
279280
/* For each module, add in the size of the per-CPC data */
280281
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
281-
if (! mca_btl_openib_component.openib_btls[i]->allowed) {
282-
continue;
283-
}
284282
for (j = 0;
285283
j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
286284
++j) {
@@ -309,9 +307,6 @@ static int btl_openib_modex_send(void)
309307
/* Pack each of the modules */
310308
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
311309

312-
if (! mca_btl_openib_component.openib_btls[i]->allowed) {
313-
continue;
314-
}
315310
/* Pack the modex common message struct. */
316311
size = modex_message_size;
317312

@@ -633,38 +628,26 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
633628
* unless the user specifically requested to override this
634629
* policy. For ancient OFED, only allow if user has set
635630
* the MCA parameter.
631+
*
632+
* We emit a help message when Open MPI was configured without
633+
* UCX support and the port uses InfiniBand as its link
634+
* layer. If UCX support is available, we don't emit the help message
635+
* since the UCX PML has higher priority than OB1 and this BTL will
636+
* not be used.
636637
*/
637-
if (! mca_btl_openib_component.allow_ib
638+
if (false == mca_btl_openib_component.allow_ib
638639
#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
639640
&& IBV_LINK_LAYER_INFINIBAND == ib_port_attr->link_layer
640641
#endif
641642
) {
642-
openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t));
643-
if(NULL == openib_btl) {
644-
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
645-
return OPAL_ERR_OUT_OF_RESOURCE;
646-
}
647-
memcpy(openib_btl, &mca_btl_openib_module,
648-
sizeof(mca_btl_openib_module));
649-
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
650-
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
651-
openib_btl->port_num = (uint8_t) port_num;
652-
openib_btl->allowed = false;
653-
openib_btl->device = NULL;
654-
openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev));
655-
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
656-
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
657-
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
658-
++device->btls;
659-
++mca_btl_openib_component.ib_num_btls;
660-
if (-1 != mca_btl_openib_component.ib_max_btls &&
661-
mca_btl_openib_component.ib_num_btls >=
662-
mca_btl_openib_component.ib_max_btls) {
663-
return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
664-
}
665-
return OPAL_SUCCESS;
666-
}
667-
643+
#if !HAVE_UCX
644+
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
645+
true, opal_process_info.nodename,
646+
ibv_get_device_name(device->ib_dev),
647+
port_num);
648+
#endif
649+
return OPAL_ERR_NOT_FOUND;
650+
}
668651

669652
/* Ensure that the requested GID index (via the
670653
btl_openib_gid_index MCA param) is within the GID table
@@ -901,8 +884,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
901884
}
902885
}
903886

904-
openib_btl->allowed = true;
905-
906887
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
907888
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
908889
++device->btls;
@@ -2999,29 +2980,27 @@ btl_openib_component_init(int *num_btl_modules,
29992980
ib_selected = (mca_btl_base_selected_module_t*)item;
30002981
openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module;
30012982

3002-
if (openib_btl->allowed) {
3003-
/* Search for a CPC that can handle this port */
3004-
ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
3005-
/* If we get NOT_SUPPORTED, then no CPC was found for this
3006-
port. But that's not a fatal error -- just keep going;
3007-
let's see if we find any usable openib modules or not. */
3008-
if (OPAL_ERR_NOT_SUPPORTED == ret) {
3009-
continue;
3010-
} else if (OPAL_SUCCESS != ret) {
3011-
/* All others *are* fatal. Note that we already did a
3012-
show_help in the lower layer */
3013-
goto no_btls;
3014-
}
2983+
/* Search for a CPC that can handle this port */
2984+
ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
2985+
/* If we get NOT_SUPPORTED, then no CPC was found for this
2986+
port. But that's not a fatal error -- just keep going;
2987+
let's see if we find any usable openib modules or not. */
2988+
if (OPAL_ERR_NOT_SUPPORTED == ret) {
2989+
continue;
2990+
} else if (OPAL_SUCCESS != ret) {
2991+
/* All others *are* fatal. Note that we already did a
2992+
show_help in the lower layer */
2993+
goto no_btls;
2994+
}
30152995

3016-
if (mca_btl_openib_component.max_hw_msg_size > 0 &&
3017-
(uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
3018-
BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
3019-
mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
3020-
}
2996+
if (mca_btl_openib_component.max_hw_msg_size > 0 &&
2997+
(uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
2998+
BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
2999+
mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
3000+
}
30213001

3022-
if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
3023-
goto no_btls;
3024-
}
3002+
if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
3003+
goto no_btls;
30253004
}
30263005

30273006
mca_btl_openib_component.openib_btls[i] = openib_btl;

0 commit comments

Comments
 (0)