Skip to content

Commit b49cbf4

Browse files
yosefe and jsquyres
authored and committed
ucx: disable version 1.8
Signed-off-by: Yossi Itigin <[email protected]>
1 parent 46a4bc3 commit b49cbf4

File tree

2 files changed

+68
-43
lines changed

2 files changed

+68
-43
lines changed

config/ompi_check_ucx.m4

Lines changed: 52 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -104,44 +104,58 @@ AC_DEFUN([OMPI_CHECK_UCX],[
104104
old_CPPFLAGS="$CPPFLAGS"
105105
AS_IF([test -n "$ompi_check_ucx_dir"],
106106
[CPPFLAGS="$CPPFLAGS -I$ompi_check_ucx_dir/include"])
107-
AC_CHECK_DECLS([ucp_tag_send_nbr],
108-
[AC_DEFINE([HAVE_UCP_TAG_SEND_NBR],[1],
109-
[have ucp_tag_send_nbr()])], [],
110-
[#include <ucp/api/ucp.h>])
111-
AC_CHECK_DECLS([ucp_ep_flush_nb, ucp_worker_flush_nb,
112-
ucp_request_check_status, ucp_put_nb, ucp_get_nb,
113-
ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx],
114-
[], [],
115-
[#include <ucp/api/ucp.h>])
116-
AC_CHECK_DECLS([ucm_test_events,
117-
ucm_test_external_events],
118-
[], [],
119-
[#include <ucm/api/ucm.h>])
120-
AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND,
121-
UCP_ATOMIC_POST_OP_OR,
122-
UCP_ATOMIC_POST_OP_XOR,
123-
UCP_ATOMIC_FETCH_OP_FAND,
124-
UCP_ATOMIC_FETCH_OP_FOR,
125-
UCP_ATOMIC_FETCH_OP_FXOR,
126-
UCP_PARAM_FIELD_ESTIMATED_NUM_PPN],
127-
[], [],
128-
[#include <ucp/api/ucp.h>])
129-
AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS],
130-
[AC_DEFINE([HAVE_UCP_WORKER_ADDRESS_FLAGS], [1],
131-
[have worker address attribute])], [],
132-
[#include <ucp/api/ucp.h>])
133-
AC_CHECK_DECLS([UCP_ATTR_FIELD_MEMORY_TYPES],
134-
[AC_DEFINE([HAVE_UCP_ATTR_MEMORY_TYPES], [1],
135-
[have memory types attribute])], [],
136-
[#include <ucp/api/ucp.h>])
137-
AC_CHECK_DECLS([ucp_tag_send_nbx,
138-
ucp_tag_send_sync_nbx,
139-
ucp_tag_recv_nbx],
140-
[], [],
141-
[#include <ucp/api/ucp.h>])
142-
AC_CHECK_TYPES([ucp_request_param_t],
143-
[], [],
144-
[[#include <ucp/api/ucp.h>]])
107+
# Turn off UCX version v1.8 due to issue #8321
108+
AC_MSG_CHECKING([UCX version])
109+
AC_PREPROC_IFELSE([AC_LANG_PROGRAM([#include <ucp/api/ucp_version.h>
110+
#if (UCP_API_MAJOR == 1) && (UCP_API_MINOR == 8)
111+
#error "Invalid version"
112+
#endif], [])],
113+
[AC_MSG_RESULT([ok (not 1.8.x)])],
114+
[AC_MSG_RESULT([bad (1.8.x)])
115+
AC_MSG_WARN([UCX support skipped because version 1.8.x was found, which has a known catastrophic issue.])
116+
AC_MSG_WARN([Please upgrade to UCX version 1.9 or higher.])
117+
ompi_check_ucx_happy=no])
118+
AS_IF([test "$ompi_check_ucx_happy" = yes],
119+
[
120+
AC_CHECK_DECLS([ucp_tag_send_nbr],
121+
[AC_DEFINE([HAVE_UCP_TAG_SEND_NBR],[1],
122+
[have ucp_tag_send_nbr()])], [],
123+
[#include <ucp/api/ucp.h>])
124+
AC_CHECK_DECLS([ucp_ep_flush_nb, ucp_worker_flush_nb,
125+
ucp_request_check_status, ucp_put_nb, ucp_get_nb,
126+
ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx],
127+
[], [],
128+
[#include <ucp/api/ucp.h>])
129+
AC_CHECK_DECLS([ucm_test_events,
130+
ucm_test_external_events],
131+
[], [],
132+
[#include <ucm/api/ucm.h>])
133+
AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND,
134+
UCP_ATOMIC_POST_OP_OR,
135+
UCP_ATOMIC_POST_OP_XOR,
136+
UCP_ATOMIC_FETCH_OP_FAND,
137+
UCP_ATOMIC_FETCH_OP_FOR,
138+
UCP_ATOMIC_FETCH_OP_FXOR,
139+
UCP_PARAM_FIELD_ESTIMATED_NUM_PPN],
140+
[], [],
141+
[#include <ucp/api/ucp.h>])
142+
AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS],
143+
[AC_DEFINE([HAVE_UCP_WORKER_ADDRESS_FLAGS], [1],
144+
[have worker address attribute])], [],
145+
[#include <ucp/api/ucp.h>])
146+
AC_CHECK_DECLS([UCP_ATTR_FIELD_MEMORY_TYPES],
147+
[AC_DEFINE([HAVE_UCP_ATTR_MEMORY_TYPES], [1],
148+
[have memory types attribute])], [],
149+
[#include <ucp/api/ucp.h>])
150+
AC_CHECK_DECLS([ucp_tag_send_nbx,
151+
ucp_tag_send_sync_nbx,
152+
ucp_tag_recv_nbx],
153+
[], [],
154+
[#include <ucp/api/ucp.h>])
155+
AC_CHECK_TYPES([ucp_request_param_t],
156+
[], [],
157+
[[#include <ucp/api/ucp.h>]])
158+
])
145159
CPPFLAGS=$old_CPPFLAGS
146160

147161
OPAL_SUMMARY_ADD([[Transports]],[[Open UCX]],[$1],[$ompi_check_ucx_happy])])])

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -190,12 +190,23 @@ static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
190190

191191
int mca_pml_ucx_open(void)
192192
{
193+
unsigned major_version, minor_version, release_number;
193194
ucp_context_attr_t attr;
194195
ucp_params_t params;
195196
ucp_config_t *config;
196197
ucs_status_t status;
197198

198-
PML_UCX_VERBOSE(1, "mca_pml_ucx_open");
199+
/* Check version */
200+
ucp_get_version(&major_version, &minor_version, &release_number);
201+
PML_UCX_VERBOSE(1, "mca_pml_ucx_open: UCX version %u.%u.%u",
202+
major_version, minor_version, release_number);
203+
204+
if ((major_version == 1) && (minor_version == 8)) {
205+
/* disabled due to issue #8321 */
206+
PML_UCX_VERBOSE(1, "UCX PML is disabled because the run-time UCX version "
207+
"is 1.8, which has a known catastrophic issue");
208+
return OMPI_ERROR;
209+
}
199210

200211
/* Read options */
201212
status = ucp_config_read("MPI", NULL, &config);
@@ -690,7 +701,7 @@ int mca_pml_ucx_isend_init(const void *buf, size_t count, ompi_datatype_t *datat
690701
}
691702

692703
static ucs_status_ptr_t
693-
mca_pml_ucx_bsend(ucp_ep_h ep, const void *buf, size_t count,
704+
mca_pml_ucx_bsend(ucp_ep_h ep, const void *buf, size_t count,
694705
ompi_datatype_t *datatype, uint64_t pml_tag)
695706
{
696707
ompi_request_t *req;
@@ -713,7 +724,7 @@ mca_pml_ucx_bsend(ucp_ep_h ep, const void *buf, size_t count,
713724
PML_UCX_ERROR("bsend: failed to allocate buffer");
714725
return UCS_STATUS_PTR(OMPI_ERROR);
715726
}
716-
727+
717728
iov_count = 1;
718729
iov.iov_base = packed_data;
719730
iov.iov_len = packed_length;
@@ -801,8 +812,8 @@ int mca_pml_ucx_isend(const void *buf, size_t count, ompi_datatype_t *datatype,
801812
ompi_request_t *req;
802813
ucp_ep_h ep;
803814

804-
PML_UCX_TRACE_SEND("i%ssend request *%p",
805-
buf, count, datatype, dst, tag, mode, comm,
815+
PML_UCX_TRACE_SEND("i%ssend request *%p",
816+
buf, count, datatype, dst, tag, mode, comm,
806817
mode == MCA_PML_BASE_SEND_BUFFERED ? "b" : "",
807818
(void*)request)
808819

0 commit comments

Comments
 (0)