Skip to content

Commit 89be953

Browse files
author
Michael Heinz
committed
REF6976 Silent failure of OMPI over OFI with large message sizes
INTERNAL: STL-59403. The OFI (libfabric) MTL does not respect the maximum message size parameter that OFI provides in the fi_info data. This patch adds the missing max_msg_size field to the mca_mtl_ofi_module_t structure and adds a length check to the low-level send routines. (Cherry-picked from commit 3aca4af.) Change-Id: Ie50445e5edfb0f30916de0836db0edc64ecf7c60 Signed-off-by: Michael Heinz <[email protected]> Reviewed-by: Adam Goldman <[email protected]> Reviewed-by: Brendan Cunningham <[email protected]>
1 parent e547a2b commit 89be953

File tree

4 files changed

+25
-3
lines changed

4 files changed

+25
-3
lines changed

ompi/mca/mtl/ofi/help-mtl-ofi.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@ unusual; your job may behave unpredictably (and/or abort) after this.
1616
Local host: %s
1717
Location: %s:%d
1818
Error: %s (%zd)
19+
[message too big]
20+
Message size %llu bigger than supported by selected transport. Max = %llu

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,13 +308,22 @@ ompi_mtl_ofi_send(struct mca_mtl_base_module_t *mtl,
308308
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
309309

310310
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
311-
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
311+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
312+
return ompi_ret;
313+
}
312314

313315
ofi_req.buffer = (free_after) ? start : NULL;
314316
ofi_req.length = length;
315317
ofi_req.status.MPI_ERROR = OMPI_SUCCESS;
316318
ofi_req.completion_count = 0;
317319

320+
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
321+
opal_show_help("help-mtl-ofi.txt",
322+
"message too big", false,
323+
length, endpoint->mtl_ofi_module->max_msg_size);
324+
return OMPI_ERROR;
325+
}
326+
318327
if (ompi_mtl_ofi.fi_cq_data) {
319328
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
320329
src_addr = endpoint->peer_fiaddr;
@@ -438,13 +447,20 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
438447
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
439448

440449
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
441-
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
450+
if (OMPI_UNLIKELY(OMPI_SUCCESS != ompi_ret)) return ompi_ret;
442451

443452
ofi_req->buffer = (free_after) ? start : NULL;
444453
ofi_req->length = length;
445454
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
446455
ofi_req->completion_count = 1;
447456

457+
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
458+
opal_show_help("help-mtl-ofi.txt",
459+
"message too big", false,
460+
length, endpoint->mtl_ofi_module->max_msg_size);
461+
return OMPI_ERROR;
462+
}
463+
448464
if (ompi_mtl_ofi.fi_cq_data) {
449465
match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag);
450466
src_addr = endpoint->peer_fiaddr;

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,9 +630,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
630630
}
631631

632632
/**
633-
* Save the maximum inject size.
633+
* Save the maximum sizes.
634634
*/
635635
ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
636+
ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
636637

637638
/**
638639
* Create the objects that will be bound to the endpoint.

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ typedef struct mca_mtl_ofi_module_t {
4949
/** Maximum inject size */
5050
size_t max_inject_size;
5151

52+
/** Largest message that can be sent in a single send. */
53+
size_t max_msg_size;
54+
5255
/** Maximum number of CQ events to read in OFI Progress */
5356
int ofi_progress_event_count;
5457

0 commit comments

Comments
 (0)