Skip to content

Commit 3da939b

Browse files
authored
Merge pull request open-mpi#7248 from wckzhang/v4.0.x
MTL/OFI: Check threshold number of peers allowed per rank
2 parents 6985a55 + 1bee429 commit 3da939b

File tree

4 files changed

+30
-3
lines changed

4 files changed

+30
-3
lines changed

ompi/mca/mtl/ofi/mtl_ofi.c

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,9 +54,22 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
5454
char *ep_names = NULL;
5555
fi_addr_t *fi_addrs = NULL;
5656
mca_mtl_ofi_endpoint_t *endpoint = NULL;
57+
int num_peers_limit = (1 << ompi_mtl_ofi.num_bits_source_rank) - 1;
5758

5859
namelen = ompi_mtl_ofi.epnamelen;
5960

61+
/* We cannot add more ranks than available tag bits */
62+
if ((false == ompi_mtl_ofi.fi_cq_data) &&
63+
OPAL_UNLIKELY(((int) (nprocs + ompi_mtl_ofi.num_peers) > num_peers_limit))) {
64+
opal_output(0, "%s:%d: OFI provider: %s does not have enough bits for source rank in its tag.\n"
65+
"Adding more ranks will result in undefined behaviour. Please enable\n"
66+
"FI_REMOTE_CQ_DATA feature in the provider. For more info refer fi_cq(3).\n",
67+
__FILE__, __LINE__, ompi_mtl_ofi.provider_name);
68+
fflush(stderr);
69+
ret = OMPI_ERROR;
70+
goto bail;
71+
}
72+
6073
/**
6174
* Create array of EP names.
6275
*/
@@ -126,6 +139,9 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
126139
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
127140
}
128141

142+
/* Update global counter of number of procs added to this rank */
143+
ompi_mtl_ofi.num_peers += nprocs;
144+
129145
ret = OMPI_SUCCESS;
130146

131147
bail:

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414

1515
#include "mtl_ofi.h"
1616
#include "opal/util/argv.h"
17-
#include "opal/util/show_help.h"
17+
#include "opal/util/printf.h"
1818

1919
static int ompi_mtl_ofi_component_open(void);
2020
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
@@ -576,6 +576,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
576576
ompi_mtl_ofi_define_tag_mode(ofi_tag_mode);
577577
}
578578

579+
ompi_mtl_ofi.num_peers = 0;
580+
579581
/**
580582
* Open fabric
581583
* The getinfo struct returns a fabric attribute struct that can be used to
@@ -709,6 +711,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
709711
goto error;
710712
}
711713

714+
ompi_mtl_ofi.provider_name = strdup(prov->fabric_attr->prov_name);
715+
712716
/**
713717
* Free providers info since it's not needed anymore.
714718
*/

ompi/mca/mtl/ofi/mtl_ofi_endpoint.h

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,13 @@ typedef struct mca_mtl_ofi_endpoint_t mca_mtl_ofi_endpoint_t;
4141
static inline mca_mtl_ofi_endpoint_t *ompi_mtl_ofi_get_endpoint (struct mca_mtl_base_module_t* mtl, ompi_proc_t *ompi_proc)
4242
{
4343
if (OPAL_UNLIKELY(NULL == ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL])) {
44-
ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc);
44+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_mtl_ofi_add_procs(mtl, 1, &ompi_proc))) {
45+
/* Fatal error. exit() out */
46+
opal_output(0, "%s:%d: *** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
47+
__FILE__, __LINE__);
48+
fflush(stderr);
49+
exit(1);
50+
}
4551
}
4652

4753
return ompi_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ typedef struct mca_mtl_ofi_module_t {
4343
/** "Any source" address */
4444
fi_addr_t any_addr;
4545

46-
/** Optional user-specified OFI provider name */
46+
/** OFI provider name */
4747
char *provider_name;
4848

4949
/** Maximum inject size */
@@ -64,6 +64,7 @@ typedef struct mca_mtl_ofi_module_t {
6464
unsigned long long source_rank_mask;
6565
unsigned long long mpi_tag_mask;
6666
int num_bits_mpi_tag;
67+
int num_peers;
6768

6869
/** Synchronous protocol tag bits */
6970
unsigned long long sync_send;

0 commit comments

Comments
 (0)