Skip to content

Commit 462e0fc

Browse files
committed
ch4/ofi: relax av insertion order of non-root vcis
We used to require that all remote vcis be inserted on each local vci in exactly the same order. This forced us to initialize all vcis at init time. Relax the restriction by allowing a local non-root vci to have only a subset of the remote vcis inserted, compared with the local root vci. This allows enabling multiple vcis on a sub-communicator rather than always on comm_world. Because we insert all remote endpoints into all local non-root endpoints at the same time, and thus follow the exact same insertion order, they will share the same av table index, except on the local root endpoint, because it has already inserted other remote root endpoints at init time. On the local root endpoint, the av table index of a remote non-root endpoint will have a fixed offset from its index on the local non-root endpoints.
1 parent d6a1df7 commit 462e0fc

File tree

1 file changed

+55
-81
lines changed

1 file changed

+55
-81
lines changed

src/mpid/ch4/netmod/ofi/ofi_vci.c

Lines changed: 55 additions & 81 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,24 @@
77
#include "ofi_impl.h"
88
#include "ofi_init.h"
99

10+
/* NOTE on av insertion order:
11+
*
12+
* Each nic-vci is an endpoint with a unique address, and libfabric internally maintains
13+
* one av table. Thus to fully store the address mapping, we'll need a multi-dim table as
14+
* av_table[src_vci][src_nic][dest_rank][dest_vci][dest_nic]
15+
* Note, this table is for illustration, and different from MPIDI_OFI_addr_t.
16+
*
17+
* However, if we insert the addresses in the same order between local endpoints, then the
18+
* av table indexes will be *identical*. Then, we can omit the dimension of [src_vci][src_nic].
19+
* I.e. we only need av_table[rank][vci][nic], saving two dimensions of local vcis and local nics.
20+
*
21+
* To achieve that, we always need to insert each remote address on *all* local endpoints together.
22+
* Because we keep the root addr (av_table[0][0][rank][0][0]) separate, we allow the root
23+
* address to be inserted separately from the rest. The rest of the addresses are only
24+
* needed when multiple vcis/nics are enabled. But we require that, for each remote rank, all remote
25+
* endpoints be inserted all at once.
26+
*/
27+
1028
int MPIDI_OFI_vci_init(void)
1129
{
1230
MPIDI_OFI_global.num_nics = 1;
@@ -173,7 +191,7 @@ static int setup_additional_vcis(void)
173191
#endif
174192

175193
#define GET_AV_AND_ADDRNAMES(rank) \
176-
MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, rank)); \
194+
MPIDI_av_entry_t *av ATTRIBUTE((unused)) = &MPIDIU_get_av(0, rank); \
177195
char *r_names = all_names + rank * max_vcis * num_nics * name_len;
178196

179197
#define DO_AV_INSERT(ctx_idx, nic, vci) \
@@ -237,9 +255,9 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm)
237255
int mpi_errno = MPI_SUCCESS;
238256
MPIR_CHKLMEM_DECL();
239257

240-
MPIR_Comm *comm = MPIR_Process.comm_world;
241-
int size = MPIR_Process.size;
242-
int rank = MPIR_Process.rank;
258+
MPIR_Assert(comm == MPIR_Process.comm_world);
259+
int size = comm->local_size;
260+
int rank = comm->rank;
243261

244262
int max_vcis;
245263
int *all_num_vcis;
@@ -270,9 +288,10 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm)
270288
goto fn_exit;
271289
}
272290

273-
/* allocate additional av addrs */
291+
/* allocate all_dest[] in av entry */
274292
for (int i = 0; i < size; i++) {
275-
MPIDI_av_entry_t *av = &MPIDIU_get_av(0, i);
293+
MPIDI_av_entry_t *av = MPIDIU_comm_rank_to_av(comm, i);
294+
MPIR_Assert(MPIDI_OFI_AV(av).all_dest == NULL);
276295
MPIDI_OFI_AV(av).all_dest = MPL_malloc(max_vcis * num_nics * sizeof(fi_addr_t),
277296
MPL_MEM_ADDRESS);
278297
MPIR_ERR_CHKANDJUMP(!MPIDI_OFI_AV(av).all_dest, mpi_errno, MPI_ERR_OTHER, "**nomem");
@@ -300,94 +319,49 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm)
300319
mpi_errno = MPIR_Allgather_fallback(MPI_IN_PLACE, 0, MPIR_BYTE_INTERNAL,
301320
all_names, my_len, MPIR_BYTE_INTERNAL, comm, MPIR_ERR_NONE);
302321

303-
/* Step 2: insert and store non-root nic/vci on the root context */
304-
int root_ctx_idx = MPIDI_OFI_get_ctx_index(0, 0);
322+
/* insert and store non-root nic/vci on the root context */
305323
for (int r = 0; r < size; r++) {
324+
fi_addr_t expect_addr = FI_ADDR_NOTAVAIL;
325+
fi_addr_t root_offset = 0;
306326
GET_AV_AND_ADDRNAMES(r);
307327
for (int nic = 0; nic < num_nics; nic++) {
308328
for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) {
309-
SKIP_ROOT(nic, vci);
310-
DO_AV_INSERT(root_ctx_idx, nic, vci);
311-
MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) = addr;
312-
}
313-
}
314-
}
315-
316-
/* Step 3: insert all nic/vci on non-root context, following exact order as step 1 and 2 */
317-
318-
int *is_node_roots = NULL;
319-
if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI) {
320-
MPIR_CHKLMEM_MALLOC(is_node_roots, size * sizeof(int));
321-
for (int r = 0; r < size; r++) {
322-
is_node_roots[r] = 0;
323-
}
324-
for (int i = 0; i < MPIR_Process.num_nodes; i++) {
325-
is_node_roots[MPIR_Process.node_root_map[i]] = 1;
326-
}
327-
}
328-
329-
for (int nic_local = 0; nic_local < num_nics; nic_local++) {
330-
for (int vci_local = 0; vci_local < num_vcis; vci_local++) {
331-
SKIP_ROOT(nic_local, vci_local);
332-
int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic_local);
333-
334-
/* -- same order as step 1 -- */
335-
if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI) {
336-
/* node roots */
337-
for (int r = 0; r < size; r++) {
338-
if (is_node_roots[r]) {
339-
GET_AV_AND_ADDRNAMES(r);
340-
DO_AV_INSERT(ctx_idx, 0, 0);
341-
MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr);
342-
}
343-
}
344-
/* non-node-root */
345-
for (int r = 0; r < size; r++) {
346-
if (!is_node_roots[r]) {
347-
GET_AV_AND_ADDRNAMES(r);
348-
DO_AV_INSERT(ctx_idx, 0, 0);
349-
MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr);
350-
}
351-
}
352-
} else {
353-
/* !MPIR_CVAR_CH4_ROOTS_ONLY_PMI */
354-
for (int r = 0; r < size; r++) {
355-
GET_AV_AND_ADDRNAMES(r);
356-
DO_AV_INSERT(ctx_idx, 0, 0);
357-
MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr);
358-
}
359-
}
360-
361-
/* -- same order as step 2 -- */
362-
for (int r = 0; r < size; r++) {
363-
GET_AV_AND_ADDRNAMES(r);
364-
for (int nic = 0; nic < num_nics; nic++) {
365-
for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) {
366-
SKIP_ROOT(nic, vci);
329+
/* for each local endpoint */
330+
for (int nic_local = 0; nic_local < num_nics; nic_local++) {
331+
for (int vci_local = 0; vci_local < num_vcis; vci_local++) {
332+
/* skip root */
333+
if (nic == 0 && vci == 0 && nic_local == 0 && vci_local == 0) {
334+
continue;
335+
}
336+
int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic_local);
367337
DO_AV_INSERT(ctx_idx, nic, vci);
368-
MPIR_Assert(MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) == addr);
338+
/* we expect all resulting addr to be the same except for local root endpoint, which
339+
* will have an offset */
340+
if (expect_addr == FI_ADDR_NOTAVAIL) {
341+
expect_addr = addr;
342+
} else if (nic_local == 0 && vci_local == 0) {
343+
if (root_offset == 0) {
344+
root_offset = addr - expect_addr;
345+
} else {
346+
MPIR_Assert(addr == expect_addr + root_offset);
347+
}
348+
} else {
349+
MPIR_Assert(addr == expect_addr);
350+
}
369351
}
370352
}
353+
MPIR_Assert(expect_addr != FI_ADDR_NOTAVAIL);
354+
MPIDI_OFI_AV_ADDR_NO_OFFSET(av, vci, nic) = expect_addr;
355+
/* next */
356+
expect_addr++;
371357
}
372358
}
359+
MPIDI_OFI_AV(av).root_offset = root_offset;
373360
}
361+
374362
mpi_errno = MPIR_Barrier_fallback(comm, MPIR_ERR_NONE);
375363
MPIR_ERR_CHECK(mpi_errno);
376364

377-
/* check */
378-
#if MPIDI_CH4_MAX_VCIS > 1
379-
if (MPIDI_OFI_ENABLE_AV_TABLE) {
380-
for (int r = 0; r < size; r++) {
381-
MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, r));
382-
for (int nic = 0; nic < num_nics; nic++) {
383-
for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) {
384-
MPIR_Assert(MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) ==
385-
get_av_table_index(r, nic, vci, all_num_vcis));
386-
}
387-
}
388-
}
389-
}
390-
#endif
391365
fn_exit:
392366
MPIR_CHKLMEM_FREEALL();
393367
return mpi_errno;

0 commit comments

Comments
 (0)