|
7 | 7 | #include "ofi_impl.h" |
8 | 8 | #include "ofi_init.h" |
9 | 9 |
|
| 10 | +/* NOTE on av insertion order: |
| 11 | + * |
| 12 | + * Each nic-vci is an endpoint with a unique address, and each endpoint maintains |
| 13 | + * one av table inside libfabric. Thus to fully store the address mapping, we'll need a multi-dim table as |
| 14 | + * av_table[src_vci][src_nic][dest_rank][dest_vci][dest_nic] |
| 15 | + * Note, this table is for illustration, and different from MPIDI_OFI_addr_t. |
| 16 | + * |
| 17 | + * However, if we insert the addresses in the same order between local endpoints, then the |
| 18 | + * av table indexes will be *identical*. Then, we can omit the dimension of [src_vci][src_nic]. |
| 19 | + * I.e. we only need av_table[rank][vci][nic], saving two dimensions of local vcis and local nics. |
| 20 | + * |
| 21 | + * To achieve that, we always need to insert each remote address on *all* local endpoints together. |
| 22 | + * Because we handle the root addr (av_table[0][0][rank][0][0]) separately, we allow the root |
| 23 | + * address to be inserted separately from the rest. The rest of the addresses are only |
| 24 | + * needed when multiple vcis/nics are enabled. But we require that, for each remote rank, all remote |
| 25 | + * endpoints be inserted at once. |
| 26 | + */ |
| 27 | + |
10 | 28 | int MPIDI_OFI_vci_init(void) |
11 | 29 | { |
12 | 30 | MPIDI_OFI_global.num_nics = 1; |
@@ -173,7 +191,7 @@ static int setup_additional_vcis(void) |
173 | 191 | #endif |
174 | 192 |
|
175 | 193 | #define GET_AV_AND_ADDRNAMES(rank) \ |
176 | | - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, rank)); \ |
| 194 | + MPIDI_av_entry_t *av ATTRIBUTE((unused)) = &MPIDIU_get_av(0, rank); \ |
177 | 195 | char *r_names = all_names + rank * max_vcis * num_nics * name_len; |
178 | 196 |
|
179 | 197 | #define DO_AV_INSERT(ctx_idx, nic, vci) \ |
@@ -237,9 +255,9 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm) |
237 | 255 | int mpi_errno = MPI_SUCCESS; |
238 | 256 | MPIR_CHKLMEM_DECL(); |
239 | 257 |
|
240 | | - MPIR_Comm *comm = MPIR_Process.comm_world; |
241 | | - int size = MPIR_Process.size; |
242 | | - int rank = MPIR_Process.rank; |
| 258 | + MPIR_Assert(comm == MPIR_Process.comm_world); |
| 259 | + int size = comm->local_size; |
| 260 | + int rank = comm->rank; |
243 | 261 |
|
244 | 262 | int max_vcis; |
245 | 263 | int *all_num_vcis; |
@@ -270,9 +288,10 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm) |
270 | 288 | goto fn_exit; |
271 | 289 | } |
272 | 290 |
|
273 | | - /* allocate additional av addrs */ |
| 291 | + /* allocate all_dest[] in av entry */ |
274 | 292 | for (int i = 0; i < size; i++) { |
275 | | - MPIDI_av_entry_t *av = &MPIDIU_get_av(0, i); |
| 293 | + MPIDI_av_entry_t *av = MPIDIU_comm_rank_to_av(comm, i); |
| 294 | + MPIR_Assert(MPIDI_OFI_AV(av).all_dest == NULL); |
276 | 295 | MPIDI_OFI_AV(av).all_dest = MPL_malloc(max_vcis * num_nics * sizeof(fi_addr_t), |
277 | 296 | MPL_MEM_ADDRESS); |
278 | 297 | MPIR_ERR_CHKANDJUMP(!MPIDI_OFI_AV(av).all_dest, mpi_errno, MPI_ERR_OTHER, "**nomem"); |
@@ -300,94 +319,49 @@ static int addr_exchange_all_ctx(MPIR_Comm * comm) |
300 | 319 | mpi_errno = MPIR_Allgather_fallback(MPI_IN_PLACE, 0, MPIR_BYTE_INTERNAL, |
301 | 320 | all_names, my_len, MPIR_BYTE_INTERNAL, comm, MPIR_ERR_NONE); |
302 | 321 |
|
303 | | - /* Step 2: insert and store non-root nic/vci on the root context */ |
304 | | - int root_ctx_idx = MPIDI_OFI_get_ctx_index(0, 0); |
| 322 | + /* insert and store non-root nic/vci on the root context */ |
305 | 323 | for (int r = 0; r < size; r++) { |
| 324 | + fi_addr_t expect_addr = FI_ADDR_NOTAVAIL; |
| 325 | + fi_addr_t root_offset = 0; |
306 | 326 | GET_AV_AND_ADDRNAMES(r); |
307 | 327 | for (int nic = 0; nic < num_nics; nic++) { |
308 | 328 | for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) { |
309 | | - SKIP_ROOT(nic, vci); |
310 | | - DO_AV_INSERT(root_ctx_idx, nic, vci); |
311 | | - MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) = addr; |
312 | | - } |
313 | | - } |
314 | | - } |
315 | | - |
316 | | - /* Step 3: insert all nic/vci on non-root context, following exact order as step 1 and 2 */ |
317 | | - |
318 | | - int *is_node_roots = NULL; |
319 | | - if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI) { |
320 | | - MPIR_CHKLMEM_MALLOC(is_node_roots, size * sizeof(int)); |
321 | | - for (int r = 0; r < size; r++) { |
322 | | - is_node_roots[r] = 0; |
323 | | - } |
324 | | - for (int i = 0; i < MPIR_Process.num_nodes; i++) { |
325 | | - is_node_roots[MPIR_Process.node_root_map[i]] = 1; |
326 | | - } |
327 | | - } |
328 | | - |
329 | | - for (int nic_local = 0; nic_local < num_nics; nic_local++) { |
330 | | - for (int vci_local = 0; vci_local < num_vcis; vci_local++) { |
331 | | - SKIP_ROOT(nic_local, vci_local); |
332 | | - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic_local); |
333 | | - |
334 | | - /* -- same order as step 1 -- */ |
335 | | - if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI) { |
336 | | - /* node roots */ |
337 | | - for (int r = 0; r < size; r++) { |
338 | | - if (is_node_roots[r]) { |
339 | | - GET_AV_AND_ADDRNAMES(r); |
340 | | - DO_AV_INSERT(ctx_idx, 0, 0); |
341 | | - MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr); |
342 | | - } |
343 | | - } |
344 | | - /* non-node-root */ |
345 | | - for (int r = 0; r < size; r++) { |
346 | | - if (!is_node_roots[r]) { |
347 | | - GET_AV_AND_ADDRNAMES(r); |
348 | | - DO_AV_INSERT(ctx_idx, 0, 0); |
349 | | - MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr); |
350 | | - } |
351 | | - } |
352 | | - } else { |
353 | | - /* !MPIR_CVAR_CH4_ROOTS_ONLY_PMI */ |
354 | | - for (int r = 0; r < size; r++) { |
355 | | - GET_AV_AND_ADDRNAMES(r); |
356 | | - DO_AV_INSERT(ctx_idx, 0, 0); |
357 | | - MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr); |
358 | | - } |
359 | | - } |
360 | | - |
361 | | - /* -- same order as step 2 -- */ |
362 | | - for (int r = 0; r < size; r++) { |
363 | | - GET_AV_AND_ADDRNAMES(r); |
364 | | - for (int nic = 0; nic < num_nics; nic++) { |
365 | | - for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) { |
366 | | - SKIP_ROOT(nic, vci); |
| 329 | + /* for each local endpoint */ |
| 330 | + for (int nic_local = 0; nic_local < num_nics; nic_local++) { |
| 331 | + for (int vci_local = 0; vci_local < num_vcis; vci_local++) { |
| 332 | + /* skip root */ |
| 333 | + if (nic == 0 && vci == 0 && nic_local == 0 && vci_local == 0) { |
| 334 | + continue; |
| 335 | + } |
| 336 | + int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic_local); |
367 | 337 | DO_AV_INSERT(ctx_idx, nic, vci); |
368 | | - MPIR_Assert(MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) == addr); |
| 338 | + /* we expect all resulting addrs to be the same except for the local root endpoint, which |
| 339 | + * will have an offset */ |
| 340 | + if (expect_addr == FI_ADDR_NOTAVAIL) { |
| 341 | + expect_addr = addr; |
| 342 | + } else if (nic_local == 0 && vci_local == 0) { |
| 343 | + if (root_offset == 0) { |
| 344 | + root_offset = addr - expect_addr; |
| 345 | + } else { |
| 346 | + MPIR_Assert(addr == expect_addr + root_offset); |
| 347 | + } |
| 348 | + } else { |
| 349 | + MPIR_Assert(addr == expect_addr); |
| 350 | + } |
369 | 351 | } |
370 | 352 | } |
| 353 | + MPIR_Assert(expect_addr != FI_ADDR_NOTAVAIL); |
| 354 | + MPIDI_OFI_AV_ADDR_NO_OFFSET(av, vci, nic) = expect_addr; |
| 355 | + /* next */ |
| 356 | + expect_addr++; |
371 | 357 | } |
372 | 358 | } |
| 359 | + MPIDI_OFI_AV(av).root_offset = root_offset; |
373 | 360 | } |
| 361 | + |
374 | 362 | mpi_errno = MPIR_Barrier_fallback(comm, MPIR_ERR_NONE); |
375 | 363 | MPIR_ERR_CHECK(mpi_errno); |
376 | 364 |
|
377 | | - /* check */ |
378 | | -#if MPIDI_CH4_MAX_VCIS > 1 |
379 | | - if (MPIDI_OFI_ENABLE_AV_TABLE) { |
380 | | - for (int r = 0; r < size; r++) { |
381 | | - MPIDI_OFI_addr_t *av ATTRIBUTE((unused)) = &MPIDI_OFI_AV(&MPIDIU_get_av(0, r)); |
382 | | - for (int nic = 0; nic < num_nics; nic++) { |
383 | | - for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) { |
384 | | - MPIR_Assert(MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) == |
385 | | - get_av_table_index(r, nic, vci, all_num_vcis)); |
386 | | - } |
387 | | - } |
388 | | - } |
389 | | - } |
390 | | -#endif |
391 | 365 | fn_exit: |
392 | 366 | MPIR_CHKLMEM_FREEALL(); |
393 | 367 | return mpi_errno; |
|
0 commit comments