Skip to content

Commit ac717dd

Browse files
committed
ch4/ofi: refactor vci-related initialization code
Move code that is related to multiple-vci setup to ofi_vci.c.
1 parent 5e2c277 commit ac717dd

File tree

5 files changed

+350
-362
lines changed

5 files changed

+350
-362
lines changed

src/mpid/ch4/netmod/ofi/init_addrxchg.c

Lines changed: 0 additions & 242 deletions
Original file line numberDiff line numberDiff line change
@@ -38,58 +38,6 @@
3838
* isolates multi-nic/vci complications from bootstrapping phase.
3939
*/
4040

41-
/* with MPIDI_OFI_ENABLE_AV_TABLE, we potentially can omit storing av tables.
42-
* The following routines ensure we can do that. They are static now, but we can
43-
* easily export to global when we need to.
44-
*/
45-
46-
/* Number of vcis to account for on rank r. */
#if defined(MPIDI_OFI_VNI_USE_DOMAIN) && MPIDI_CH4_MAX_VCIS != 1
/* Expands to a lookup in an int array named all_num_vcis, which must be in
 * scope wherever the macro is used. */
#define NUM_VCIS_FOR_RANK(r) all_num_vcis[r]
#else
/* NOTE: with scalable endpoint as context, all vcis share the same address. */
#define NUM_VCIS_FOR_RANK(r) 1
#endif
52-
53-
/* Return the av table index expected for the root address of `rank`.
 *
 * Without MPIR_CVAR_CH4_ROOTS_ONLY_PMI the index is simply the rank.
 * With it, a node root maps to its node's position in
 * MPIR_Process.node_root_map; any other rank is shifted up by the number of
 * node roots whose rank is greater than it, since those node roots are
 * inserted before this rank.
 */
ATTRIBUTE((unused))
static int get_root_av_table_index(int rank)
{
    if (!MPIR_CVAR_CH4_ROOTS_ONLY_PMI) {
        return rank;
    }

    /* count the node roots that are inserted ahead of a non-node-root rank */
    int roots_above = 0;
    int node = 0;
    while (node < MPIR_Process.num_nodes) {
        if (MPIR_Process.node_root_map[node] == rank) {
            /* rank is itself a node root: its index is its node position */
            return node;
        }
        if (MPIR_Process.node_root_map[node] > rank) {
            roots_above++;
        }
        node++;
    }

    /* not found among the node roots, so rank is a non-node-root */
    return rank + roots_above;
}
75-
76-
/* Return the av table index expected for the address of (rank, nic, vci).
 *
 * The root address (nic 0, vci 0) is delegated to get_root_av_table_index().
 * Any other address is located after the MPIR_Process.size root entries:
 * each lower rank contributes num_nics * NUM_VCIS_FOR_RANK(i) - 1 non-root
 * entries, and within a rank the entries are ordered nic-major with the root
 * entry left out (hence the trailing "- 1").
 *
 * all_num_vcis is only read through NUM_VCIS_FOR_RANK, which may ignore it
 * depending on the build configuration.
 */
ATTRIBUTE((unused))
static int get_av_table_index(int rank, int nic, int vci, int *all_num_vcis)
{
    if (nic == 0 && vci == 0) {
        return get_root_av_table_index(rank);
    }

    const int nics_per_rank = MPIDI_OFI_global.num_nics;

    /* skip over the root entries of all ranks */
    int slot = MPIR_Process.size;

    /* skip over the non-root entries of every lower rank */
    for (int prev = 0; prev < rank; prev++) {
        slot += nics_per_rank * NUM_VCIS_FOR_RANK(prev) - 1;
    }

    /* offset inside this rank's own non-root entries */
    return slot + nic * NUM_VCIS_FOR_RANK(rank) + vci - 1;
}
92-
9341
/* Step 1: exchange root contexts */
9442
int MPIDI_OFI_addr_exchange_root_ctx(void)
9543
{
@@ -170,14 +118,6 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
170118
MPIR_ERR_CHECK(mpi_errno);
171119
}
172120

173-
/* check */
174-
if (MPIDI_OFI_ENABLE_AV_TABLE) {
175-
for (int r = 0; r < size; r++) {
176-
MPIDI_av_entry_t *av ATTRIBUTE((unused)) = &MPIDIU_get_av(0, r);
177-
MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == get_root_av_table_index(r));
178-
}
179-
}
180-
181121
fn_exit:
182122
if (init_comm && !mpi_errno) {
183123
MPIDI_destroy_init_comm(&init_comm);
@@ -186,185 +126,3 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
186126
fn_fail:
187127
goto fn_exit;
188128
}
189-
190-
/* Step 2 & 3: exchange non-root contexts */
191-
192-
/* Macros to reduce clutter, so we can focus on the ordering logic.
 * Note: they are deliberately not wrapped in do { } while (0), which is
 * tolerable since they are only used here:
 *   - GET_AV_AND_ADDRNAMES declares `av` and `r_names` in the caller's scope
 *     and reads the caller's all_names, max_vcis, num_nics and name_len;
 *   - DO_AV_INSERT declares `addr` in the caller's scope and reads the
 *     caller's r_names, num_nics and name_len;
 *   - SKIP_ROOT issues `continue` for the enclosing loop.
 * Macro arguments are parenthesized so that expression arguments keep their
 * intended grouping. */
#define GET_AV_AND_ADDRNAMES(rank) \
    MPIDI_av_entry_t *av ATTRIBUTE((unused)) = &MPIDIU_get_av(0, (rank)); \
    char *r_names = all_names + (rank) * max_vcis * num_nics * name_len;

#define DO_AV_INSERT(ctx_idx, nic, vci) \
    fi_addr_t addr; \
    MPIDI_OFI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[(ctx_idx)].av, \
                                r_names + ((vci) * num_nics + (nic)) * name_len, 1, \
                                &addr, 0ULL, NULL), avmap);

#define SKIP_ROOT(nic, vci) \
    if ((nic) == 0 && (vci) == 0) { \
        continue; \
    }
208-
209-
int MPIDI_OFI_addr_exchange_all_ctx(void)
210-
{
211-
int mpi_errno = MPI_SUCCESS;
212-
213-
MPIR_Comm *comm = MPIR_Process.comm_world;
214-
int size = MPIR_Process.size;
215-
int rank = MPIR_Process.rank;
216-
MPIR_CHKLMEM_DECL();
217-
218-
int max_vcis;
219-
int *all_num_vcis;
220-
221-
#if !defined(MPIDI_OFI_VNI_USE_DOMAIN) || MPIDI_CH4_MAX_VCIS == 1
222-
max_vcis = 1;
223-
all_num_vcis = NULL;
224-
#else
225-
/* Allgather num_vcis */
226-
MPIR_CHKLMEM_MALLOC(all_num_vcis, sizeof(int) * size);
227-
mpi_errno = MPIR_Allgather_fallback(&MPIDI_OFI_global.num_vcis, 1, MPIR_INT_INTERNAL,
228-
all_num_vcis, 1, MPIR_INT_INTERNAL, comm, MPIR_ERR_NONE);
229-
MPIR_ERR_CHECK(mpi_errno);
230-
231-
max_vcis = 0;
232-
for (int i = 0; i < size; i++) {
233-
if (max_vcis < NUM_VCIS_FOR_RANK(i)) {
234-
max_vcis = NUM_VCIS_FOR_RANK(i);
235-
}
236-
}
237-
#endif
238-
239-
int num_vcis = NUM_VCIS_FOR_RANK(rank);
240-
int num_nics = MPIDI_OFI_global.num_nics;
241-
242-
/* Assume num_nics are all equal */
243-
if (max_vcis * num_nics == 1) {
244-
goto fn_exit;
245-
}
246-
247-
/* allocate additional av addrs */
248-
for (int i = 0; i < size; i++) {
249-
MPIDI_av_entry_t *av = &MPIDIU_get_av(0, i);
250-
MPIDI_OFI_AV(av).all_dest = MPL_malloc(max_vcis * num_nics * sizeof(fi_addr_t),
251-
MPL_MEM_ADDRESS);
252-
MPIR_ERR_CHKANDJUMP(!MPIDI_OFI_AV(av).all_dest, mpi_errno, MPI_ERR_OTHER, "**nomem");
253-
}
254-
255-
/* libfabric uses uniform name_len within a single provider */
256-
int name_len = MPIDI_OFI_global.addrnamelen;
257-
int my_len = max_vcis * num_nics * name_len;
258-
char *all_names;
259-
MPIR_CHKLMEM_MALLOC(all_names, size * my_len);
260-
char *my_names = all_names + rank * my_len;
261-
262-
/* put in my addrnames */
263-
for (int nic = 0; nic < num_nics; nic++) {
264-
for (int vci = 0; vci < num_vcis; vci++) {
265-
size_t actual_name_len = name_len;
266-
char *vci_addrname = my_names + (vci * num_nics + nic) * name_len;
267-
int ctx_idx = MPIDI_OFI_get_ctx_index(vci, nic);
268-
MPIDI_OFI_CALL(fi_getname((fid_t) MPIDI_OFI_global.ctx[ctx_idx].ep, vci_addrname,
269-
&actual_name_len), getname);
270-
MPIR_Assert(actual_name_len == name_len);
271-
}
272-
}
273-
/* Allgather */
274-
mpi_errno = MPIR_Allgather_fallback(MPI_IN_PLACE, 0, MPIR_BYTE_INTERNAL,
275-
all_names, my_len, MPIR_BYTE_INTERNAL, comm, MPIR_ERR_NONE);
276-
277-
/* Step 2: insert and store non-root nic/vci on the root context */
278-
int root_ctx_idx = MPIDI_OFI_get_ctx_index(0, 0);
279-
for (int r = 0; r < size; r++) {
280-
GET_AV_AND_ADDRNAMES(r);
281-
for (int nic = 0; nic < num_nics; nic++) {
282-
for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) {
283-
SKIP_ROOT(nic, vci);
284-
DO_AV_INSERT(root_ctx_idx, nic, vci);
285-
MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) = addr;
286-
}
287-
}
288-
}
289-
290-
/* Step 3: insert all nic/vci on non-root context, following exact order as step 1 and 2 */
291-
292-
int *is_node_roots = NULL;
293-
if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI) {
294-
MPIR_CHKLMEM_MALLOC(is_node_roots, size * sizeof(int));
295-
for (int r = 0; r < size; r++) {
296-
is_node_roots[r] = 0;
297-
}
298-
for (int i = 0; i < MPIR_Process.num_nodes; i++) {
299-
is_node_roots[MPIR_Process.node_root_map[i]] = 1;
300-
}
301-
}
302-
303-
for (int nic_local = 0; nic_local < num_nics; nic_local++) {
304-
for (int vci_local = 0; vci_local < num_vcis; vci_local++) {
305-
SKIP_ROOT(nic_local, vci_local);
306-
int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic_local);
307-
308-
/* -- same order as step 1 -- */
309-
if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI) {
310-
/* node roots */
311-
for (int r = 0; r < size; r++) {
312-
if (is_node_roots[r]) {
313-
GET_AV_AND_ADDRNAMES(r);
314-
DO_AV_INSERT(ctx_idx, 0, 0);
315-
MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr);
316-
}
317-
}
318-
/* non-node-root */
319-
for (int r = 0; r < size; r++) {
320-
if (!is_node_roots[r]) {
321-
GET_AV_AND_ADDRNAMES(r);
322-
DO_AV_INSERT(ctx_idx, 0, 0);
323-
MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr);
324-
}
325-
}
326-
} else {
327-
/* !MPIR_CVAR_CH4_ROOTS_ONLY_PMI */
328-
for (int r = 0; r < size; r++) {
329-
GET_AV_AND_ADDRNAMES(r);
330-
DO_AV_INSERT(ctx_idx, 0, 0);
331-
MPIR_Assert(MPIDI_OFI_AV_ROOT_ADDR(av) == addr);
332-
}
333-
}
334-
335-
/* -- same order as step 2 -- */
336-
for (int r = 0; r < size; r++) {
337-
GET_AV_AND_ADDRNAMES(r);
338-
for (int nic = 0; nic < num_nics; nic++) {
339-
for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) {
340-
SKIP_ROOT(nic, vci);
341-
DO_AV_INSERT(ctx_idx, nic, vci);
342-
MPIR_Assert(MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) == addr);
343-
}
344-
}
345-
}
346-
}
347-
}
348-
mpi_errno = MPIR_Barrier_fallback(comm, MPIR_ERR_NONE);
349-
MPIR_ERR_CHECK(mpi_errno);
350-
351-
/* check */
352-
#if MPIDI_CH4_MAX_VCIS > 1
353-
if (MPIDI_OFI_ENABLE_AV_TABLE) {
354-
for (int r = 0; r < size; r++) {
355-
MPIDI_av_entry_t *av ATTRIBUTE((unused)) = &MPIDIU_get_av(0, r);
356-
for (int nic = 0; nic < num_nics; nic++) {
357-
for (int vci = 0; vci < NUM_VCIS_FOR_RANK(r); vci++) {
358-
MPIR_Assert(MPIDI_OFI_AV_ADDR(av, 0, 0, vci, nic) ==
359-
get_av_table_index(r, nic, vci, all_num_vcis));
360-
}
361-
}
362-
}
363-
}
364-
#endif
365-
fn_exit:
366-
MPIR_CHKLMEM_FREEALL();
367-
return mpi_errno;
368-
fn_fail:
369-
goto fn_exit;
370-
}

0 commit comments

Comments
 (0)