3838 * isolates multi-nic/vci complications from bootstrapping phase.
3939 */
4040
/* With MPIDI_OFI_ENABLE_AV_TABLE, we can potentially omit storing the av tables.
 * The following routines ensure we can do that. They are static for now, but we
 * can easily export them globally when we need to.
 */
45-
/* NUM_VCIS_FOR_RANK(r): number of vcis counted for rank r when laying out
 * address entries. */
#if !defined(MPIDI_OFI_VNI_USE_DOMAIN) || MPIDI_CH4_MAX_VCIS == 1
/* NOTE: with scalable endpoint as context, all vcis share the same address.
 * The argument is not evaluated in this configuration. */
#define NUM_VCIS_FOR_RANK(r) 1
#else
/* Expects an `int *all_num_vcis` array, indexed by rank, to be visible at the
 * point of expansion. Argument and expansion are parenthesized so the macro
 * composes safely inside larger expressions. */
#define NUM_VCIS_FOR_RANK(r) (all_num_vcis[(r)])
#endif
52-
53- ATTRIBUTE ((unused ))
54- static int get_root_av_table_index (int rank )
55- {
56- if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI ) {
57- /* node roots with greater ranks are inserted before this rank if it a non-node-root */
58- int num_extra = 0 ;
59-
60- /* check node roots */
61- for (int i = 0 ; i < MPIR_Process .num_nodes ; i ++ ) {
62- if (MPIR_Process .node_root_map [i ] == rank ) {
63- return i ;
64- } else if (MPIR_Process .node_root_map [i ] > rank ) {
65- num_extra ++ ;
66- }
67- }
68-
69- /* must be non-node-root */
70- return rank + num_extra ;
71- } else {
72- return rank ;
73- }
74- }
75-
76- ATTRIBUTE ((unused ))
77- static int get_av_table_index (int rank , int nic , int vci , int * all_num_vcis )
78- {
79- if (nic == 0 && vci == 0 ) {
80- return get_root_av_table_index (rank );
81- } else {
82- int num_nics = MPIDI_OFI_global .num_nics ;
83- int idx = 0 ;
84- idx += MPIR_Process .size ; /* root entries */
85- for (int i = 0 ; i < rank ; i ++ ) {
86- idx += num_nics * NUM_VCIS_FOR_RANK (i ) - 1 ;
87- }
88- idx += nic * NUM_VCIS_FOR_RANK (rank ) + vci - 1 ;
89- return idx ;
90- }
91- }
92-
9341/* Step 1: exchange root contexts */
9442int MPIDI_OFI_addr_exchange_root_ctx (void )
9543{
@@ -170,14 +118,6 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
170118 MPIR_ERR_CHECK (mpi_errno );
171119 }
172120
173- /* check */
174- if (MPIDI_OFI_ENABLE_AV_TABLE ) {
175- for (int r = 0 ; r < size ; r ++ ) {
176- MPIDI_av_entry_t * av ATTRIBUTE ((unused )) = & MPIDIU_get_av (0 , r );
177- MPIR_Assert (MPIDI_OFI_AV_ROOT_ADDR (av ) == get_root_av_table_index (r ));
178- }
179- }
180-
181121 fn_exit :
182122 if (init_comm && !mpi_errno ) {
183123 MPIDI_destroy_init_comm (& init_comm );
@@ -186,185 +126,3 @@ int MPIDI_OFI_addr_exchange_root_ctx(void)
186126 fn_fail :
187127 goto fn_exit ;
188128}
189-
190- /* Step 2 & 3: exchange non-root contexts */
191-
/* Macros to reduce clutter, so we can focus on the ordering logic.
 * Note: they are not perfectly wrapped, but that is tolerable since they are only used here. */
/* Declares, in the enclosing scope, `av` (the av entry of `rank`) and
 * `r_names` (start of that rank's slot inside `all_names`).
 * Requires `all_names`, `max_vcis`, `num_nics` and `name_len` to be in scope. */
#define GET_AV_AND_ADDRNAMES(rank) \
    MPIDI_av_entry_t *av ATTRIBUTE((unused)) = &MPIDIU_get_av(0, (rank)); \
    char *r_names = all_names + (rank) * max_vcis * num_nics * name_len;

/* Declares `addr` in the enclosing scope and inserts the name of (nic, vci),
 * taken from `r_names`, into the av of context `ctx_idx`; the resulting
 * fi_addr_t is left in `addr`. Requires `r_names`, `num_nics` and `name_len`
 * to be in scope. Because it declares a variable, use it at most once per
 * block. */
#define DO_AV_INSERT(ctx_idx, nic, vci) \
    fi_addr_t addr; \
    MPIDI_OFI_CALL(fi_av_insert(MPIDI_OFI_global.ctx[(ctx_idx)].av, \
                                r_names + ((vci) * num_nics + (nic)) * name_len, 1, \
                                &addr, 0ULL, NULL), avmap);

/* Skips the current loop iteration for the root context (nic 0, vci 0).
 * Must be expanded directly inside a loop body since it uses `continue`. */
#define SKIP_ROOT(nic, vci) \
    if ((nic) == 0 && (vci) == 0) { \
        continue; \
    }
208-
209- int MPIDI_OFI_addr_exchange_all_ctx (void )
210- {
211- int mpi_errno = MPI_SUCCESS ;
212-
213- MPIR_Comm * comm = MPIR_Process .comm_world ;
214- int size = MPIR_Process .size ;
215- int rank = MPIR_Process .rank ;
216- MPIR_CHKLMEM_DECL ();
217-
218- int max_vcis ;
219- int * all_num_vcis ;
220-
221- #if !defined(MPIDI_OFI_VNI_USE_DOMAIN ) || MPIDI_CH4_MAX_VCIS == 1
222- max_vcis = 1 ;
223- all_num_vcis = NULL ;
224- #else
225- /* Allgather num_vcis */
226- MPIR_CHKLMEM_MALLOC (all_num_vcis , sizeof (int ) * size );
227- mpi_errno = MPIR_Allgather_fallback (& MPIDI_OFI_global .num_vcis , 1 , MPIR_INT_INTERNAL ,
228- all_num_vcis , 1 , MPIR_INT_INTERNAL , comm , MPIR_ERR_NONE );
229- MPIR_ERR_CHECK (mpi_errno );
230-
231- max_vcis = 0 ;
232- for (int i = 0 ; i < size ; i ++ ) {
233- if (max_vcis < NUM_VCIS_FOR_RANK (i )) {
234- max_vcis = NUM_VCIS_FOR_RANK (i );
235- }
236- }
237- #endif
238-
239- int num_vcis = NUM_VCIS_FOR_RANK (rank );
240- int num_nics = MPIDI_OFI_global .num_nics ;
241-
242- /* Assume num_nics are all equal */
243- if (max_vcis * num_nics == 1 ) {
244- goto fn_exit ;
245- }
246-
247- /* allocate additional av addrs */
248- for (int i = 0 ; i < size ; i ++ ) {
249- MPIDI_av_entry_t * av = & MPIDIU_get_av (0 , i );
250- MPIDI_OFI_AV (av ).all_dest = MPL_malloc (max_vcis * num_nics * sizeof (fi_addr_t ),
251- MPL_MEM_ADDRESS );
252- MPIR_ERR_CHKANDJUMP (!MPIDI_OFI_AV (av ).all_dest , mpi_errno , MPI_ERR_OTHER , "**nomem" );
253- }
254-
255- /* libfabric uses uniform name_len within a single provider */
256- int name_len = MPIDI_OFI_global .addrnamelen ;
257- int my_len = max_vcis * num_nics * name_len ;
258- char * all_names ;
259- MPIR_CHKLMEM_MALLOC (all_names , size * my_len );
260- char * my_names = all_names + rank * my_len ;
261-
262- /* put in my addrnames */
263- for (int nic = 0 ; nic < num_nics ; nic ++ ) {
264- for (int vci = 0 ; vci < num_vcis ; vci ++ ) {
265- size_t actual_name_len = name_len ;
266- char * vci_addrname = my_names + (vci * num_nics + nic ) * name_len ;
267- int ctx_idx = MPIDI_OFI_get_ctx_index (vci , nic );
268- MPIDI_OFI_CALL (fi_getname ((fid_t ) MPIDI_OFI_global .ctx [ctx_idx ].ep , vci_addrname ,
269- & actual_name_len ), getname );
270- MPIR_Assert (actual_name_len == name_len );
271- }
272- }
273- /* Allgather */
274- mpi_errno = MPIR_Allgather_fallback (MPI_IN_PLACE , 0 , MPIR_BYTE_INTERNAL ,
275- all_names , my_len , MPIR_BYTE_INTERNAL , comm , MPIR_ERR_NONE );
276-
277- /* Step 2: insert and store non-root nic/vci on the root context */
278- int root_ctx_idx = MPIDI_OFI_get_ctx_index (0 , 0 );
279- for (int r = 0 ; r < size ; r ++ ) {
280- GET_AV_AND_ADDRNAMES (r );
281- for (int nic = 0 ; nic < num_nics ; nic ++ ) {
282- for (int vci = 0 ; vci < NUM_VCIS_FOR_RANK (r ); vci ++ ) {
283- SKIP_ROOT (nic , vci );
284- DO_AV_INSERT (root_ctx_idx , nic , vci );
285- MPIDI_OFI_AV_ADDR (av , 0 , 0 , vci , nic ) = addr ;
286- }
287- }
288- }
289-
290- /* Step 3: insert all nic/vci on non-root context, following exact order as step 1 and 2 */
291-
292- int * is_node_roots = NULL ;
293- if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI ) {
294- MPIR_CHKLMEM_MALLOC (is_node_roots , size * sizeof (int ));
295- for (int r = 0 ; r < size ; r ++ ) {
296- is_node_roots [r ] = 0 ;
297- }
298- for (int i = 0 ; i < MPIR_Process .num_nodes ; i ++ ) {
299- is_node_roots [MPIR_Process .node_root_map [i ]] = 1 ;
300- }
301- }
302-
303- for (int nic_local = 0 ; nic_local < num_nics ; nic_local ++ ) {
304- for (int vci_local = 0 ; vci_local < num_vcis ; vci_local ++ ) {
305- SKIP_ROOT (nic_local , vci_local );
306- int ctx_idx = MPIDI_OFI_get_ctx_index (vci_local , nic_local );
307-
308- /* -- same order as step 1 -- */
309- if (MPIR_CVAR_CH4_ROOTS_ONLY_PMI ) {
310- /* node roots */
311- for (int r = 0 ; r < size ; r ++ ) {
312- if (is_node_roots [r ]) {
313- GET_AV_AND_ADDRNAMES (r );
314- DO_AV_INSERT (ctx_idx , 0 , 0 );
315- MPIR_Assert (MPIDI_OFI_AV_ROOT_ADDR (av ) == addr );
316- }
317- }
318- /* non-node-root */
319- for (int r = 0 ; r < size ; r ++ ) {
320- if (!is_node_roots [r ]) {
321- GET_AV_AND_ADDRNAMES (r );
322- DO_AV_INSERT (ctx_idx , 0 , 0 );
323- MPIR_Assert (MPIDI_OFI_AV_ROOT_ADDR (av ) == addr );
324- }
325- }
326- } else {
327- /* !MPIR_CVAR_CH4_ROOTS_ONLY_PMI */
328- for (int r = 0 ; r < size ; r ++ ) {
329- GET_AV_AND_ADDRNAMES (r );
330- DO_AV_INSERT (ctx_idx , 0 , 0 );
331- MPIR_Assert (MPIDI_OFI_AV_ROOT_ADDR (av ) == addr );
332- }
333- }
334-
335- /* -- same order as step 2 -- */
336- for (int r = 0 ; r < size ; r ++ ) {
337- GET_AV_AND_ADDRNAMES (r );
338- for (int nic = 0 ; nic < num_nics ; nic ++ ) {
339- for (int vci = 0 ; vci < NUM_VCIS_FOR_RANK (r ); vci ++ ) {
340- SKIP_ROOT (nic , vci );
341- DO_AV_INSERT (ctx_idx , nic , vci );
342- MPIR_Assert (MPIDI_OFI_AV_ADDR (av , 0 , 0 , vci , nic ) == addr );
343- }
344- }
345- }
346- }
347- }
348- mpi_errno = MPIR_Barrier_fallback (comm , MPIR_ERR_NONE );
349- MPIR_ERR_CHECK (mpi_errno );
350-
351- /* check */
352- #if MPIDI_CH4_MAX_VCIS > 1
353- if (MPIDI_OFI_ENABLE_AV_TABLE ) {
354- for (int r = 0 ; r < size ; r ++ ) {
355- MPIDI_av_entry_t * av ATTRIBUTE ((unused )) = & MPIDIU_get_av (0 , r );
356- for (int nic = 0 ; nic < num_nics ; nic ++ ) {
357- for (int vci = 0 ; vci < NUM_VCIS_FOR_RANK (r ); vci ++ ) {
358- MPIR_Assert (MPIDI_OFI_AV_ADDR (av , 0 , 0 , vci , nic ) ==
359- get_av_table_index (r , nic , vci , all_num_vcis ));
360- }
361- }
362- }
363- }
364- #endif
365- fn_exit :
366- MPIR_CHKLMEM_FREEALL ();
367- return mpi_errno ;
368- fn_fail :
369- goto fn_exit ;
370- }
0 commit comments