1212 * All rights reserved.
1313 * Copyright (c) 2006 Sandia National Laboratories. All rights
1414 * reserved.
15- * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
15+ * Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
1616 * Copyright (c) 2014 Los Alamos National Security, LLC. All rights
1717 * reserved.
1818 * Copyright (c) 2014 Intel, Inc. All rights reserved
@@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
6969
7070
7171/*
72- * Loop over all procs sent to us in add_procs and see if we want to
73- * add a proc/endpoint for them.
72+ * Loop over a block of procs sent to us in add_procs and see if we
73+ * want to add a proc/endpoint for them.
7474 */
75- static int add_procs_create_endpoints (opal_btl_usnic_module_t * module ,
76- size_t nprocs ,
77- opal_proc_t * * procs ,
78- mca_btl_base_endpoint_t * * endpoints )
75+ static int add_procs_block_create_endpoints (opal_btl_usnic_module_t * module ,
76+ size_t block_offset ,
77+ size_t block_len ,
78+ opal_proc_t * * procs ,
79+ mca_btl_base_endpoint_t * * endpoints )
7980{
8081 int rc ;
8182 opal_proc_t * my_proc ;
@@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
8788 return OPAL_ERR_OUT_OF_RESOURCE ;
8889 }
8990
90- /* Loop over the procs we were given */
91- for (size_t i = 0 ; i < nprocs ; i ++ ) {
91+ /* Loop over a block in the procs we were given */
92+ for (size_t i = block_offset ; i < ( block_offset + block_len ) ; i ++ ) {
9293 struct opal_proc_t * opal_proc = procs [i ];
9394 opal_btl_usnic_proc_t * usnic_proc ;
9495 mca_btl_base_endpoint_t * usnic_endpoint ;
@@ -195,22 +196,22 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
195196 * invoked. Go reap them all.
196197 */
197198static int
198- add_procs_reap_fi_av_inserts (opal_btl_usnic_module_t * module ,
199- size_t array_len ,
200- struct mca_btl_base_endpoint_t * * endpoints )
199+ add_procs_block_reap_fi_av_inserts (opal_btl_usnic_module_t * module ,
200+ size_t block_offset ,
201+ size_t block_len ,
202+ struct mca_btl_base_endpoint_t * * endpoints )
201203{
202204 int ret = OPAL_SUCCESS ;
203205 int num_left ;
204206 size_t i , channel ;
205207 uint32_t event ;
206208 struct fi_eq_entry entry ;
207209 struct fi_eq_err_entry err_entry ;
208-
209210 bool error_occurred = false;
210211
211212 /* compute num fi_av_insert completions we are waiting for */
212213 num_left = 0 ;
213- for (i = 0 ; i < array_len ; ++ i ) {
214+ for (i = block_offset ; i < ( block_offset + block_len ) ; ++ i ) {
214215 if (NULL != endpoints [i ]) {
215216 num_left += USNIC_NUM_CHANNELS ;
216217 }
@@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
266267 We therefore only want to print a pretty
267268 warning about (and OBJ_RELEASE) that endpoint
268269 the *first* time it is reported. */
269- for (i = 0 ; i < array_len ; ++ i ) {
270+ for (i = block_offset ; i < ( block_offset + block_len ) ; ++ i ) {
270271 if (endpoints [i ] == context -> endpoint ) {
271272 add_procs_warn_unreachable (module ,
272273 context -> endpoint );
@@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
348349 - If an otherwise-valid endpoint has no dest, that means we timed
349350 out trying to resolve it, so just release that endpoint. */
350351 size_t num_endpoints_created = 0 ;
351- for (i = 0 ; i < array_len ; i ++ ) {
352+ for (i = block_offset ; i < ( block_offset + block_len ) ; i ++ ) {
352353 if (NULL != endpoints [i ]) {
353354 bool happy ;
354355
@@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
382383 return ret ;
383384}
384385
386+ /*
387+ * Create endpoints for the procs we were given in add_procs.
388+ */
389+ static int add_procs_create_endpoints (struct opal_btl_usnic_module_t * module ,
390+ size_t nprocs ,
391+ struct opal_proc_t * * procs ,
392+ struct mca_btl_base_endpoint_t * * endpoints )
393+ {
394+ /* We need to ensure that we don't overrun the libfabric AV EQ.
395+ Divide up all the peer address resolutions we need to do into a
396+ series of blocks; insert and complete each block before moving
397+ to the next (note: if performance mandates it, we can move to a
398+ sliding window style of AV inserts to get better concurrency of
399+ AV resolution). */
400+
401+ /* Leave a few empty slots in the AV EQ, just for good measure */
402+ if (module -> av_eq_size < 8 ) {
403+ opal_show_help ("help-mpi-btl-usnic.txt" , "fi_av_eq too small" ,
404+ true,
405+ opal_process_info .nodename ,
406+ module -> av_eq_size ,
407+ 8 );
408+ return OPAL_ERR_OUT_OF_RESOURCE ;
409+ }
410+
411+ size_t eq_size = module -> av_eq_size - 8 ;
412+ size_t block_len = eq_size ;
413+ size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS ;
414+ size_t num_blocks = num_av_inserts / block_len ;
415+ if (eq_size % num_av_inserts != 0 ) {
416+ ++ num_blocks ;
417+ }
418+
419+ /* Per above, the blocks are expressed in terms of number of AV
420+ inserts. Convert them to be expressed in terms of number of
421+ procs. */
422+ block_len /= USNIC_NUM_CHANNELS ;
423+
424+ /* Per above, loop over creating the endpoints so that we do not
425+ overrun the libfabric AV EQ. */
426+ int rc ;
427+ for (size_t block_offset = 0 , block = 0 ; block < num_blocks ;
428+ block_offset += block_len , ++ block ) {
429+ /* Adjust for the last block */
430+ if (block_len > (nprocs - block_offset )) {
431+ block_len = nprocs - block_offset ;
432+ }
433+
434+ /* First, create endpoints (and procs, if they're not already
435+ created) for the usnic-reachable procs we were given. */
436+ rc = add_procs_block_create_endpoints (module ,
437+ block_offset , block_len ,
438+ procs , endpoints );
439+ if (OPAL_SUCCESS != rc ) {
440+ return rc ;
441+ }
442+
443+ /* For each endpoint that was created, we initiated the
444+ process to create NUM_CHANNELS fi_addrs. Go finish all of
445+ those. This will be the final determination of whether we
446+ can use the endpoint or not because we'll find out if each
447+ endpoint is reachable or not. */
448+ rc = add_procs_block_reap_fi_av_inserts (module ,
449+ block_offset , block_len ,
450+ endpoints );
451+ if (OPAL_SUCCESS != rc ) {
452+ return rc ;
453+ }
454+ }
455+
456+ return OPAL_SUCCESS ;
457+ }
458+
385459/*
386460 * Add procs to this BTL module, receiving endpoint information from
387461 * the modex. This is done in 2 phases:
@@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
408482 opal_btl_usnic_module_t * module = (opal_btl_usnic_module_t * ) base_module ;
409483 int rc ;
410484
411- /* First, create endpoints (and procs, if they're not already
412- created) for all the usnic-reachable procs we were given. */
485+ /* Go create the endpoints (including all relevant address
486+ resolution) */
413487 rc = add_procs_create_endpoints (module , nprocs , procs , endpoints );
414488 if (OPAL_SUCCESS != rc ) {
415489 goto fail ;
416490 }
417491
418- /* For each endpoint that was created, we initiated the process to
419- create NUM_CHANNELS fi_addrs. Go finish all of those. This
420- will be the final determination of whether we can use the
421- endpoint or not because we'll find out if each endpoint is
422- reachable or not. */
423- rc = add_procs_reap_fi_av_inserts (module , nprocs , endpoints );
424- if (OPAL_SUCCESS != rc ) {
425- goto fail ;
426- }
427-
428492 /* Find all the endpoints with a complete set of USD destinations
429493 and mark them as reachable */
430494 for (size_t i = 0 ; NULL != reachable && i < nprocs ; ++ i ) {
@@ -1205,7 +1269,7 @@ usnic_send(
12051269 /* assign length */
12061270 sseg -> ss_len = sizeof (opal_btl_usnic_btl_header_t ) + frag -> sf_size ;
12071271
1208- sseg -> ss_channel = USNIC_PRIORITY_CHANNEL ;
1272+ sseg -> ss_channel = USNIC_DATA_CHANNEL ;
12091273 sseg -> ss_base .us_btl_header -> tag = tag ;
12101274#if MSGDEBUG1
12111275 opal_output (0 , "INLINE send, sseg=%p" , (void * )sseg );
@@ -2018,12 +2082,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
20182082 }
20192083
20202084 memset (& eq_attr , 0 , sizeof (eq_attr ));
2021- eq_attr .size = 1024 ;
2085+ eq_attr .size = module -> av_eq_num ;
20222086 eq_attr .wait_obj = FI_WAIT_UNSPEC ;
20232087 rc = fi_eq_open (module -> fabric , & eq_attr , & module -> av_eq , NULL );
20242088 if (rc != OPAL_SUCCESS ) {
20252089 goto destroy ;
20262090 }
2091+ // Save the size of the created EQ
2092+ module -> av_eq_size = eq_attr .size ;
2093+
20272094 eq_attr .wait_obj = FI_WAIT_FD ;
20282095 rc = fi_eq_open (module -> fabric , & eq_attr , & module -> dom_eq , NULL );
20292096 if (rc != OPAL_SUCCESS ) {
0 commit comments