Skip to content

Commit bbb8246

Browse files
committed
usnic: don't overrun the fi_av_insert() EQ
Add endpoints in a blocked manner so that we don't overrun the fi_av_insert() event queue. Also make the AV EQ length an MCA param, and report it in mca_btl_base_verbose >=5 output. (cherry picked from commit open-mpi/ompi@db825ab)
1 parent 402cba0 commit bbb8246

File tree

6 files changed

+132
-34
lines changed

6 files changed

+132
-34
lines changed

opal/mca/btl/usnic/btl_usnic.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sandia National Laboratories. All rights
1313
* reserved.
14-
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
1515
* $COPYRIGHT$
1616
*
1717
* Additional copyrights may follow
@@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t {
181181
/** max completion queue entries per module */
182182
int32_t cq_num;
183183

184+
/** max number of entries in AV EQ */
185+
int32_t av_eq_num;
186+
184187
/** retrans characteristics */
185188
int retrans_timeout;
186189

opal/mca/btl/usnic/btl_usnic_component.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
956956
/* Output all of this module's values. */
957957
const char *devname = module->fabric_info->fabric_attr->name;
958958
opal_output_verbose(5, USNIC_OUT,
959-
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d",
959+
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d",
960960
devname,
961961
module->sd_num,
962962
module->rd_num,
963-
module->cq_num);
963+
module->cq_num,
964+
module->av_eq_num);
964965
opal_output_verbose(5, USNIC_OUT,
965966
"btl:usnic: %s priority MTU = %" PRIsize_t,
966967
devname,

opal/mca/btl/usnic/btl_usnic_mca.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sandia National Laboratories. All rights
1313
* reserved.
14-
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2015 Intel, Inc. All rights reserved.
@@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void)
162162
static int prio_sd_num;
163163
static int prio_rd_num;
164164
static int cq_num;
165+
static int av_eq_num;
165166
static int udp_port_base;
166167
static int max_tiny_msg_size;
167168
static int eager_limit;
@@ -235,6 +236,10 @@ int opal_btl_usnic_component_register(void)
235236
-1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
236237
mca_btl_usnic_component.cq_num = (int32_t) cq_num;
237238

239+
CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution (-1 = pre-set defaults; depends on number and type of devices available; will error if ac_eq_num < 8)",
240+
-1, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
241+
mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num;
242+
238243
CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)",
239244
0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5));
240245
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;

opal/mca/btl/usnic/btl_usnic_module.c

Lines changed: 100 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
6969

7070

7171
/*
72-
* Loop over all procs sent to us in add_procs and see if we want to
73-
* add a proc/endpoint for them.
72+
* Loop over a block of procs sent to us in add_procs and see if we
73+
* want to add a proc/endpoint for them.
7474
*/
75-
static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
76-
size_t nprocs,
77-
opal_proc_t **procs,
78-
mca_btl_base_endpoint_t **endpoints)
75+
static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
76+
size_t block_offset,
77+
size_t block_len,
78+
opal_proc_t **procs,
79+
mca_btl_base_endpoint_t **endpoints)
7980
{
8081
int rc;
8182
opal_proc_t* my_proc;
@@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
8788
return OPAL_ERR_OUT_OF_RESOURCE;
8889
}
8990

90-
/* Loop over the procs we were given */
91-
for (size_t i = 0; i < nprocs; i++) {
91+
/* Loop over a block in the procs we were given */
92+
for (size_t i = block_offset; i < (block_offset + block_len); i++) {
9293
struct opal_proc_t* opal_proc = procs[i];
9394
opal_btl_usnic_proc_t* usnic_proc;
9495
mca_btl_base_endpoint_t* usnic_endpoint;
@@ -195,22 +196,22 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
195196
* invoked. Go reap them all.
196197
*/
197198
static int
198-
add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
199-
size_t array_len,
200-
struct mca_btl_base_endpoint_t **endpoints)
199+
add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
200+
size_t block_offset,
201+
size_t block_len,
202+
struct mca_btl_base_endpoint_t **endpoints)
201203
{
202204
int ret = OPAL_SUCCESS;
203205
int num_left;
204206
size_t i, channel;
205207
uint32_t event;
206208
struct fi_eq_entry entry;
207209
struct fi_eq_err_entry err_entry;
208-
209210
bool error_occurred = false;
210211

211212
/* compute num fi_av_insert completions we are waiting for */
212213
num_left = 0;
213-
for (i = 0; i < array_len; ++i) {
214+
for (i = block_offset; i < (block_offset + block_len); ++i) {
214215
if (NULL != endpoints[i]) {
215216
num_left += USNIC_NUM_CHANNELS;
216217
}
@@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
266267
We therefore only want to print a pretty
267268
warning about (and OBJ_RELEASE) that endpoint
268269
the *first* time it is reported. */
269-
for (i = 0; i < array_len; ++i) {
270+
for (i = block_offset; i < (block_offset + block_len); ++i) {
270271
if (endpoints[i] == context->endpoint) {
271272
add_procs_warn_unreachable(module,
272273
context->endpoint);
@@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
348349
- If an otherwise-valid endpoint has no dest, that means we timed
349350
out trying to resolve it, so just release that endpoint. */
350351
size_t num_endpoints_created = 0;
351-
for (i = 0; i < array_len; i++) {
352+
for (i = block_offset; i < (block_offset + block_len); i++) {
352353
if (NULL != endpoints[i]) {
353354
bool happy;
354355

@@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
382383
return ret;
383384
}
384385

386+
/*
387+
* Create endpoints for the procs we were given in add_procs.
388+
*/
389+
static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
390+
size_t nprocs,
391+
struct opal_proc_t **procs,
392+
struct mca_btl_base_endpoint_t** endpoints)
393+
{
394+
/* We need to ensure that we don't overrun the libfabric AV EQ.
395+
Divide up all the peer address resolutions we need to do into a
396+
series of blocks; insert and complete each block before moving
397+
to the next (note: if performance mandates it, we can move to a
398+
sliding window style of AV inserts to get better concurrency of
399+
AV resolution). */
400+
401+
/* Leave a few empty slots in the AV EQ, just for good measure */
402+
if (module->av_eq_size < 8) {
403+
opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
404+
true,
405+
opal_process_info.nodename,
406+
module->av_eq_size,
407+
8);
408+
return OPAL_ERR_OUT_OF_RESOURCE;
409+
}
410+
411+
size_t eq_size = module->av_eq_size - 8;
412+
size_t block_len = eq_size;
413+
size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS;
414+
size_t num_blocks = num_av_inserts / eq_size;
415+
if (eq_size % num_av_inserts != 0) {
416+
++num_blocks;
417+
}
418+
419+
/* Per above, the blocks are expressed in terms of number of AV
420+
inserts. Convert them to be expressed in terms of number of
421+
procs. */
422+
block_len /= USNIC_NUM_CHANNELS;
423+
424+
/* Per above, loop over creating the endpoints so that we do not
425+
overrun the libfabric AV EQ. */
426+
int rc;
427+
for (size_t block_offset = 0, block = 0; block < num_blocks;
428+
block_offset += block_len, ++block) {
429+
/* Adjust for the last block */
430+
if (block_len > (nprocs - block_offset)) {
431+
block_len = nprocs - block_offset;
432+
}
433+
434+
/* First, create endpoints (and procs, if they're not already
435+
created) for the usnic-reachable procs we were given. */
436+
rc = add_procs_block_create_endpoints(module,
437+
block_offset, block_len,
438+
procs, endpoints);
439+
if (OPAL_SUCCESS != rc) {
440+
return rc;
441+
}
442+
443+
/* For each endpoint that was created, we initiated the
444+
process to create NUM_CHANNELS fi_addrs. Go finish all of
445+
those. This will be the final determination of whether we
446+
can use the endpoint or not because we'll find out if each
447+
endpoint is reachable or not. */
448+
rc = add_procs_block_reap_fi_av_inserts(module,
449+
block_offset, block_len,
450+
endpoints);
451+
if (OPAL_SUCCESS != rc) {
452+
return rc;
453+
}
454+
}
455+
456+
return OPAL_SUCCESS;
457+
}
458+
385459
/*
386460
* Add procs to this BTL module, receiving endpoint information from
387461
* the modex. This is done in 2 phases:
@@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
408482
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
409483
int rc;
410484

411-
/* First, create endpoints (and procs, if they're not already
412-
created) for all the usnic-reachable procs we were given. */
485+
/* Go create the endpoints (including all relevant address
486+
resolution) */
413487
rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
414488
if (OPAL_SUCCESS != rc) {
415489
goto fail;
416490
}
417491

418-
/* For each endpoint that was created, we initiated the process to
419-
create NUM_CHANNELS fi_addrs. Go finish all of those. This
420-
will be the final determination of whether we can use the
421-
endpoint or not because we'll find out if each endpoint is
422-
reachable or not. */
423-
rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints);
424-
if (OPAL_SUCCESS != rc) {
425-
goto fail;
426-
}
427-
428492
/* Find all the endpoints with a complete set of USD destinations
429493
and mark them as reachable */
430494
for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
@@ -1831,6 +1895,11 @@ static void init_queue_lengths(opal_btl_usnic_module_t *module)
18311895
} else {
18321896
module->cq_num = mca_btl_usnic_component.cq_num;
18331897
}
1898+
if (-1 == mca_btl_usnic_component.av_eq_num) {
1899+
module->av_eq_num = 1024;
1900+
} else {
1901+
module->av_eq_num = mca_btl_usnic_component.av_eq_num;
1902+
}
18341903

18351904
/*
18361905
* Queue sizes for priority channel scale with # of endpoint. A
@@ -2018,12 +2087,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
20182087
}
20192088

20202089
memset(&eq_attr, 0, sizeof(eq_attr));
2021-
eq_attr.size = 1024;
2090+
eq_attr.size = module->av_eq_num;
20222091
eq_attr.wait_obj = FI_WAIT_UNSPEC;
20232092
rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
20242093
if (rc != OPAL_SUCCESS) {
20252094
goto destroy;
20262095
}
2096+
// Save the size of the created EQ
2097+
module->av_eq_size = eq_attr.size;
2098+
20272099
eq_attr.wait_obj = FI_WAIT_FD;
20282100
rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
20292101
if (rc != OPAL_SUCCESS) {

opal/mca/btl/usnic/btl_usnic_module.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sandia National Laboratories. All rights
1313
* reserved.
14-
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
1515
* $COPYRIGHT$
1616
*
1717
* Additional copyrights may follow
@@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t {
110110
struct fid_eq *av_eq;
111111
struct fid_av *av;
112112

113+
size_t av_eq_size;
114+
113115
mca_btl_base_module_error_cb_fn_t pml_error_callback;
114116

115117
/* Information about the events */
@@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t {
127129
int sd_num;
128130
int rd_num;
129131
int cq_num;
132+
int av_eq_num;
130133
int prio_sd_num;
131134
int prio_rd_num;
132135

opal/mca/btl/usnic/help-mpi-btl-usnic.txt

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- text -*-
22
#
3-
# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved.
3+
# Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved.
44
#
55
# $COPYRIGHT$
66
#
@@ -240,6 +240,20 @@ abort.
240240
usNIC interface: %s
241241
Current ARP timeout: %d (btl_usnic_arp_timeout MCA param)
242242
#
243+
[fi_av_eq too small]
244+
245+
The usnic BTL was told to create an address resolution queue that was
246+
too small via the btl_usnic_av_eq_num MCA parameter. This
247+
parameter controls how many peer address resolutions can
248+
be outstanding at a time. Larger values allow more concurrent address
249+
resolutions, but consume more memory.
250+
251+
Server: %s
252+
av_eq_num param value: %d
253+
av_eq_num minimum value: %d
254+
255+
Your job will likely either perform poorly, or will abort.
256+
#
243257
[unreachable peer IP]
244258
WARNING: Open MPI failed to find a route to a peer IP address via a
245259
specific usNIC interface. This usually indicates a problem in the IP

0 commit comments

Comments
 (0)