Skip to content

Commit f265358

Browse files
committed
usnic: handle FI_MSG_PREFIX differences libfabric v1.0.0->v1.1.0
In libfabric v1.0.0 (i.e., API v1.0), the usnic provider handled FI_MSG_PREFIX inconsistently between sends and receives. This has been fixed in libfabric v1.1.0 (i.e., API v1.1): FI_MSG_PREFIX is handled consistently for both sends and receives. Detect at run time which libfabric version we are running with and adapt behavior accordingly.
1 parent ddd0de6 commit f265358

File tree

5 files changed

+90
-31
lines changed

5 files changed

+90
-31
lines changed

opal/mca/btl/usnic/btl_usnic.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,12 @@ typedef struct opal_btl_usnic_component_t {
220220
API >=v1.1, the usnic provider returned 1 upon success. */
221221
ssize_t cq_readerr_success_value;
222222
ssize_t cq_readerr_try_again_value;
223+
224+
/** Offset into the send buffer where the payload will go. For
225+
libfabric v1.0.0 / API v1.0, this is 0. For libfabric >=v1.1
226+
/ API >=v1.1, this is the endpoint.msg_prefix_size (i.e.,
227+
component.transport_header_len). */
228+
uint32_t prefix_send_offset;
223229
} opal_btl_usnic_component_t;
224230

225231
OPAL_MODULE_DECLSPEC extern opal_btl_usnic_component_t mca_btl_usnic_component;

opal/mca/btl/usnic/btl_usnic_component.c

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ static int usnic_component_open(void)
163163
mca_btl_usnic_component.usnic_all_modules = NULL;
164164
mca_btl_usnic_component.usnic_active_modules = NULL;
165165
mca_btl_usnic_component.transport_header_len = -1;
166+
mca_btl_usnic_component.prefix_send_offset = 0;
166167

167168
/* initialize objects */
168169
OBJ_CONSTRUCT(&mca_btl_usnic_component.usnic_procs, opal_list_t);
@@ -630,7 +631,29 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
630631
hints.ep_attr = &ep_attr;
631632
hints.fabric_attr = &fabric_attr;
632633

633-
ret = fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, &hints, &info_list);
634+
/* This code understands libfabric API v1.0 and v1.1. Even if we
635+
were compiled with libfabric API v1.0, we still want to request
636+
v1.1 -- here's why:
637+
638+
- In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did
639+
not check the value of the "version" parameter passed into
640+
fi_getinfo()
641+
642+
- If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API
643+
v1.1), the usnic provider will disable FI_MSG_PREFIX support
644+
(on the assumption that the application will not handle
645+
FI_MSG_PREFIX properly). This can happen if you compile OMPI
646+
against libfabric v1.0.0 (i.e., API v1.0) and run OMPI
647+
against libfabric v1.1.0 (i.e., API v1.1).
648+
649+
So never request API v1.0 -- always request a minimum of
650+
v1.1. */
651+
uint32_t libfabric_api;
652+
libfabric_api = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION);
653+
if (libfabric_api == FI_VERSION(1, 0)) {
654+
libfabric_api = FI_VERSION(1, 1);
655+
}
656+
ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list);
634657
if (0 != ret) {
635658
opal_output_verbose(5, USNIC_OUT,
636659
"btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret);
@@ -671,8 +694,9 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
671694
The ambiguities were clarified in libfabric v1.1.0 (i.e., API
672695
v1.1); the usnic provider returned 1 from fi_cq_readerr() upon
673696
success.
674-
*/
675-
uint32_t libfabric_api;
697+
698+
So query to see what version of the libfabric API we are
699+
running with, and adapt accordingly. */
676700
libfabric_api = fi_version();
677701
if (1 == FI_MAJOR(libfabric_api) &&
678702
0 == FI_MINOR(libfabric_api)) {

opal/mca/btl/usnic/btl_usnic_frag.c

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,23 +30,22 @@
3030
#include "btl_usnic_ack.h"
3131

3232
static void
33-
common_send_seg_helper(
34-
opal_btl_usnic_send_segment_t *seg,
35-
int offset)
33+
common_send_seg_helper(opal_btl_usnic_send_segment_t *seg)
3634
{
3735
opal_btl_usnic_segment_t *bseg;
3836

39-
bseg = &seg->ss_base;
40-
41-
bseg->us_btl_header = (opal_btl_usnic_btl_header_t *)
42-
(((char*) bseg->us_list.ptr) + offset);
43-
bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_rte_name;
44-
37+
/* send ptr for fi_send(). ss_len will be filled in right before
38+
the actual send. */
39+
seg->ss_ptr = (uint8_t *) seg->ss_base.us_list.ptr;
4540
seg->ss_send_posted = 0;
4641
seg->ss_ack_pending = false;
4742

48-
/* send ptr, len will be filled in just before send */
49-
seg->ss_ptr = (uint8_t *)bseg->us_btl_header;
43+
/* Offset the BTL header by (prefix_send_offset) bytes into the
44+
raw buffer */
45+
bseg = &seg->ss_base;
46+
bseg->us_btl_header = (opal_btl_usnic_btl_header_t *)
47+
(seg->ss_ptr + mca_btl_usnic_component.prefix_send_offset);
48+
bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_rte_name;
5049
}
5150

5251
static void
@@ -59,7 +58,7 @@ chunk_seg_constructor(
5958
bseg->us_type = OPAL_BTL_USNIC_SEG_CHUNK;
6059

6160
/* some more common initialization */
62-
common_send_seg_helper(seg, mca_btl_usnic_component.transport_header_len);
61+
common_send_seg_helper(seg);
6362

6463
/* payload starts next byte beyond BTL chunk header */
6564
bseg->us_payload.raw = (uint8_t *)(bseg->us_btl_chunk_header + 1);
@@ -77,7 +76,7 @@ frag_seg_constructor(
7776
bseg->us_type = OPAL_BTL_USNIC_SEG_FRAG;
7877

7978
/* some more common initialization */
80-
common_send_seg_helper(seg, mca_btl_usnic_component.transport_header_len);
79+
common_send_seg_helper(seg);
8180

8281
/* payload starts next byte beyond BTL header */
8382
bseg->us_payload.raw = (uint8_t *)(bseg->us_btl_header + 1);
@@ -95,7 +94,7 @@ ack_seg_constructor(
9594
bseg->us_type = OPAL_BTL_USNIC_SEG_ACK;
9695

9796
/* some more common initialization */
98-
common_send_seg_helper(ack, mca_btl_usnic_component.transport_header_len);
97+
common_send_seg_helper(ack);
9998

10099
/* ACK value embedded in BTL header */
101100
bseg->us_btl_header->payload_type = OPAL_BTL_USNIC_PAYLOAD_TYPE_ACK;

opal/mca/btl/usnic/btl_usnic_module.c

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,7 +1421,7 @@ static int create_ep(opal_btl_usnic_module_t* module,
14211421
opal_process_info.my_local_rank);
14221422
}
14231423

1424-
rc = fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, hint, &channel->info);
1424+
rc = fi_getinfo(FI_VERSION(1, 1), NULL, 0, 0, hint, &channel->info);
14251425
fi_freeinfo(hint);
14261426
if (0 != rc) {
14271427
opal_show_help("help-mpi-btl-usnic.txt",
@@ -1634,6 +1634,9 @@ static int init_one_channel(opal_btl_usnic_module_t *module,
16341634
goto error;
16351635
}
16361636

1637+
assert(channel->info->ep_attr->msg_prefix_size ==
1638+
(uint32_t) mca_btl_usnic_component.transport_header_len);
1639+
16371640
/*
16381641
* Initialize pool of receive segments. Round MTU up to cache
16391642
* line size so that each segment is guaranteed to start on a
@@ -1777,6 +1780,33 @@ static void init_find_transport_header_len(opal_btl_usnic_module_t *module)
17771780
module->fabric_info->ep_attr->msg_prefix_size;
17781781
mca_btl_usnic_component.transport_protocol =
17791782
module->fabric_info->ep_attr->protocol;
1783+
1784+
/* The usnic provider in libfabric v1.0.0 (i.e., API v1.0) treated
1785+
FI_MSG_PREFIX inconsistently between senders and receivers. It
1786+
was corrected in libfabric v1.1.0 (i.e., API v1.1), meaning
1787+
that FI_MSG_PREFIX is treated consistently between senders and
1788+
receivers.
1789+
1790+
So check what version of the libfabric API we have, and setup
1791+
to use the "old" (inconsistent) MSG_PREFIX behavior, or the
1792+
"new" MSG_PREFIX (consistent) behavior.
1793+
1794+
NOTE: This is a little redundant; we're setting a
1795+
component-level attribute during each module's setup. We do
1796+
this here (and not earlier, when we check fi_version() during
1797+
the component setup) because we can't obtain the value of the
1798+
endpoint msg_prefix_size until we setup the first module.
1799+
Also, it's safe because each module will set the component
1800+
attribute to the same value. So it's ok. */
1801+
uint32_t libfabric_api;
1802+
libfabric_api = fi_version();
1803+
if (1 == FI_MAJOR(libfabric_api) &&
1804+
0 == FI_MINOR(libfabric_api)) {
1805+
mca_btl_usnic_component.prefix_send_offset = 0;
1806+
} else {
1807+
mca_btl_usnic_component.prefix_send_offset =
1808+
module->fabric_info->ep_attr->msg_prefix_size;
1809+
}
17801810
}
17811811

17821812
/*
@@ -1835,13 +1865,15 @@ static void init_payload_lengths(opal_btl_usnic_module_t *module)
18351865
/* Find the max payload this port can handle */
18361866
module->max_frag_payload =
18371867
module->local_modex.max_msg_size - /* start with the MTU */
1838-
sizeof(opal_btl_usnic_btl_header_t); /* subtract size of
1839-
the BTL header */
1868+
sizeof(opal_btl_usnic_btl_header_t) - /* subtract size of
1869+
the BTL header */
1870+
mca_btl_usnic_component.prefix_send_offset;
18401871

18411872
/* same, but use chunk header */
18421873
module->max_chunk_payload =
18431874
module->local_modex.max_msg_size -
1844-
sizeof(opal_btl_usnic_btl_chunk_header_t);
1875+
sizeof(opal_btl_usnic_btl_chunk_header_t) -
1876+
mca_btl_usnic_component.prefix_send_offset;
18451877

18461878
/* Priority queue MTU and max size */
18471879
if (0 == module->max_tiny_msg_size) {
@@ -2097,15 +2129,14 @@ static void init_freelists(opal_btl_usnic_module_t *module)
20972129
uint32_t segsize;
20982130

20992131
segsize = (module->local_modex.max_msg_size +
2100-
module->fabric_info->ep_attr->msg_prefix_size +
21012132
opal_cache_line_size - 1) &
21022133
~(opal_cache_line_size - 1);
21032134

21042135
/* Send frags freelists */
21052136
OBJ_CONSTRUCT(&module->small_send_frags, opal_free_list_t);
21062137
rc = usnic_compat_free_list_init(&module->small_send_frags,
21072138
sizeof(opal_btl_usnic_small_send_frag_t) +
2108-
mca_btl_usnic_component.transport_header_len,
2139+
mca_btl_usnic_component.prefix_send_offset,
21092140
opal_cache_line_size,
21102141
OBJ_CLASS(opal_btl_usnic_small_send_frag_t),
21112142
segsize,
@@ -2123,7 +2154,7 @@ static void init_freelists(opal_btl_usnic_module_t *module)
21232154
OBJ_CONSTRUCT(&module->large_send_frags, opal_free_list_t);
21242155
rc = usnic_compat_free_list_init(&module->large_send_frags,
21252156
sizeof(opal_btl_usnic_large_send_frag_t) +
2126-
mca_btl_usnic_component.transport_header_len,
2157+
mca_btl_usnic_component.prefix_send_offset,
21272158
opal_cache_line_size,
21282159
OBJ_CLASS(opal_btl_usnic_large_send_frag_t),
21292160
0, /* payload size */
@@ -2141,7 +2172,7 @@ static void init_freelists(opal_btl_usnic_module_t *module)
21412172
OBJ_CONSTRUCT(&module->put_dest_frags, opal_free_list_t);
21422173
rc = usnic_compat_free_list_init(&module->put_dest_frags,
21432174
sizeof(opal_btl_usnic_put_dest_frag_t) +
2144-
mca_btl_usnic_component.transport_header_len,
2175+
mca_btl_usnic_component.prefix_send_offset,
21452176
opal_cache_line_size,
21462177
OBJ_CLASS(opal_btl_usnic_put_dest_frag_t),
21472178
0, /* payload size */
@@ -2160,7 +2191,7 @@ static void init_freelists(opal_btl_usnic_module_t *module)
21602191
OBJ_CONSTRUCT(&module->chunk_segs, opal_free_list_t);
21612192
rc = usnic_compat_free_list_init(&module->chunk_segs,
21622193
sizeof(opal_btl_usnic_chunk_segment_t) +
2163-
mca_btl_usnic_component.transport_header_len,
2194+
mca_btl_usnic_component.prefix_send_offset,
21642195
opal_cache_line_size,
21652196
OBJ_CLASS(opal_btl_usnic_chunk_segment_t),
21662197
segsize,
@@ -2178,12 +2209,11 @@ static void init_freelists(opal_btl_usnic_module_t *module)
21782209
/* ACK segments freelist */
21792210
uint32_t ack_segment_len;
21802211
ack_segment_len = (sizeof(opal_btl_usnic_btl_header_t) +
2181-
module->fabric_info->ep_attr->msg_prefix_size +
21822212
opal_cache_line_size - 1) & ~(opal_cache_line_size - 1);
21832213
OBJ_CONSTRUCT(&module->ack_segs, opal_free_list_t);
21842214
rc = usnic_compat_free_list_init(&module->ack_segs,
21852215
sizeof(opal_btl_usnic_ack_segment_t) +
2186-
mca_btl_usnic_component.transport_header_len,
2216+
mca_btl_usnic_component.prefix_send_offset,
21872217
opal_cache_line_size,
21882218
OBJ_CLASS(opal_btl_usnic_ack_segment_t),
21892219
ack_segment_len,

opal/mca/btl/usnic/btl_usnic_send.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -79,7 +79,7 @@ opal_btl_usnic_post_segment(
7979
/* Send the segment */
8080
ret = fi_send(channel->ep,
8181
sseg->ss_ptr,
82-
sseg->ss_len,
82+
sseg->ss_len + mca_btl_usnic_component.prefix_send_offset,
8383
NULL,
8484
endpoint->endpoint_remote_addrs[channel_id],
8585
sseg);
@@ -128,7 +128,7 @@ opal_btl_usnic_post_ack(
128128

129129
ret = fi_send(channel->ep,
130130
sseg->ss_ptr,
131-
sseg->ss_len,
131+
sseg->ss_len + mca_btl_usnic_component.prefix_send_offset,
132132
NULL,
133133
endpoint->endpoint_remote_addrs[channel_id],
134134
sseg);

0 commit comments

Comments
 (0)