Skip to content

Commit ec3a383

Browse files
committed
Merge pull request #688 from jsquyres/pr/usnic-libfabric-msg-prefix-fix
usnic fixes for differences between libfabric v1.0.0 and v1.1.0
2 parents 46a87ca + 633da66 commit ec3a383

File tree

10 files changed

+180
-75
lines changed

10 files changed

+180
-75
lines changed

opal/mca/btl/usnic/btl_usnic.h

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ extern opal_rng_buff_t opal_btl_usnic_rand_buff;
9393

9494
/* Set to >0 to randomly drop received frags. The higher the number,
9595
the more frequent the drops. */
96-
#define WANT_RECV_FRAG_DROPS 0
96+
#define WANT_RECV_DROPS 0
9797
/* Set to >0 to randomly fail to send an ACK, mimicing a lost ACK.
9898
The higher the number, the more frequent the failed-to-send-ACK. */
9999
#define WANT_FAIL_TO_SEND_ACK 0
@@ -102,10 +102,10 @@ extern opal_rng_buff_t opal_btl_usnic_rand_buff;
102102
the failed-to-resend-frag. */
103103
#define WANT_FAIL_TO_RESEND_FRAG 0
104104

105-
#if WANT_RECV_FRAG_DROPS > 0
106-
#define FAKE_RECV_FRAG_DROP (opal_rand(&opal_btl_usnic_rand_buff) < WANT_RECV_FRAG_DROPS)
105+
#if WANT_RECV_DROPS > 0
106+
#define FAKE_RECV_DROP (opal_rand(&opal_btl_usnic_rand_buff) < WANT_RECV_DROPS)
107107
#else
108-
#define FAKE_RECV_FRAG_DROP 0
108+
#define FAKE_RECV_DROP 0
109109
#endif
110110

111111
#if WANT_FAIL_TO_SEND_ACK > 0
@@ -213,6 +213,19 @@ typedef struct opal_btl_usnic_component_t {
213213
/* Prefix for the connectivity map filename (map will be output if
214214
the prefix is non-NULL) */
215215
char *connectivity_map_prefix;
216+
217+
/** Expected return value from fi_cq_readerr() upon success. In
218+
libfabric v1.0.0 / API v1.0, the usnic provider returned
219+
sizeof(fi_cq_err_entry) upon success. In libfabric >=v1.1 /
220+
API >=v1.1, the usnic provider returned 1 upon success. */
221+
ssize_t cq_readerr_success_value;
222+
ssize_t cq_readerr_try_again_value;
223+
224+
/** Offset into the send buffer where the payload will go. For
225+
libfabric v1.0.0 / API v1.0, this is 0. For libfabric >=v1.1
226+
/ API >=v1.1, this is the endpoint.msg_prefix_size (i.e.,
227+
component.transport_header_len). */
228+
uint32_t prefix_send_offset;
216229
} opal_btl_usnic_component_t;
217230

218231
OPAL_MODULE_DECLSPEC extern opal_btl_usnic_component_t mca_btl_usnic_component;

opal/mca/btl/usnic/btl_usnic_ack.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,7 @@ opal_btl_usnic_ack_send(
201201
/* Get an ACK frag. If we don't get one, just discard this ACK. */
202202
ack = opal_btl_usnic_ack_segment_alloc(module);
203203
if (OPAL_UNLIKELY(NULL == ack)) {
204-
opal_output(0, "====================== No frag for sending the ACK -- skipped");
205-
abort();
204+
return;
206205
}
207206

208207
/* send the seq of the lowest item in the window that
@@ -252,6 +251,7 @@ opal_btl_usnic_ack_complete(opal_btl_usnic_module_t *module,
252251
opal_btl_usnic_ack_segment_t *ack)
253252
{
254253
opal_btl_usnic_ack_segment_return(module, ack);
254+
++module->mod_channels[ack->ss_channel].credits;
255255
}
256256

257257
/*****************************************************************************/

opal/mca/btl/usnic/btl_usnic_compat.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,25 @@ opal_btl_usnic_put(struct mca_btl_base_module_t *base_module,
715715
sfrag->sf_size = size;
716716
sfrag->sf_ack_bytes_left = size;
717717

718+
719+
720+
/* JMS NOTE: This is currently broken, and is deactivated by
721+
removing the MCA_BTL_FLAGS_PUT from .btl_flags in btl_module.c.
722+
723+
Overwriting the uf_local_seg values is not a good idea, and
724+
doesn't do anything to actually send the data in the
725+
progression past finish_put_or_send().
726+
727+
The proper fix is to change the plumbing here to eventually
728+
call fi_sendv() with an iov[0] = the internal buffer that's
729+
already allocated, and iov[1] = the user's buffer. The usnic
730+
provider in fi_sendv() will be smart enough to figure out which
731+
is more performance: memcpy'ing the 2 buffers together and
732+
doing a single xfer down to the hardware, or actually doing a
733+
SG list down to the hardware. */
734+
735+
736+
718737
opal_btl_usnic_frag_t *frag;
719738
frag = &sfrag->sf_base;
720739
frag->uf_local_seg[0].seg_len = size;

opal/mca/btl/usnic/btl_usnic_component.c

Lines changed: 68 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ static int usnic_component_open(void)
163163
mca_btl_usnic_component.usnic_all_modules = NULL;
164164
mca_btl_usnic_component.usnic_active_modules = NULL;
165165
mca_btl_usnic_component.transport_header_len = -1;
166+
mca_btl_usnic_component.prefix_send_offset = 0;
166167

167168
/* initialize objects */
168169
OBJ_CONSTRUCT(&mca_btl_usnic_component.usnic_procs, opal_list_t);
@@ -630,7 +631,29 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
630631
hints.ep_attr = &ep_attr;
631632
hints.fabric_attr = &fabric_attr;
632633

633-
ret = fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, &hints, &info_list);
634+
/* This code understands libfabric API v1.0 and v1.1. Even if we
635+
were compiled with libfabric API v1.0, we still want to request
636+
v1.1 -- here's why:
637+
638+
- In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did
639+
not check the value of the "version" parameter passed into
640+
fi_getinfo()
641+
642+
- If you pass FI_VERSION(1,0) to libfabric v1.1.0 (i.e., API
643+
v1.1), the usnic provider will disable FI_MSG_PREFIX support
644+
(on the assumption that the application will not handle
645+
FI_MSG_PREFIX properly). This can happen if you compile OMPI
646+
against libfabric v1.0.0 (i.e., API v1.0) and run OMPI
647+
against libfabric v1.1.0 (i.e., API v1.1).
648+
649+
So never request API v1.0 -- always request a minimum of
650+
v1.1. */
651+
uint32_t libfabric_api;
652+
libfabric_api = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION);
653+
if (libfabric_api == FI_VERSION(1, 0)) {
654+
libfabric_api = FI_VERSION(1, 1);
655+
}
656+
ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list);
634657
if (0 != ret) {
635658
opal_output_verbose(5, USNIC_OUT,
636659
"btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret);
@@ -664,6 +687,29 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
664687
opal_output_verbose(5, USNIC_OUT,
665688
"btl:usnic: usNIC fabrics found");
666689

690+
/* Due to ambiguities in documentation, in libfabric v1.0.0 (i.e.,
691+
API v1.0) the usnic provider returned sizeof(struct
692+
fi_cq_err_entry) from fi_cq_readerr() upon success.
693+
694+
The ambiguities were clarified in libfabric v1.1.0 (i.e., API
695+
v1.1); the usnic provider returned 1 from fi_cq_readerr() upon
696+
success.
697+
698+
So query to see what version of the libfabric API we are
699+
running with, and adapt accordingly. */
700+
libfabric_api = fi_version();
701+
if (1 == FI_MAJOR(libfabric_api) &&
702+
0 == FI_MINOR(libfabric_api)) {
703+
// Old fi_cq_readerr() behavior: success=sizeof(...), try again=0
704+
mca_btl_usnic_component.cq_readerr_success_value =
705+
sizeof(struct fi_cq_err_entry);
706+
mca_btl_usnic_component.cq_readerr_try_again_value = 0;
707+
} else {
708+
// New fi_cq_readerr() behavior: success=1, try again=-FI_EAGAIN
709+
mca_btl_usnic_component.cq_readerr_success_value = 1;
710+
mca_btl_usnic_component.cq_readerr_try_again_value = -FI_EAGAIN;
711+
}
712+
667713
/* libnl initialization */
668714
opal_proc_t *me = opal_proc_local_get();
669715
opal_process_name_t *name = &(me->proc_name);
@@ -1087,34 +1133,28 @@ static int usnic_handle_completion(
10871133
seg = (opal_btl_usnic_segment_t*)completion->op_context;
10881134
rseg = (opal_btl_usnic_recv_segment_t*)seg;
10891135

1136+
/* Make the completion be Valgrind-defined */
1137+
opal_memchecker_base_mem_defined(seg, sizeof(*seg));
1138+
10901139
/* Handle work completions */
10911140
switch(seg->us_type) {
10921141

10931142
/**** Send ACK completions ****/
10941143
case OPAL_BTL_USNIC_SEG_ACK:
10951144
opal_btl_usnic_ack_complete(module,
10961145
(opal_btl_usnic_ack_segment_t *)seg);
1097-
{ opal_btl_usnic_send_segment_t *sseg = (opal_btl_usnic_send_segment_t *)seg;
1098-
++module->mod_channels[sseg->ss_channel].credits;
1099-
}
11001146
break;
11011147

11021148
/**** Send of frag segment completion ****/
11031149
case OPAL_BTL_USNIC_SEG_FRAG:
11041150
opal_btl_usnic_frag_send_complete(module,
11051151
(opal_btl_usnic_frag_segment_t*)seg);
1106-
{ opal_btl_usnic_send_segment_t *sseg = (opal_btl_usnic_send_segment_t *)seg;
1107-
++module->mod_channels[sseg->ss_channel].credits;
1108-
}
11091152
break;
11101153

11111154
/**** Send of chunk segment completion ****/
11121155
case OPAL_BTL_USNIC_SEG_CHUNK:
11131156
opal_btl_usnic_chunk_send_complete(module,
11141157
(opal_btl_usnic_chunk_segment_t*)seg);
1115-
{ opal_btl_usnic_send_segment_t *sseg = (opal_btl_usnic_send_segment_t *)seg;
1116-
++module->mod_channels[sseg->ss_channel].credits;
1117-
}
11181158
break;
11191159

11201160
/**** Receive completions ****/
@@ -1145,17 +1185,27 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
11451185
}
11461186

11471187
rc = fi_cq_readerr(channel->cq, &err_entry, 0);
1148-
if (rc != sizeof(err_entry)) {
1149-
BTL_ERROR(("%s: cq_readerr ret = %d",
1150-
module->fabric_info->fabric_attr->name, rc));
1188+
if (rc == mca_btl_usnic_component.cq_readerr_try_again_value) {
1189+
return;
1190+
} else if (rc != mca_btl_usnic_component.cq_readerr_success_value) {
1191+
BTL_ERROR(("%s: cq_readerr ret = %d (expected %d)",
1192+
module->fabric_info->fabric_attr->name, rc,
1193+
(int) mca_btl_usnic_component.cq_readerr_success_value));
11511194
channel->chan_error = true;
1152-
} else if (err_entry.prov_errno == 1) {
1195+
}
1196+
1197+
/* Silently count CRC errors. Truncation errors are usually a
1198+
different symptom of a CRC error. */
1199+
else if (FI_ECRC == err_entry.prov_errno ||
1200+
FI_ETRUNC == err_entry.prov_errno) {
11531201
#if MSGDEBUG1
11541202
static int once = 0;
11551203
if (once++ == 0) {
1156-
BTL_ERROR(("%s: Channel %d, CRC error",
1157-
module->fabric_info->fabric_attr->name,
1158-
channel->chan_index));
1204+
BTL_ERROR(("%s: Channel %d, %s",
1205+
module->fabric_info->fabric_attr->name,
1206+
channel->chan_index,
1207+
FI_ECRC == err_entry.prov_errno ?
1208+
"CRC error" : "message truncation"));
11591209
}
11601210
#endif
11611211

@@ -1171,23 +1221,10 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
11711221
rseg->rs_next = channel->repost_recv_head;
11721222
channel->repost_recv_head = rseg;
11731223
}
1174-
} else if (FI_ETRUNC == err_entry.prov_errno) {
1175-
/* This error is usually a different symptom of a CRC error */
1176-
#if MSGDEBUG1
1177-
static int once = 0;
1178-
if (once++ == 0) {
1179-
BTL_ERROR(("%s: Channel %d, message truncation",
1180-
module->fabric_info->fabric_attr->name,
1181-
channel->chan_index));
1182-
}
1183-
#endif
1184-
1185-
/* silently count CRC errors */
1186-
++module->stats.num_crc_errors;
11871224
} else {
11881225
BTL_ERROR(("%s: CQ[%d] prov_err = %d",
11891226
module->fabric_info->fabric_attr->name, channel->chan_index,
1190-
err_entry.prov_errno));
1227+
err_entry.prov_errno));
11911228
channel->chan_error = true;
11921229
}
11931230
}

opal/mca/btl/usnic/btl_usnic_frag.c

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,23 +30,22 @@
3030
#include "btl_usnic_ack.h"
3131

3232
static void
33-
common_send_seg_helper(
34-
opal_btl_usnic_send_segment_t *seg,
35-
int offset)
33+
common_send_seg_helper(opal_btl_usnic_send_segment_t *seg)
3634
{
3735
opal_btl_usnic_segment_t *bseg;
3836

39-
bseg = &seg->ss_base;
40-
41-
bseg->us_btl_header = (opal_btl_usnic_btl_header_t *)
42-
(((char*) bseg->us_list.ptr) + offset);
43-
bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_rte_name;
44-
37+
/* send ptr for fi_send(). ss_len will be filled in right before
38+
the actual send. */
39+
seg->ss_ptr = (uint8_t *) seg->ss_base.us_list.ptr;
4540
seg->ss_send_posted = 0;
4641
seg->ss_ack_pending = false;
4742

48-
/* send ptr, len will be filled in just before send */
49-
seg->ss_ptr = (uint8_t *)bseg->us_btl_header;
43+
/* Offset the BTL header by (prefix_send_offset) bytes into the
44+
raw buffer */
45+
bseg = &seg->ss_base;
46+
bseg->us_btl_header = (opal_btl_usnic_btl_header_t *)
47+
(seg->ss_ptr + mca_btl_usnic_component.prefix_send_offset);
48+
bseg->us_btl_header->sender = mca_btl_usnic_component.my_hashed_rte_name;
5049
}
5150

5251
static void
@@ -59,7 +58,7 @@ chunk_seg_constructor(
5958
bseg->us_type = OPAL_BTL_USNIC_SEG_CHUNK;
6059

6160
/* some more common initializaiton */
62-
common_send_seg_helper(seg, mca_btl_usnic_component.transport_header_len);
61+
common_send_seg_helper(seg);
6362

6463
/* payload starts next byte beyond BTL chunk header */
6564
bseg->us_payload.raw = (uint8_t *)(bseg->us_btl_chunk_header + 1);
@@ -77,7 +76,7 @@ frag_seg_constructor(
7776
bseg->us_type = OPAL_BTL_USNIC_SEG_FRAG;
7877

7978
/* some more common initializaiton */
80-
common_send_seg_helper(seg, mca_btl_usnic_component.transport_header_len);
79+
common_send_seg_helper(seg);
8180

8281
/* payload starts next byte beyond BTL header */
8382
bseg->us_payload.raw = (uint8_t *)(bseg->us_btl_header + 1);
@@ -95,7 +94,7 @@ ack_seg_constructor(
9594
bseg->us_type = OPAL_BTL_USNIC_SEG_ACK;
9695

9796
/* some more common initializaiton */
98-
common_send_seg_helper(ack, mca_btl_usnic_component.transport_header_len);
97+
common_send_seg_helper(ack);
9998

10099
/* ACK value embedded in BTL header */
101100
bseg->us_btl_header->payload_type = OPAL_BTL_USNIC_PAYLOAD_TYPE_ACK;
@@ -176,12 +175,13 @@ send_frag_constructor(opal_btl_usnic_send_frag_t *frag)
176175
static void
177176
send_frag_destructor(opal_btl_usnic_send_frag_t *frag)
178177
{
179-
mca_btl_base_descriptor_t *desc;
180-
178+
#if OPAL_ENABLE_DEBUG
181179
/* make sure nobody twiddled these values after the constructor */
180+
mca_btl_base_descriptor_t *desc;
182181
desc = &frag->sf_base.uf_base;
183182
assert(desc->USNIC_SEND_LOCAL == frag->sf_base.uf_local_seg);
184183
assert(0 == frag->sf_base.uf_local_seg[0].seg_len);
184+
#endif
185185

186186
/* PML may change desc->des_remote to point elsewhere, cannot assert that it
187187
* still points to our embedded segment */

0 commit comments

Comments
 (0)